tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +1 -1
- tinygrad/apps/llm.py +206 -0
- tinygrad/codegen/__init__.py +116 -0
- tinygrad/codegen/devectorizer.py +315 -172
- tinygrad/codegen/expander.py +8 -16
- tinygrad/codegen/gpudims.py +89 -0
- tinygrad/codegen/linearize.py +205 -203
- tinygrad/codegen/lowerer.py +92 -139
- tinygrad/codegen/opt/__init__.py +38 -0
- tinygrad/codegen/opt/heuristic.py +125 -0
- tinygrad/codegen/opt/kernel.py +510 -0
- tinygrad/{engine → codegen/opt}/search.py +51 -35
- tinygrad/codegen/opt/swizzler.py +134 -0
- tinygrad/codegen/opt/tc.py +127 -0
- tinygrad/codegen/quantize.py +67 -0
- tinygrad/device.py +122 -132
- tinygrad/dtype.py +152 -35
- tinygrad/engine/jit.py +81 -54
- tinygrad/engine/memory.py +46 -27
- tinygrad/engine/realize.py +82 -41
- tinygrad/engine/schedule.py +70 -445
- tinygrad/frontend/__init__.py +0 -0
- tinygrad/frontend/onnx.py +1253 -0
- tinygrad/frontend/torch.py +5 -0
- tinygrad/gradient.py +19 -27
- tinygrad/helpers.py +95 -47
- tinygrad/nn/__init__.py +7 -8
- tinygrad/nn/optim.py +72 -41
- tinygrad/nn/state.py +37 -23
- tinygrad/renderer/__init__.py +40 -60
- tinygrad/renderer/cstyle.py +143 -128
- tinygrad/renderer/llvmir.py +113 -62
- tinygrad/renderer/ptx.py +50 -32
- tinygrad/renderer/wgsl.py +27 -23
- tinygrad/runtime/autogen/am/am.py +5861 -0
- tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
- tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
- tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
- tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
- tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
- tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
- tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
- tinygrad/runtime/autogen/comgr.py +35 -9
- tinygrad/runtime/autogen/comgr_3.py +906 -0
- tinygrad/runtime/autogen/cuda.py +2419 -494
- tinygrad/runtime/autogen/hsa.py +57 -16
- tinygrad/runtime/autogen/ib.py +7171 -0
- tinygrad/runtime/autogen/io_uring.py +917 -118
- tinygrad/runtime/autogen/kfd.py +748 -26
- tinygrad/runtime/autogen/libc.py +613 -218
- tinygrad/runtime/autogen/libusb.py +1643 -0
- tinygrad/runtime/autogen/nv/nv.py +8602 -0
- tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
- tinygrad/runtime/autogen/opencl.py +2 -4
- tinygrad/runtime/autogen/sqtt.py +1789 -0
- tinygrad/runtime/autogen/vfio.py +3 -3
- tinygrad/runtime/autogen/webgpu.py +273 -264
- tinygrad/runtime/graph/cuda.py +3 -3
- tinygrad/runtime/graph/hcq.py +68 -29
- tinygrad/runtime/graph/metal.py +29 -13
- tinygrad/runtime/graph/remote.py +114 -0
- tinygrad/runtime/ops_amd.py +537 -320
- tinygrad/runtime/ops_cpu.py +108 -7
- tinygrad/runtime/ops_cuda.py +12 -14
- tinygrad/runtime/ops_disk.py +13 -10
- tinygrad/runtime/ops_dsp.py +47 -40
- tinygrad/runtime/ops_gpu.py +13 -11
- tinygrad/runtime/ops_hip.py +6 -9
- tinygrad/runtime/ops_llvm.py +35 -15
- tinygrad/runtime/ops_metal.py +29 -19
- tinygrad/runtime/ops_npy.py +5 -3
- tinygrad/runtime/ops_null.py +28 -0
- tinygrad/runtime/ops_nv.py +306 -234
- tinygrad/runtime/ops_python.py +62 -52
- tinygrad/runtime/ops_qcom.py +28 -39
- tinygrad/runtime/ops_remote.py +482 -0
- tinygrad/runtime/ops_webgpu.py +28 -28
- tinygrad/runtime/support/am/amdev.py +114 -249
- tinygrad/runtime/support/am/ip.py +211 -172
- tinygrad/runtime/support/amd.py +138 -0
- tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
- tinygrad/runtime/support/compiler_cuda.py +8 -11
- tinygrad/runtime/support/elf.py +2 -1
- tinygrad/runtime/support/hcq.py +184 -97
- tinygrad/runtime/support/ib.py +172 -0
- tinygrad/runtime/support/llvm.py +3 -4
- tinygrad/runtime/support/memory.py +251 -0
- tinygrad/runtime/support/nv/__init__.py +0 -0
- tinygrad/runtime/support/nv/ip.py +581 -0
- tinygrad/runtime/support/nv/nvdev.py +183 -0
- tinygrad/runtime/support/system.py +170 -0
- tinygrad/runtime/support/usb.py +268 -0
- tinygrad/runtime/support/webgpu.py +18 -0
- tinygrad/schedule/__init__.py +0 -0
- tinygrad/schedule/grouper.py +119 -0
- tinygrad/schedule/kernelize.py +368 -0
- tinygrad/schedule/multi.py +231 -0
- tinygrad/shape/shapetracker.py +40 -46
- tinygrad/shape/view.py +88 -52
- tinygrad/tensor.py +968 -542
- tinygrad/uop/__init__.py +117 -0
- tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
- tinygrad/uop/mathtraits.py +169 -0
- tinygrad/uop/ops.py +1021 -0
- tinygrad/uop/spec.py +228 -0
- tinygrad/{codegen → uop}/symbolic.py +239 -216
- tinygrad/uop/upat.py +163 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
- tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
- tinygrad/viz/index.html +203 -403
- tinygrad/viz/js/index.js +718 -0
- tinygrad/viz/js/worker.js +29 -0
- tinygrad/viz/serve.py +224 -102
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
- tinygrad-0.11.0.dist-info/RECORD +141 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/kernel.py +0 -693
- tinygrad/engine/multi.py +0 -161
- tinygrad/ops.py +0 -1003
- tinygrad/runtime/ops_cloud.py +0 -220
- tinygrad/runtime/support/allocator.py +0 -94
- tinygrad/spec.py +0 -155
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
- tinygrad/viz/perfetto.html +0 -178
- tinygrad-0.10.2.dist-info/RECORD +0 -99
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_cpu.py
CHANGED
@@ -1,8 +1,15 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
from tinygrad.
|
1
|
+
from __future__ import annotations
|
2
|
+
import platform, subprocess, sys, ctypes, functools, time, mmap, threading, queue
|
3
|
+
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond, cpu_profile
|
4
|
+
from tinygrad.device import Compiler, BufferSpec, DMACPURef
|
5
|
+
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
|
4
6
|
from tinygrad.runtime.support.elf import jit_loader
|
5
7
|
from tinygrad.renderer.cstyle import ClangRenderer
|
8
|
+
from tinygrad.uop.ops import sint
|
9
|
+
|
10
|
+
class CPUSignal(HCQSignal):
|
11
|
+
def _sleep(self, time_spent_waiting_ms:int):
|
12
|
+
if self.is_timeline and self.owner is not None: self.owner.tasks.join()
|
6
13
|
|
7
14
|
class ClangJITCompiler(Compiler):
|
8
15
|
def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
|
@@ -11,14 +18,108 @@ class ClangJITCompiler(Compiler):
|
|
11
18
|
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
|
12
19
|
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
|
13
20
|
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
|
14
|
-
|
21
|
+
# on arm march means "runs on this arch and superset" instead of "optimize for this arch". x86 march == arm mcpu
|
22
|
+
arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native'
|
23
|
+
args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
|
15
24
|
arch_args = ['-ffixed-x18'] if target == 'arm64' else []
|
16
25
|
obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
|
17
26
|
return jit_loader(obj)
|
18
27
|
|
19
28
|
def disassemble(self, lib:bytes): return capstone_flatdump(lib)
|
20
29
|
|
21
|
-
class
|
22
|
-
def __init__(self,
|
30
|
+
class CPUWorker(threading.Thread):
|
31
|
+
def __init__(self, dev):
|
32
|
+
super().__init__()
|
33
|
+
self.dev, self.tasks, self.daemon = dev, dev.tasks, True
|
34
|
+
|
35
|
+
def run(self):
|
36
|
+
while True:
|
37
|
+
cmd_iter = iter(self.tasks.get())
|
38
|
+
for cmd in cmd_iter:
|
39
|
+
args_cnt = next(cmd_iter)
|
40
|
+
cmd(*[next(cmd_iter) for _ in range(args_cnt)])
|
41
|
+
self.tasks.task_done()
|
42
|
+
|
43
|
+
class CPUComputeQueue(HWQueue):
|
44
|
+
def _exec(self, prg, bufs, *args):
|
45
|
+
prg.fxn(*map(ctypes.c_uint64, args[:bufs]), *map(ctypes.c_int64 if platform.machine() == "arm64" else ctypes.c_int32, args[bufs:]))
|
46
|
+
def _signal(self, signal_addr, value): to_mv(signal_addr, 4).cast('I')[0] = value
|
47
|
+
def _wait(self, signal_addr, value): wait_cond(lambda: to_mv(signal_addr, 4).cast('I')[0] >= value, timeout_ms=60000)
|
48
|
+
def _timestamp(self, timestamp_addr): to_mv(timestamp_addr, 8).cast('Q')[0] = time.perf_counter_ns()
|
49
|
+
def cmd(self, cmd, *args):
|
50
|
+
self.q(cmd, len(args), *args)
|
51
|
+
return self
|
52
|
+
|
53
|
+
def memory_barrier(self): return self
|
54
|
+
def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
|
55
|
+
return self.cmd(self._exec, prg, len(args_state.bufs), *[x.va_addr for x in args_state.bufs], *args_state.vals)
|
56
|
+
def wait(self, signal, value=0): return self.cmd(self._wait, signal.value_addr, value)
|
57
|
+
def timestamp(self, signal): return self.cmd(self._timestamp, signal.timestamp_addr)
|
58
|
+
def signal(self, signal, value:sint=0): return self.cmd(self._signal, signal.value_addr, value)
|
59
|
+
def _submit(self, dev): dev.tasks.put(self._q[:])
|
60
|
+
|
61
|
+
# NOTE: MAP_JIT is added to mmap module in python 3.13
|
62
|
+
MAP_JIT = 0x0800
|
63
|
+
|
64
|
+
class CPUProgram(HCQProgram):
|
65
|
+
rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
|
66
|
+
|
67
|
+
def __init__(self, dev, name:str, lib:bytes):
|
68
|
+
if sys.platform == "win32":
|
69
|
+
PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
|
70
|
+
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
|
71
|
+
self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
|
72
|
+
ctypes.memmove(self.mem, lib, len(lib))
|
73
|
+
ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
|
74
|
+
proc = ctypes.windll.kernel32.GetCurrentProcess()
|
75
|
+
ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
|
76
|
+
self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
|
77
|
+
else:
|
78
|
+
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
|
79
|
+
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
|
80
|
+
self.mem = mmap.mmap(-1, len(lib), mmap.MAP_ANON|mmap.MAP_PRIVATE|(MAP_JIT if OSX else 0), mmap.PROT_READ|mmap.PROT_WRITE|mmap.PROT_EXEC)
|
81
|
+
|
82
|
+
if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
|
83
|
+
self.mem.write(lib)
|
84
|
+
if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)
|
85
|
+
|
86
|
+
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
|
87
|
+
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
|
88
|
+
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
|
89
|
+
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
|
90
|
+
CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
|
91
|
+
|
92
|
+
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
|
93
|
+
|
94
|
+
super().__init__(HCQArgsState, dev, name, kernargs_alloc_size=0)
|
95
|
+
|
96
|
+
def __del__(self):
|
97
|
+
if getattr(sys, 'is_finalizing', lambda: True)(): return
|
98
|
+
if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
|
99
|
+
|
100
|
+
class CPUAllocator(HCQAllocatorBase):
|
101
|
+
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
|
102
|
+
if options.external_ptr: addr, buf = options.external_ptr, None
|
103
|
+
elif sys.platform == "win32": addr = mv_address(buf:=mmap.mmap(-1, size, access=mmap.ACCESS_WRITE))
|
104
|
+
else: addr = mv_address(buf:=mmap.mmap(-1, size, mmap.MAP_ANON | mmap.MAP_PRIVATE, mmap.PROT_READ | mmap.PROT_WRITE))
|
105
|
+
return HCQBuffer(va:=addr, sz:=size, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)
|
106
|
+
def _as_buffer(self, src) -> memoryview:
|
107
|
+
self.dev.synchronize()
|
108
|
+
return to_mv(src.va_addr, src.size)
|
109
|
+
def _as_dmaref(self, buf):
|
110
|
+
self.dev.synchronize()
|
111
|
+
return DMACPURef(buf.va_addr, buf.size)
|
112
|
+
def _copyin(self, dest, src:memoryview):
|
113
|
+
self.dev.synchronize()
|
114
|
+
with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
|
115
|
+
def _copyout(self, dest:memoryview, src):
|
116
|
+
self.dev.synchronize()
|
117
|
+
with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
|
118
|
+
def _map(self, buf:HCQBuffer):
|
119
|
+
if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
|
23
120
|
|
24
|
-
CPUDevice
|
121
|
+
class CPUDevice(HCQCompiled):
|
122
|
+
def __init__(self, device:str=""):
|
123
|
+
self.tasks:queue.Queue = queue.Queue()
|
124
|
+
CPUWorker(self).start()
|
125
|
+
super().__init__(device, CPUAllocator(self), ClangRenderer(), ClangJITCompiler(), functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
|
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import ctypes, ctypes.util, functools
|
3
|
-
from tinygrad.helpers import DEBUG, getenv,
|
3
|
+
from tinygrad.helpers import DEBUG, getenv, mv_address, init_c_var, init_c_struct_t, suppress_finalizing
|
4
4
|
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
|
5
5
|
from tinygrad.renderer.cstyle import CUDARenderer
|
6
6
|
from tinygrad.renderer.ptx import PTXRenderer
|
7
7
|
from tinygrad.runtime.autogen import cuda
|
8
|
-
from tinygrad.runtime.support.compiler_cuda import
|
8
|
+
from tinygrad.runtime.support.compiler_cuda import pretty_ptx, CUDACompiler, PTXCompiler, PTX
|
9
9
|
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
10
10
|
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
|
11
11
|
|
@@ -34,21 +34,19 @@ class CUDAProgram:
|
|
34
34
|
def __init__(self, dev:CUDADevice, name:str, lib:bytes, smem:int=0):
|
35
35
|
self.dev, self.name, self.lib, self.smem = dev, name, lib, smem
|
36
36
|
if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
|
37
|
-
if DEBUG >= 6: cuda_disassemble(lib, dev.arch)
|
38
37
|
|
39
38
|
check(cuda.cuCtxSetCurrent(self.dev.context))
|
40
39
|
self.module = cuda.CUmodule()
|
41
40
|
status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
|
42
41
|
if status != 0:
|
43
42
|
del self.module
|
44
|
-
cuda_disassemble(lib, dev.arch)
|
45
43
|
raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
|
46
44
|
check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
|
47
45
|
self.prg = prg
|
48
46
|
if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem))
|
49
47
|
|
50
|
-
|
51
|
-
|
48
|
+
@suppress_finalizing
|
49
|
+
def __del__(self): check(cuda.cuModuleUnload(self.module))
|
52
50
|
|
53
51
|
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
54
52
|
check(cuda.cuCtxSetCurrent(self.dev.context))
|
@@ -62,27 +60,27 @@ class CUDAProgram:
|
|
62
60
|
for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
|
63
61
|
return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait)
|
64
62
|
|
65
|
-
class CUDAAllocator(LRUAllocator):
|
66
|
-
def __init__(self, dev:CUDADevice):
|
67
|
-
self.dev = dev
|
68
|
-
super().__init__()
|
63
|
+
class CUDAAllocator(LRUAllocator['CUDADevice']):
|
69
64
|
def _alloc(self, size, options:BufferSpec):
|
70
65
|
check(cuda.cuCtxSetCurrent(self.dev.context))
|
66
|
+
if options.external_ptr: return cuda.CUdeviceptr_v2(options.external_ptr)
|
71
67
|
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
|
72
68
|
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
73
69
|
def _free(self, opaque, options:BufferSpec):
|
74
|
-
|
75
|
-
|
70
|
+
try:
|
71
|
+
if options.host: check(cuda.cuMemFreeHost(opaque))
|
72
|
+
else: check(cuda.cuMemFree_v2(opaque))
|
73
|
+
except (TypeError, AttributeError): pass
|
76
74
|
def _copyin(self, dest, src:memoryview):
|
77
75
|
check(cuda.cuCtxSetCurrent(self.dev.context))
|
78
76
|
host_mem = self.alloc(len(src), BufferSpec(host=True))
|
79
77
|
self.dev.pending_copyin.append((host_mem, len(src), BufferSpec(host=True)))
|
80
|
-
ctypes.memmove(host_mem,
|
78
|
+
ctypes.memmove(host_mem, mv_address(src), len(src))
|
81
79
|
check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
|
82
80
|
def _copyout(self, dest:memoryview, src):
|
83
81
|
CUDADevice.synchronize_system()
|
84
82
|
check(cuda.cuCtxSetCurrent(self.dev.context))
|
85
|
-
check(cuda.cuMemcpyDtoH_v2(
|
83
|
+
check(cuda.cuMemcpyDtoH_v2(mv_address(dest), src, len(dest)))
|
86
84
|
def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
|
87
85
|
check(cuda.cuCtxSetCurrent(src_dev.context))
|
88
86
|
check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
|
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
import os, sys, mmap, io, ctypes,
|
2
|
-
from typing import
|
1
|
+
import os, sys, mmap, io, ctypes, contextlib, pathlib
|
2
|
+
from typing import Generator, Callable
|
3
3
|
from tinygrad.helpers import OSX, round_up
|
4
4
|
from tinygrad.device import Compiled, Allocator
|
5
5
|
with contextlib.suppress(ImportError):
|
@@ -12,14 +12,15 @@ class DiskDevice(Compiled):
|
|
12
12
|
def __init__(self, device:str):
|
13
13
|
if not DiskDevice._tried_io_uring_init: self._iouring_setup()
|
14
14
|
|
15
|
-
self.size:
|
16
|
-
self.fd:
|
15
|
+
self.size: int|None = None
|
16
|
+
self.fd: int|None = None
|
17
17
|
self.count = 0
|
18
18
|
super().__init__(device, DiskAllocator(self), None, None, None)
|
19
|
-
def _might_open(self, size):
|
20
|
-
self.count += 1
|
19
|
+
def _might_open(self, size:int):
|
21
20
|
assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
|
22
|
-
if self.size is not None:
|
21
|
+
if self.size is not None and hasattr(self.device, "mem"):
|
22
|
+
self.count += 1
|
23
|
+
return
|
23
24
|
filename = self.device[len("disk:"):]
|
24
25
|
self.size = size
|
25
26
|
|
@@ -30,10 +31,11 @@ class DiskDevice(Compiled):
|
|
30
31
|
else:
|
31
32
|
try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|getattr(os, "O_DIRECT", 0))
|
32
33
|
except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
|
33
|
-
if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
|
34
|
+
if not pathlib.Path(filename).is_block_device() and os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
|
34
35
|
self.mem = mmap.mmap(self.fd, self.size)
|
35
36
|
if hasattr(self.mem, 'madvise') and (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
|
36
37
|
with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
|
38
|
+
self.count += 1
|
37
39
|
def _might_close(self):
|
38
40
|
self.count -= 1
|
39
41
|
if self.count == 0:
|
@@ -72,7 +74,7 @@ class DiskBuffer:
|
|
72
74
|
|
73
75
|
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
|
74
76
|
class DiskAllocator(Allocator):
|
75
|
-
def __init__(self, dev:DiskDevice):
|
77
|
+
def __init__(self, dev:DiskDevice): super().__init__(dev)
|
76
78
|
def _alloc(self, size:int, options):
|
77
79
|
self.dev._might_open(size)
|
78
80
|
return DiskBuffer(self.dev, size)
|
@@ -84,7 +86,8 @@ class DiskAllocator(Allocator):
|
|
84
86
|
# OSX doesn't seem great at mmap, this is faster
|
85
87
|
with io.FileIO(self.dev.fd, "a+b", closefd=False) as fo:
|
86
88
|
fo.seek(src.offset)
|
87
|
-
|
89
|
+
bytes_read = 0
|
90
|
+
while (n := fo.readinto(dest[bytes_read:])) is not None and n > 0: bytes_read += n
|
88
91
|
else:
|
89
92
|
dest[:] = src._buf()
|
90
93
|
|
tinygrad/runtime/ops_dsp.py
CHANGED
@@ -1,53 +1,60 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
|
3
3
|
assert sys.platform != 'win32'
|
4
|
-
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler
|
4
|
+
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler
|
5
|
+
from tinygrad.runtime.ops_cpu import CPUAllocator
|
5
6
|
from tinygrad.dtype import dtypes, DType, PtrDType
|
6
|
-
from tinygrad.ops import Ops, UOp
|
7
|
-
from tinygrad.helpers import
|
7
|
+
from tinygrad.uop.ops import Ops, UOp
|
8
|
+
from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
|
8
9
|
from tinygrad.renderer.cstyle import ClangRenderer
|
9
10
|
from tinygrad.runtime.autogen import libc, qcom_dsp
|
10
11
|
if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
|
11
12
|
|
12
|
-
from tinygrad.ops import PatternMatcher, UPat
|
13
|
+
from tinygrad.uop.ops import PatternMatcher, UPat
|
13
14
|
|
14
15
|
dsp_pm = PatternMatcher([
|
15
16
|
(((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
|
16
17
|
lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
|
17
18
|
arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
|
18
19
|
(UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
|
19
|
-
|
20
|
+
"__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 and x.src[0].dtype.count > 1 else None),
|
20
21
|
])
|
21
22
|
|
22
23
|
dsp_pm_late = PatternMatcher([
|
23
|
-
(UPat.var("x")+UPat(Ops.VECTORIZE,
|
24
|
-
(UPat.var("x")*UPat(Ops.VECTORIZE,
|
25
|
-
(UPat.var("x")//UPat(Ops.VECTORIZE,
|
26
|
-
(UPat(Ops.
|
27
|
-
lambda d: d.replace(src=(UOp(Ops.
|
24
|
+
(UPat.var("x")+UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
|
25
|
+
(UPat.var("x")*UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
|
26
|
+
(UPat.var("x")//UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
|
27
|
+
(UPat(Ops.DEFINE_REG, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
|
28
|
+
lambda d: d.replace(src=(UOp(Ops.CUSTOMI, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
|
29
|
+
])
|
30
|
+
|
31
|
+
# NOTE: this just increases readability of the generated code
|
32
|
+
dsp_string = PatternMatcher([
|
33
|
+
(UPat(Ops.CONST, (dtypes.int8, dtypes.uint8), name="x"), lambda ctx,x: str(x.arg)),
|
28
34
|
])
|
29
35
|
|
30
36
|
class DSPRenderer(ClangRenderer):
|
31
37
|
device = "DSP"
|
32
38
|
supports_float4 = True
|
33
39
|
buffer_suffix = " restrict __attribute__((align_value(128)))"
|
34
|
-
|
40
|
+
kernel_typedef = "__attribute__((noinline)) void"
|
35
41
|
pre_matcher = dsp_pm
|
36
42
|
extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
|
43
|
+
string_rewrite = dsp_string+ClangRenderer.string_rewrite
|
37
44
|
type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
|
38
|
-
code_for_op = {
|
39
|
-
Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
|
40
|
-
Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
|
45
|
+
code_for_op = {k:v for k,v in ClangRenderer.code_for_op.items() if k != Ops.SQRT}
|
41
46
|
|
42
|
-
def
|
43
|
-
|
44
|
-
msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
|
47
|
+
def _render_defines(self, uops) -> list[str]:
|
48
|
+
return ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
|
45
49
|
_Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
|
46
50
|
'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
|
47
51
|
'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
|
48
|
-
'unsigned long long HAP_perf_get_time_us(void);'
|
49
|
-
|
50
|
-
|
52
|
+
'unsigned long long HAP_perf_get_time_us(void);'] + super()._render_defines(uops)
|
53
|
+
|
54
|
+
def _render_entry(self, function_name:str, bufs:list[tuple[str,tuple[DType,bool]]]) -> str:
|
55
|
+
msrc = ['int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
|
56
|
+
'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
|
57
|
+
'HAP_power_set((void*)handle, (void*)&req);']
|
51
58
|
msrc += ['if ((sc>>24) != 2) return 0;']
|
52
59
|
msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
|
53
60
|
msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
|
@@ -57,7 +64,7 @@ class DSPRenderer(ClangRenderer):
|
|
57
64
|
msrc += ["*(unsigned long long *)(pra[2].buf.pv) = HAP_perf_get_time_us() - start;"]
|
58
65
|
msrc += [f'HAP_munmap(buf_{i}, sz_or_val_{i});' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
|
59
66
|
msrc += ["return 0; }"]
|
60
|
-
return
|
67
|
+
return '\n'.join(msrc)
|
61
68
|
|
62
69
|
def rpc_sc(method=0, ins=0, outs=0, fds=0): return (method << 24) | (ins << 16) | (outs << 8) | fds
|
63
70
|
def rpc_prep_args(ins=None, outs=None, in_fds=None):
|
@@ -88,11 +95,7 @@ class DSPBuffer:
|
|
88
95
|
def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
|
89
96
|
self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
|
90
97
|
|
91
|
-
class DSPAllocator(Allocator):
|
92
|
-
def __init__(self, dev:DSPDevice):
|
93
|
-
self.dev = dev
|
94
|
-
super().__init__()
|
95
|
-
|
98
|
+
class DSPAllocator(Allocator['DSPDevice']):
|
96
99
|
def _alloc(self, size:int, options:BufferSpec):
|
97
100
|
b = qcom_dsp.ION_IOC_ALLOC(self.dev.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
|
98
101
|
share_info = qcom_dsp.ION_IOC_SHARE(self.dev.ion_fd, handle=b.handle)
|
@@ -106,8 +109,8 @@ class DSPAllocator(Allocator):
|
|
106
109
|
qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
|
107
110
|
|
108
111
|
def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
|
109
|
-
def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr,
|
110
|
-
def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(
|
112
|
+
def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
|
113
|
+
def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(mv_address(dest), src.va_addr, dest.nbytes)
|
111
114
|
def _offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)
|
112
115
|
|
113
116
|
class ClangCompiler(Compiler):
|
@@ -128,14 +131,19 @@ class ClangCompiler(Compiler):
|
|
128
131
|
class DSPDevice(Compiled):
|
129
132
|
def __init__(self, device:str=""):
|
130
133
|
compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b"]
|
131
|
-
|
134
|
+
if getenv("MOCKDSP"):
|
135
|
+
super().__init__(device, CPUAllocator(self), MockDSPRenderer(),
|
136
|
+
ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)
|
137
|
+
else:
|
132
138
|
self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
|
133
139
|
# Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
|
134
|
-
sections = ['
|
140
|
+
sections = ['text', 'rela.plt', 'rela.dyn', 'plt', 'data', 'bss', 'hash', 'dynamic',
|
141
|
+
'got', 'got.plt', 'dynsym', 'dynstr', 'symtab', 'shstrtab', 'strtab']
|
135
142
|
sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
|
136
143
|
with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
|
137
144
|
self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
|
138
145
|
self.link_ld.flush()
|
146
|
+
|
139
147
|
super().__init__(device, DSPAllocator(self), DSPRenderer(),
|
140
148
|
ClangCompiler("compile_dsp", ["-shared"] + compiler_args + [f"-T{self.link_ld.name}"], 'llvm-objdump'), functools.partial(DSPProgram, self))
|
141
149
|
fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
|
@@ -144,8 +152,6 @@ class DSPDevice(Compiled):
|
|
144
152
|
|
145
153
|
self.init_dsp()
|
146
154
|
RPCListener(self).start()
|
147
|
-
except FileNotFoundError:
|
148
|
-
super().__init__(device, MallocAllocator, MockDSPRenderer(), ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)
|
149
155
|
|
150
156
|
def open_lib(self, lib):
|
151
157
|
self.binded_lib, self.binded_lib_off = lib, 0
|
@@ -169,7 +175,8 @@ class DSPDevice(Compiled):
|
|
169
175
|
except (OSError, PermissionError):
|
170
176
|
# DSP might ask for a connection reset or just fail with operation not permitted, try to reset connection.
|
171
177
|
self.init_dsp()
|
172
|
-
_exec_lib()
|
178
|
+
try: _exec_lib()
|
179
|
+
except (OSError, PermissionError) as e: raise RuntimeError(e)
|
173
180
|
|
174
181
|
def init_dsp(self):
|
175
182
|
if hasattr(self, 'rpc_fd'):
|
@@ -259,8 +266,8 @@ static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd,
|
|
259
266
|
return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
|
260
267
|
|
261
268
|
class MockDSPRenderer(DSPRenderer):
|
262
|
-
def
|
263
|
-
|
269
|
+
def _render_defines(self, uops) -> list[str]: return ClangRenderer._render_defines(self, uops)
|
270
|
+
def _render_entry(self, function_name:str, bufs:list[tuple[str,tuple[DType,bool]]]) -> str:
|
264
271
|
# https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
|
265
272
|
# control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
|
266
273
|
msrc = [mockdsp_boilerplate, 'void _start(void) {']
|
@@ -277,7 +284,7 @@ class MockDSPRenderer(DSPRenderer):
|
|
277
284
|
for i,b in enumerate(bufs):
|
278
285
|
if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
|
279
286
|
msrc.append('exit(0); }')
|
280
|
-
return
|
287
|
+
return '\n'.join(msrc)
|
281
288
|
|
282
289
|
class MockDSPProgram:
|
283
290
|
def __init__(self, name:str, lib:bytes): self.lib = lib
|
@@ -288,11 +295,11 @@ class MockDSPProgram:
|
|
288
295
|
os.chmod(dsp_lib.name, 0o0777)
|
289
296
|
# NOTE: this timing includes a docker launch
|
290
297
|
proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
|
291
|
-
|
292
|
-
|
298
|
+
"qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
|
299
|
+
input=b''.join([bytes(to_mv(x.va_addr, x.size)) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
|
293
300
|
offset = 4
|
294
301
|
for x in bufs:
|
295
|
-
x[:] = proc.stdout[offset:offset+
|
296
|
-
offset +=
|
302
|
+
x.cpu_view()[:] = proc.stdout[offset:offset+x.size]
|
303
|
+
offset += x.size
|
297
304
|
assert offset == len(proc.stdout)
|
298
305
|
return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time
|
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import
|
3
|
-
import ctypes, functools, hashlib
|
2
|
+
from typing import cast
|
3
|
+
import ctypes, functools, hashlib
|
4
4
|
from tinygrad.runtime.autogen import opencl as cl
|
5
|
-
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
|
5
|
+
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address, suppress_finalizing
|
6
6
|
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
|
7
7
|
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError
|
8
8
|
|
@@ -41,15 +41,19 @@ class CLProgram:
|
|
41
41
|
self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
|
42
42
|
|
43
43
|
def __del__(self):
|
44
|
-
|
45
|
-
|
44
|
+
try: check(cl.clReleaseKernel(self.kernel))
|
45
|
+
except (TypeError, AttributeError): pass
|
46
|
+
try: check(cl.clReleaseProgram(self.program))
|
47
|
+
except (TypeError, AttributeError): pass
|
46
48
|
|
47
|
-
def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:
|
49
|
+
def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]|None=None,
|
50
|
+
vals:tuple[int, ...]=(), wait=False) -> float|None:
|
48
51
|
for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
|
49
52
|
for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
|
50
53
|
if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
|
51
54
|
event = cl.cl_event() if wait else None
|
52
|
-
check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size),
|
55
|
+
check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size),
|
56
|
+
(ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event))
|
53
57
|
if wait:
|
54
58
|
assert event is not None
|
55
59
|
check(cl.clWaitForEvents(1, event))
|
@@ -58,16 +62,14 @@ class CLProgram:
|
|
58
62
|
return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
|
59
63
|
return None
|
60
64
|
|
61
|
-
class CLAllocator(LRUAllocator):
|
62
|
-
def __init__(self, dev:CLDevice):
|
63
|
-
self.dev = dev
|
64
|
-
super().__init__()
|
65
|
+
class CLAllocator(LRUAllocator['CLDevice']):
|
65
66
|
def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
|
66
67
|
if options.image is not None:
|
67
68
|
return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
|
68
69
|
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
|
69
70
|
options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
|
70
71
|
return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
|
72
|
+
@suppress_finalizing
|
71
73
|
def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
|
72
74
|
def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
|
73
75
|
if dest[1].image is not None:
|
tinygrad/runtime/ops_hip.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import ctypes, functools
|
2
|
-
from tinygrad.helpers import init_c_var,
|
2
|
+
from tinygrad.helpers import init_c_var, mv_address, init_c_struct_t, getenv
|
3
3
|
from tinygrad.device import Compiled, LRUAllocator, BufferSpec
|
4
4
|
from tinygrad.runtime.autogen import hip
|
5
|
-
from tinygrad.runtime.support.
|
5
|
+
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
6
6
|
from tinygrad.renderer.cstyle import HIPRenderer
|
7
7
|
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
8
8
|
|
@@ -14,7 +14,7 @@ class HIPDevice(Compiled):
|
|
14
14
|
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
15
15
|
self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
|
16
16
|
self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
|
17
|
-
super().__init__(device, HIPAllocator(self), HIPRenderer(),
|
17
|
+
super().__init__(device, HIPAllocator(self), HIPRenderer(self.arch), HIPCompiler(self.arch), functools.partial(HIPProgram, self))
|
18
18
|
def synchronize(self):
|
19
19
|
check(hip.hipSetDevice(self.device_id))
|
20
20
|
check(hip.hipDeviceSynchronize())
|
@@ -50,17 +50,14 @@ class HIPProgram:
|
|
50
50
|
check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.dev.time_event_st, self.dev.time_event_en))
|
51
51
|
return ret.value * 1e-3
|
52
52
|
|
53
|
-
class HIPAllocator(LRUAllocator):
|
54
|
-
def __init__(self, dev:HIPDevice):
|
55
|
-
self.dev = dev
|
56
|
-
super().__init__()
|
53
|
+
class HIPAllocator(LRUAllocator[HIPDevice]):
|
57
54
|
def _alloc(self, size:int, options:BufferSpec):
|
58
55
|
check(hip.hipSetDevice(self.dev.device_id))
|
59
56
|
return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
|
60
57
|
def _free(self, opaque, options:BufferSpec): check(hip.hipFree(opaque))
|
61
58
|
def _copyin(self, dest, src: memoryview):
|
62
59
|
check(hip.hipSetDevice(self.dev.device_id))
|
63
|
-
check(hip.hipMemcpy(dest,
|
60
|
+
check(hip.hipMemcpy(dest, mv_address(src), len(src), hip.hipMemcpyHostToDevice))
|
64
61
|
def _copyout(self, dest:memoryview, src):
|
65
62
|
self.dev.synchronize()
|
66
|
-
check(hip.hipMemcpy(
|
63
|
+
check(hip.hipMemcpy(mv_address(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
|
tinygrad/runtime/ops_llvm.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
-
import ctypes, platform
|
2
|
-
from tinygrad.device import
|
1
|
+
import ctypes, platform, functools, queue
|
2
|
+
from tinygrad.device import Compiler
|
3
|
+
from tinygrad.runtime.support.hcq import HCQCompiled, HCQSignal
|
4
|
+
from tinygrad.runtime.ops_cpu import CPUAllocator, CPUProgram, CPUComputeQueue, CPUWorker
|
3
5
|
from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
|
4
6
|
from tinygrad.renderer.llvmir import LLVMRenderer
|
5
7
|
import tinygrad.runtime.autogen.llvm as llvm
|
@@ -12,15 +14,15 @@ def expect(x, err, ret=None):
|
|
12
14
|
return ret
|
13
15
|
|
14
16
|
class LLVMCompiler(Compiler):
|
15
|
-
|
16
|
-
|
17
|
+
jit = True
|
18
|
+
target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()]
|
19
|
+
def __init__(self, processor:str, feats:str):
|
20
|
+
for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
|
17
21
|
|
18
|
-
triple = {'AArch64': b'aarch64', 'X86': b'x86_64'
|
22
|
+
triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch]
|
19
23
|
target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
|
20
|
-
|
21
|
-
|
22
|
-
if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
|
23
|
-
self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
|
24
|
+
if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}")
|
25
|
+
self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, processor.encode(), feats.encode(),
|
24
26
|
llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)
|
25
27
|
|
26
28
|
self.pbo = llvm.LLVMCreatePassBuilderOptions()
|
@@ -33,11 +35,21 @@ class LLVMCompiler(Compiler):
|
|
33
35
|
else:
|
34
36
|
self.passes = b'default<O0>'
|
35
37
|
|
36
|
-
|
38
|
+
self.diag_msgs: list[str] = []
|
39
|
+
@ctypes.CFUNCTYPE(None, llvm.LLVMDiagnosticInfoRef, ctypes.c_void_p)
|
40
|
+
def handle_diag(diag_ref, _arg):
|
41
|
+
severity = llvm.LLVMGetDiagInfoSeverity(diag_ref)
|
42
|
+
msg = ctypes.string_at(llvm.LLVMGetDiagInfoDescription(diag_ref)).decode()
|
43
|
+
if severity == llvm.LLVMDSError:
|
44
|
+
self.diag_msgs.append(msg)
|
45
|
+
self.handle_diag = handle_diag
|
46
|
+
llvm.LLVMContextSetDiagnosticHandler(llvm.LLVMGetGlobalContext(), handle_diag, None)
|
47
|
+
super().__init__(f"compile_llvm_{self.target_arch}{'_jit' if self.jit else ''}{'_opt' if opt else ''}")
|
37
48
|
|
38
49
|
def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
|
39
50
|
|
40
51
|
def compile(self, src:str) -> bytes:
|
52
|
+
self.diag_msgs.clear()
|
41
53
|
src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
|
42
54
|
mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
|
43
55
|
expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
|
@@ -48,11 +60,19 @@ class LLVMCompiler(Compiler):
|
|
48
60
|
llvm.LLVMDisposeModule(mod)
|
49
61
|
obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
|
50
62
|
llvm.LLVMDisposeMemoryBuffer(obj_buf)
|
51
|
-
|
63
|
+
if self.diag_msgs: raise RuntimeError("llvm diagnostic: " + "\n".join(self.diag_msgs))
|
64
|
+
return jit_loader(obj) if self.jit else obj
|
52
65
|
|
53
66
|
def disassemble(self, lib:bytes): capstone_flatdump(lib)
|
54
67
|
|
55
|
-
class
|
56
|
-
def __init__(self
|
57
|
-
|
58
|
-
|
68
|
+
class HostLLVMCompiler(LLVMCompiler):
|
69
|
+
def __init__(self):
|
70
|
+
# +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
|
71
|
+
cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
|
72
|
+
super().__init__(cpu.decode(), feats.decode())
|
73
|
+
|
74
|
+
class LLVMDevice(HCQCompiled):
|
75
|
+
def __init__(self, device:str=""):
|
76
|
+
self.tasks:queue.Queue = queue.Queue()
|
77
|
+
CPUWorker(self).start()
|
78
|
+
super().__init__(device, CPUAllocator(self), LLVMRenderer(), HostLLVMCompiler(), functools.partial(CPUProgram, self), HCQSignal, CPUComputeQueue)
|