tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. tinygrad/__init__.py +1 -1
  2. tinygrad/apps/llm.py +206 -0
  3. tinygrad/codegen/__init__.py +116 -0
  4. tinygrad/codegen/devectorizer.py +315 -172
  5. tinygrad/codegen/expander.py +8 -16
  6. tinygrad/codegen/gpudims.py +89 -0
  7. tinygrad/codegen/linearize.py +205 -203
  8. tinygrad/codegen/lowerer.py +92 -139
  9. tinygrad/codegen/opt/__init__.py +38 -0
  10. tinygrad/codegen/opt/heuristic.py +125 -0
  11. tinygrad/codegen/opt/kernel.py +510 -0
  12. tinygrad/{engine → codegen/opt}/search.py +51 -35
  13. tinygrad/codegen/opt/swizzler.py +134 -0
  14. tinygrad/codegen/opt/tc.py +127 -0
  15. tinygrad/codegen/quantize.py +67 -0
  16. tinygrad/device.py +122 -132
  17. tinygrad/dtype.py +152 -35
  18. tinygrad/engine/jit.py +81 -54
  19. tinygrad/engine/memory.py +46 -27
  20. tinygrad/engine/realize.py +82 -41
  21. tinygrad/engine/schedule.py +70 -445
  22. tinygrad/frontend/__init__.py +0 -0
  23. tinygrad/frontend/onnx.py +1253 -0
  24. tinygrad/frontend/torch.py +5 -0
  25. tinygrad/gradient.py +19 -27
  26. tinygrad/helpers.py +95 -47
  27. tinygrad/nn/__init__.py +7 -8
  28. tinygrad/nn/optim.py +72 -41
  29. tinygrad/nn/state.py +37 -23
  30. tinygrad/renderer/__init__.py +40 -60
  31. tinygrad/renderer/cstyle.py +143 -128
  32. tinygrad/renderer/llvmir.py +113 -62
  33. tinygrad/renderer/ptx.py +50 -32
  34. tinygrad/renderer/wgsl.py +27 -23
  35. tinygrad/runtime/autogen/am/am.py +5861 -0
  36. tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  37. tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  38. tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  39. tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  40. tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  41. tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  42. tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  43. tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  44. tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
  45. tinygrad/runtime/autogen/comgr.py +35 -9
  46. tinygrad/runtime/autogen/comgr_3.py +906 -0
  47. tinygrad/runtime/autogen/cuda.py +2419 -494
  48. tinygrad/runtime/autogen/hsa.py +57 -16
  49. tinygrad/runtime/autogen/ib.py +7171 -0
  50. tinygrad/runtime/autogen/io_uring.py +917 -118
  51. tinygrad/runtime/autogen/kfd.py +748 -26
  52. tinygrad/runtime/autogen/libc.py +613 -218
  53. tinygrad/runtime/autogen/libusb.py +1643 -0
  54. tinygrad/runtime/autogen/nv/nv.py +8602 -0
  55. tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  56. tinygrad/runtime/autogen/opencl.py +2 -4
  57. tinygrad/runtime/autogen/sqtt.py +1789 -0
  58. tinygrad/runtime/autogen/vfio.py +3 -3
  59. tinygrad/runtime/autogen/webgpu.py +273 -264
  60. tinygrad/runtime/graph/cuda.py +3 -3
  61. tinygrad/runtime/graph/hcq.py +68 -29
  62. tinygrad/runtime/graph/metal.py +29 -13
  63. tinygrad/runtime/graph/remote.py +114 -0
  64. tinygrad/runtime/ops_amd.py +537 -320
  65. tinygrad/runtime/ops_cpu.py +108 -7
  66. tinygrad/runtime/ops_cuda.py +12 -14
  67. tinygrad/runtime/ops_disk.py +13 -10
  68. tinygrad/runtime/ops_dsp.py +47 -40
  69. tinygrad/runtime/ops_gpu.py +13 -11
  70. tinygrad/runtime/ops_hip.py +6 -9
  71. tinygrad/runtime/ops_llvm.py +35 -15
  72. tinygrad/runtime/ops_metal.py +29 -19
  73. tinygrad/runtime/ops_npy.py +5 -3
  74. tinygrad/runtime/ops_null.py +28 -0
  75. tinygrad/runtime/ops_nv.py +306 -234
  76. tinygrad/runtime/ops_python.py +62 -52
  77. tinygrad/runtime/ops_qcom.py +28 -39
  78. tinygrad/runtime/ops_remote.py +482 -0
  79. tinygrad/runtime/ops_webgpu.py +28 -28
  80. tinygrad/runtime/support/am/amdev.py +114 -249
  81. tinygrad/runtime/support/am/ip.py +211 -172
  82. tinygrad/runtime/support/amd.py +138 -0
  83. tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
  84. tinygrad/runtime/support/compiler_cuda.py +8 -11
  85. tinygrad/runtime/support/elf.py +2 -1
  86. tinygrad/runtime/support/hcq.py +184 -97
  87. tinygrad/runtime/support/ib.py +172 -0
  88. tinygrad/runtime/support/llvm.py +3 -4
  89. tinygrad/runtime/support/memory.py +251 -0
  90. tinygrad/runtime/support/nv/__init__.py +0 -0
  91. tinygrad/runtime/support/nv/ip.py +581 -0
  92. tinygrad/runtime/support/nv/nvdev.py +183 -0
  93. tinygrad/runtime/support/system.py +170 -0
  94. tinygrad/runtime/support/usb.py +268 -0
  95. tinygrad/runtime/support/webgpu.py +18 -0
  96. tinygrad/schedule/__init__.py +0 -0
  97. tinygrad/schedule/grouper.py +119 -0
  98. tinygrad/schedule/kernelize.py +368 -0
  99. tinygrad/schedule/multi.py +231 -0
  100. tinygrad/shape/shapetracker.py +40 -46
  101. tinygrad/shape/view.py +88 -52
  102. tinygrad/tensor.py +968 -542
  103. tinygrad/uop/__init__.py +117 -0
  104. tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
  105. tinygrad/uop/mathtraits.py +169 -0
  106. tinygrad/uop/ops.py +1021 -0
  107. tinygrad/uop/spec.py +228 -0
  108. tinygrad/{codegen → uop}/symbolic.py +239 -216
  109. tinygrad/uop/upat.py +163 -0
  110. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  111. tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  112. tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  113. tinygrad/viz/index.html +203 -403
  114. tinygrad/viz/js/index.js +718 -0
  115. tinygrad/viz/js/worker.js +29 -0
  116. tinygrad/viz/serve.py +224 -102
  117. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
  118. tinygrad-0.11.0.dist-info/RECORD +141 -0
  119. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
  120. tinygrad/codegen/kernel.py +0 -693
  121. tinygrad/engine/multi.py +0 -161
  122. tinygrad/ops.py +0 -1003
  123. tinygrad/runtime/ops_cloud.py +0 -220
  124. tinygrad/runtime/support/allocator.py +0 -94
  125. tinygrad/spec.py +0 -155
  126. tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  127. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  128. tinygrad/viz/perfetto.html +0 -178
  129. tinygrad-0.10.2.dist-info/RECORD +0 -99
  130. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
  131. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
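
The renames in the list above (tinygrad/ops.py and tinygrad/spec.py removed, tinygrad/uop/ops.py and tinygrad/uop/spec.py added, tinygrad/{codegen → uop}/symbolic.py moved) and the import changes in the hunks below show the UOp machinery moving from the top-level tinygrad.ops module into a tinygrad.uop package. A minimal before/after sketch of the import migration a downstream user of these internals would make, limited to names that actually appear in the hunks below:

# tinygrad 0.10.2 (old layout)
#   from tinygrad.ops import UOp, Ops, PatternMatcher, UPat
# tinygrad 0.11.0 (new layout, as seen in the ops_dsp.py hunk below)
from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat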

tinygrad/runtime/ops_cpu.py

@@ -1,8 +1,15 @@
- import platform, subprocess, sys
- from tinygrad.helpers import capstone_flatdump, getenv
- from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
+ from __future__ import annotations
+ import platform, subprocess, sys, ctypes, functools, time, mmap, threading, queue
+ from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond, cpu_profile
+ from tinygrad.device import Compiler, BufferSpec, DMACPURef
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
  from tinygrad.runtime.support.elf import jit_loader
  from tinygrad.renderer.cstyle import ClangRenderer
+ from tinygrad.uop.ops import sint
+
+ class CPUSignal(HCQSignal):
+   def _sleep(self, time_spent_waiting_ms:int):
+     if self.is_timeline and self.owner is not None: self.owner.tasks.join()

  class ClangJITCompiler(Compiler):
    def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
@@ -11,14 +18,108 @@ class ClangJITCompiler:
    # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
    # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
    target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-   args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
+   # on arm march means "runs on this arch and superset" instead of "optimize for this arch". x86 march == arm mcpu
+   arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native'
+   args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
    arch_args = ['-ffixed-x18'] if target == 'arm64' else []
    obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
    return jit_loader(obj)

  def disassemble(self, lib:bytes): return capstone_flatdump(lib)

- class ClangDevice(Compiled):
-   def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
+ class CPUWorker(threading.Thread):
+   def __init__(self, dev):
+     super().__init__()
+     self.dev, self.tasks, self.daemon = dev, dev.tasks, True
+
+   def run(self):
+     while True:
+       cmd_iter = iter(self.tasks.get())
+       for cmd in cmd_iter:
+         args_cnt = next(cmd_iter)
+         cmd(*[next(cmd_iter) for _ in range(args_cnt)])
+       self.tasks.task_done()
+
+ class CPUComputeQueue(HWQueue):
+   def _exec(self, prg, bufs, *args):
+     prg.fxn(*map(ctypes.c_uint64, args[:bufs]), *map(ctypes.c_int64 if platform.machine() == "arm64" else ctypes.c_int32, args[bufs:]))
+   def _signal(self, signal_addr, value): to_mv(signal_addr, 4).cast('I')[0] = value
+   def _wait(self, signal_addr, value): wait_cond(lambda: to_mv(signal_addr, 4).cast('I')[0] >= value, timeout_ms=60000)
+   def _timestamp(self, timestamp_addr): to_mv(timestamp_addr, 8).cast('Q')[0] = time.perf_counter_ns()
+   def cmd(self, cmd, *args):
+     self.q(cmd, len(args), *args)
+     return self
+
+   def memory_barrier(self): return self
+   def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
+     return self.cmd(self._exec, prg, len(args_state.bufs), *[x.va_addr for x in args_state.bufs], *args_state.vals)
+   def wait(self, signal, value=0): return self.cmd(self._wait, signal.value_addr, value)
+   def timestamp(self, signal): return self.cmd(self._timestamp, signal.timestamp_addr)
+   def signal(self, signal, value:sint=0): return self.cmd(self._signal, signal.value_addr, value)
+   def _submit(self, dev): dev.tasks.put(self._q[:])
+
+ # NOTE: MAP_JIT is added to mmap module in python 3.13
+ MAP_JIT = 0x0800
+
+ class CPUProgram(HCQProgram):
+   rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
+
+   def __init__(self, dev, name:str, lib:bytes):
+     if sys.platform == "win32":
+       PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
+       ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
+       self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
+       ctypes.memmove(self.mem, lib, len(lib))
+       ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
+       proc = ctypes.windll.kernel32.GetCurrentProcess()
+       ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
+       self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
+     else:
+       # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
+       # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
+       self.mem = mmap.mmap(-1, len(lib), mmap.MAP_ANON|mmap.MAP_PRIVATE|(MAP_JIT if OSX else 0), mmap.PROT_READ|mmap.PROT_WRITE|mmap.PROT_EXEC)
+
+       if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
+       self.mem.write(lib)
+       if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)
+
+       # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
+       # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
+       # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
+       # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
+       CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+
+       self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
+
+     super().__init__(HCQArgsState, dev, name, kernargs_alloc_size=0)
+
+   def __del__(self):
+     if getattr(sys, 'is_finalizing', lambda: True)(): return
+     if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
+
+ class CPUAllocator(HCQAllocatorBase):
+   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+     if options.external_ptr: addr, buf = options.external_ptr, None
+     elif sys.platform == "win32": addr = mv_address(buf:=mmap.mmap(-1, size, access=mmap.ACCESS_WRITE))
+     else: addr = mv_address(buf:=mmap.mmap(-1, size, mmap.MAP_ANON | mmap.MAP_PRIVATE, mmap.PROT_READ | mmap.PROT_WRITE))
+     return HCQBuffer(va:=addr, sz:=size, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)
+   def _as_buffer(self, src) -> memoryview:
+     self.dev.synchronize()
+     return to_mv(src.va_addr, src.size)
+   def _as_dmaref(self, buf):
+     self.dev.synchronize()
+     return DMACPURef(buf.va_addr, buf.size)
+   def _copyin(self, dest, src:memoryview):
+     self.dev.synchronize()
+     with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
+   def _copyout(self, dest:memoryview, src):
+     self.dev.synchronize()
+     with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
+   def _map(self, buf:HCQBuffer):
+     if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")

- CPUDevice = ClangDevice
+ class CPUDevice(HCQCompiled):
+   def __init__(self, device:str=""):
+     self.tasks:queue.Queue = queue.Queue()
+     CPUWorker(self).start()
+     super().__init__(device, CPUAllocator(self), ClangRenderer(), ClangJITCompiler(), functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
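
The new CPU backend above moves from a plain MallocAllocator/CPUProgram device to the HCQ model: CPUComputeQueue.cmd encodes every operation as a flat (callable, argument count, arguments...) stream, _submit puts a copy of that stream on the device's task queue, and CPUWorker replays it on a daemon thread while CPUSignal waits by joining the queue. A minimal standalone sketch of that command-stream pattern, using only the standard library; the names here are illustrative, not tinygrad's API:

import queue, threading

tasks: queue.Queue = queue.Queue()

def worker():
  # replay flat (callable, argc, *args) streams, like CPUWorker.run above
  while True:
    cmd_iter = iter(tasks.get())
    for cmd in cmd_iter:
      argc = next(cmd_iter)                        # how many args follow this command
      cmd(*[next(cmd_iter) for _ in range(argc)])
    tasks.task_done()

threading.Thread(target=worker, daemon=True).start()

stream: list = []
def enqueue(cmd, *args): stream.extend([cmd, len(args), *args])  # like CPUComputeQueue.cmd

enqueue(print, "first command")
enqueue(print, "second", "command")
tasks.put(stream[:])   # submit a copy, like CPUComputeQueue._submit
tasks.join()           # block until the worker drains it, like CPUSignal._sleep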

tinygrad/runtime/ops_cuda.py

@@ -1,11 +1,11 @@
  from __future__ import annotations
  import ctypes, ctypes.util, functools
- from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
+ from tinygrad.helpers import DEBUG, getenv, mv_address, init_c_var, init_c_struct_t, suppress_finalizing
  from tinygrad.device import Compiled, BufferSpec, LRUAllocator
  from tinygrad.renderer.cstyle import CUDARenderer
  from tinygrad.renderer.ptx import PTXRenderer
  from tinygrad.runtime.autogen import cuda
- from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
+ from tinygrad.runtime.support.compiler_cuda import pretty_ptx, CUDACompiler, PTXCompiler, PTX
  if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
  if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported

@@ -34,21 +34,19 @@ class CUDAProgram:
    def __init__(self, dev:CUDADevice, name:str, lib:bytes, smem:int=0):
      self.dev, self.name, self.lib, self.smem = dev, name, lib, smem
      if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
-     if DEBUG >= 6: cuda_disassemble(lib, dev.arch)

      check(cuda.cuCtxSetCurrent(self.dev.context))
      self.module = cuda.CUmodule()
      status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
      if status != 0:
        del self.module
-       cuda_disassemble(lib, dev.arch)
        raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
      check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
      self.prg = prg
      if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem))

-   def __del__(self):
-     if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
+   @suppress_finalizing
+   def __del__(self): check(cuda.cuModuleUnload(self.module))

    def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
      check(cuda.cuCtxSetCurrent(self.dev.context))
@@ -62,27 +60,27 @@ class CUDAProgram:
      for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
      return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait)

- class CUDAAllocator(LRUAllocator):
-   def __init__(self, dev:CUDADevice):
-     self.dev = dev
-     super().__init__()
+ class CUDAAllocator(LRUAllocator['CUDADevice']):
    def _alloc(self, size, options:BufferSpec):
      check(cuda.cuCtxSetCurrent(self.dev.context))
+     if options.external_ptr: return cuda.CUdeviceptr_v2(options.external_ptr)
      if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
      return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
    def _free(self, opaque, options:BufferSpec):
-     if options.host: check(cuda.cuMemFreeHost(opaque))
-     else: check(cuda.cuMemFree_v2(opaque))
+     try:
+       if options.host: check(cuda.cuMemFreeHost(opaque))
+       else: check(cuda.cuMemFree_v2(opaque))
+     except (TypeError, AttributeError): pass
    def _copyin(self, dest, src:memoryview):
      check(cuda.cuCtxSetCurrent(self.dev.context))
      host_mem = self.alloc(len(src), BufferSpec(host=True))
      self.dev.pending_copyin.append((host_mem, len(src), BufferSpec(host=True)))
-     ctypes.memmove(host_mem, from_mv(src), len(src))
+     ctypes.memmove(host_mem, mv_address(src), len(src))
      check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
    def _copyout(self, dest:memoryview, src):
      CUDADevice.synchronize_system()
      check(cuda.cuCtxSetCurrent(self.dev.context))
-     check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
+     check(cuda.cuMemcpyDtoH_v2(mv_address(dest), src, len(dest)))
    def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
      check(cuda.cuCtxSetCurrent(src_dev.context))
      check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
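
The CUDA hunk above (and the OpenCL one further down) replaces ad-hoc guards in __del__ (hasattr checks, contextlib.suppress over TypeError/AttributeError) with a @suppress_finalizing decorator imported from tinygrad.helpers. Its implementation is not shown in this diff; a plausible sketch of such a guard, for illustration only and not necessarily tinygrad's actual code, might look like this:

import sys, functools

def suppress_finalizing(fn):
  # hypothetical sketch: ignore the TypeError/AttributeError that destructors hit
  # when module globals are already torn down at interpreter shutdown
  @functools.wraps(fn)
  def wrapper(*args, **kwargs):
    try: return fn(*args, **kwargs)
    except (TypeError, AttributeError):
      if not sys.is_finalizing(): raise   # outside shutdown, the error still surfaces
  return wrapper

class ModuleHandle:
  @suppress_finalizing
  def __del__(self):
    # in the hunk above this body is check(cuda.cuModuleUnload(self.module));
    # at interpreter exit, names like 'cuda' may already be None, which the decorator absorbs
    pass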

tinygrad/runtime/ops_disk.py

@@ -1,5 +1,5 @@
- import os, sys, mmap, io, ctypes, ctypes.util, contextlib
- from typing import Optional, Generator, Callable
+ import os, sys, mmap, io, ctypes, contextlib, pathlib
+ from typing import Generator, Callable
  from tinygrad.helpers import OSX, round_up
  from tinygrad.device import Compiled, Allocator
  with contextlib.suppress(ImportError):
@@ -12,14 +12,15 @@ class DiskDevice(Compiled):
    def __init__(self, device:str):
      if not DiskDevice._tried_io_uring_init: self._iouring_setup()

-     self.size: Optional[int] = None
-     self.fd: Optional[int] = None
+     self.size: int|None = None
+     self.fd: int|None = None
      self.count = 0
      super().__init__(device, DiskAllocator(self), None, None, None)
-   def _might_open(self, size):
-     self.count += 1
+   def _might_open(self, size:int):
      assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
-     if self.size is not None: return
+     if self.size is not None and hasattr(self.device, "mem"):
+       self.count += 1
+       return
      filename = self.device[len("disk:"):]
      self.size = size

@@ -30,10 +31,11 @@
      else:
        try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|getattr(os, "O_DIRECT", 0))
        except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
-       if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
+       if not pathlib.Path(filename).is_block_device() and os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
      self.mem = mmap.mmap(self.fd, self.size)
      if hasattr(self.mem, 'madvise') and (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
        with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
+     self.count += 1
    def _might_close(self):
      self.count -= 1
      if self.count == 0:
@@ -72,7 +74,7 @@ class DiskBuffer:

  MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
  class DiskAllocator(Allocator):
-   def __init__(self, dev:DiskDevice): self.dev = dev
+   def __init__(self, dev:DiskDevice): super().__init__(dev)
    def _alloc(self, size:int, options):
      self.dev._might_open(size)
      return DiskBuffer(self.dev, size)
@@ -84,7 +86,8 @@ class DiskAllocator(Allocator):
        # OSX doesn't seem great at mmap, this is faster
        with io.FileIO(self.dev.fd, "a+b", closefd=False) as fo:
          fo.seek(src.offset)
-         fo.readinto(dest)
+         bytes_read = 0
+         while (n := fo.readinto(dest[bytes_read:])) is not None and n > 0: bytes_read += n
      else:
        dest[:] = src._buf()


tinygrad/runtime/ops_dsp.py

@@ -1,53 +1,60 @@
  from __future__ import annotations
  import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
  assert sys.platform != 'win32'
- from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
+ from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler
+ from tinygrad.runtime.ops_cpu import CPUAllocator
  from tinygrad.dtype import dtypes, DType, PtrDType
- from tinygrad.ops import Ops, UOp
- from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
+ from tinygrad.uop.ops import Ops, UOp
+ from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
  from tinygrad.renderer.cstyle import ClangRenderer
  from tinygrad.runtime.autogen import libc, qcom_dsp
  if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import

- from tinygrad.ops import PatternMatcher, UPat
+ from tinygrad.uop.ops import PatternMatcher, UPat

  dsp_pm = PatternMatcher([
    (((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
     lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
         arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
    (UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
-    "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
+    "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 and x.src[0].dtype.count > 1 else None),
  ])

  dsp_pm_late = PatternMatcher([
-   (UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
-   (UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
-   (UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
-   (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
-    lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
+   (UPat.var("x")+UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
+   (UPat.var("x")*UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
+   (UPat.var("x")//UPat(Ops.VECTORIZE,src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI else None),
+   (UPat(Ops.DEFINE_REG, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
+    lambda d: d.replace(src=(UOp(Ops.CUSTOMI, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
+ ])
+
+ # NOTE: this just increases readability of the generated code
+ dsp_string = PatternMatcher([
+   (UPat(Ops.CONST, (dtypes.int8, dtypes.uint8), name="x"), lambda ctx,x: str(x.arg)),
  ])

  class DSPRenderer(ClangRenderer):
    device = "DSP"
    supports_float4 = True
    buffer_suffix = " restrict __attribute__((align_value(128)))"
-   kernel_prefix = "__attribute__((noinline)) "
+   kernel_typedef = "__attribute__((noinline)) void"
    pre_matcher = dsp_pm
    extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
+   string_rewrite = dsp_string+ClangRenderer.string_rewrite
    type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
-   code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
-     Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
-     Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
+   code_for_op = {k:v for k,v in ClangRenderer.code_for_op.items() if k != Ops.SQRT}

-   def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
-     ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
-     msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
+   def _render_defines(self, uops) -> list[str]:
+     return ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
       _Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
       'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
       'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
-      'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
-      'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
-      'HAP_power_set((void*)handle, (void*)&req);']
+      'unsigned long long HAP_perf_get_time_us(void);'] + super()._render_defines(uops)
+
+   def _render_entry(self, function_name:str, bufs:list[tuple[str,tuple[DType,bool]]]) -> str:
+     msrc = ['int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
+       'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
+       'HAP_power_set((void*)handle, (void*)&req);']
      msrc += ['if ((sc>>24) != 2) return 0;']
      msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
      msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
@@ -57,7 +64,7 @@ class DSPRenderer(ClangRenderer):
      msrc += ["*(unsigned long long *)(pra[2].buf.pv) = HAP_perf_get_time_us() - start;"]
      msrc += [f'HAP_munmap(buf_{i}, sz_or_val_{i});' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
      msrc += ["return 0; }"]
-     return ret + '\n' + '\n'.join(msrc)
+     return '\n'.join(msrc)

  def rpc_sc(method=0, ins=0, outs=0, fds=0): return (method << 24) | (ins << 16) | (outs << 8) | fds
  def rpc_prep_args(ins=None, outs=None, in_fds=None):
@@ -88,11 +95,7 @@ class DSPBuffer:
    def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
      self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset

- class DSPAllocator(Allocator):
-   def __init__(self, dev:DSPDevice):
-     self.dev = dev
-     super().__init__()
-
+ class DSPAllocator(Allocator['DSPDevice']):
    def _alloc(self, size:int, options:BufferSpec):
      b = qcom_dsp.ION_IOC_ALLOC(self.dev.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
      share_info = qcom_dsp.ION_IOC_SHARE(self.dev.ion_fd, handle=b.handle)
@@ -106,8 +109,8 @@ class DSPAllocator(Allocator):
      qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)

    def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
-   def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
-   def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+   def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
+   def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(mv_address(dest), src.va_addr, dest.nbytes)
    def _offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)

  class ClangCompiler(Compiler):
@@ -128,14 +131,19 @@ class ClangCompiler(Compiler):
  class DSPDevice(Compiled):
    def __init__(self, device:str=""):
      compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b"]
-     try:
+     if getenv("MOCKDSP"):
+       super().__init__(device, CPUAllocator(self), MockDSPRenderer(),
+                        ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)
+     else:
        self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
        # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
-       sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
+       sections = ['text', 'rela.plt', 'rela.dyn', 'plt', 'data', 'bss', 'hash', 'dynamic',
+                   'got', 'got.plt', 'dynsym', 'dynstr', 'symtab', 'shstrtab', 'strtab']
        sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
        with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
          self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
          self.link_ld.flush()
+
        super().__init__(device, DSPAllocator(self), DSPRenderer(),
                         ClangCompiler("compile_dsp", ["-shared"] + compiler_args + [f"-T{self.link_ld.name}"], 'llvm-objdump'), functools.partial(DSPProgram, self))
        fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
@@ -144,8 +152,6 @@ class DSPDevice(Compiled):

        self.init_dsp()
        RPCListener(self).start()
-     except FileNotFoundError:
-       super().__init__(device, MallocAllocator, MockDSPRenderer(), ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)

    def open_lib(self, lib):
      self.binded_lib, self.binded_lib_off = lib, 0
@@ -169,7 +175,8 @@ class DSPDevice(Compiled):
      except (OSError, PermissionError):
        # DSP might ask for a connection reset or just fail with operation not permitted, try to reset connection.
        self.init_dsp()
-       _exec_lib()
+       try: _exec_lib()
+       except (OSError, PermissionError) as e: raise RuntimeError(e)

    def init_dsp(self):
      if hasattr(self, 'rpc_fd'):
@@ -259,8 +266,8 @@ static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd,
    return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''

  class MockDSPRenderer(DSPRenderer):
-   def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
-     ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
+   def _render_defines(self, uops) -> list[str]: return ClangRenderer._render_defines(self, uops)
+   def _render_entry(self, function_name:str, bufs:list[tuple[str,tuple[DType,bool]]]) -> str:
      # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
      # control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
      msrc = [mockdsp_boilerplate, 'void _start(void) {']
@@ -277,7 +284,7 @@ class MockDSPRenderer(DSPRenderer):
      for i,b in enumerate(bufs):
        if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
      msrc.append('exit(0); }')
-     return ret + '\n' + '\n'.join(msrc)
+     return '\n'.join(msrc)

  class MockDSPProgram:
    def __init__(self, name:str, lib:bytes): self.lib = lib
@@ -288,11 +295,11 @@ class MockDSPProgram:
      os.chmod(dsp_lib.name, 0o0777)
      # NOTE: this timing includes a docker launch
      proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
-       "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
-       input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
+       "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
+       input=b''.join([bytes(to_mv(x.va_addr, x.size)) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
      offset = 4
      for x in bufs:
-       x[:] = proc.stdout[offset:offset+len(x)]
-       offset += len(x)
+       x.cpu_view()[:] = proc.stdout[offset:offset+x.size]
+       offset += x.size
      assert offset == len(proc.stdout)
      return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time

tinygrad/runtime/ops_gpu.py

@@ -1,8 +1,8 @@
  from __future__ import annotations
- from typing import Optional, cast
- import ctypes, functools, hashlib, contextlib
+ from typing import cast
+ import ctypes, functools, hashlib
  from tinygrad.runtime.autogen import opencl as cl
- from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
+ from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address, suppress_finalizing
  from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
  from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError

@@ -41,15 +41,19 @@ class CLProgram:
      self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)

    def __del__(self):
-     with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
-     with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))
+     try: check(cl.clReleaseKernel(self.kernel))
+     except (TypeError, AttributeError): pass
+     try: check(cl.clReleaseProgram(self.program))
+     except (TypeError, AttributeError): pass

-   def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
+   def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]|None=None,
+                vals:tuple[int, ...]=(), wait=False) -> float|None:
      for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
      for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
      if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
      event = cl.cl_event() if wait else None
-     check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
+     check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size),
+                                     (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event))
      if wait:
        assert event is not None
        check(cl.clWaitForEvents(1, event))
@@ -58,16 +62,14 @@ class CLProgram:
        return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
      return None

- class CLAllocator(LRUAllocator):
-   def __init__(self, dev:CLDevice):
-     self.dev = dev
-     super().__init__()
+ class CLAllocator(LRUAllocator['CLDevice']):
    def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
      if options.image is not None:
        return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
                        cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
                        options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
      return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
+   @suppress_finalizing
    def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
    def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
      if dest[1].image is not None:

tinygrad/runtime/ops_hip.py

@@ -1,8 +1,8 @@
  import ctypes, functools
- from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
+ from tinygrad.helpers import init_c_var, mv_address, init_c_struct_t, getenv
  from tinygrad.device import Compiled, LRUAllocator, BufferSpec
  from tinygrad.runtime.autogen import hip
- from tinygrad.runtime.support.compiler_hip import AMDCompiler
+ from tinygrad.runtime.support.compiler_amd import HIPCompiler
  from tinygrad.renderer.cstyle import HIPRenderer
  if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import

@@ -14,7 +14,7 @@ class HIPDevice(Compiled):
      self.device_id = int(device.split(":")[1]) if ":" in device else 0
      self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
      self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
-     super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
+     super().__init__(device, HIPAllocator(self), HIPRenderer(self.arch), HIPCompiler(self.arch), functools.partial(HIPProgram, self))
    def synchronize(self):
      check(hip.hipSetDevice(self.device_id))
      check(hip.hipDeviceSynchronize())
@@ -50,17 +50,14 @@ class HIPProgram:
      check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.dev.time_event_st, self.dev.time_event_en))
      return ret.value * 1e-3

- class HIPAllocator(LRUAllocator):
-   def __init__(self, dev:HIPDevice):
-     self.dev = dev
-     super().__init__()
+ class HIPAllocator(LRUAllocator[HIPDevice]):
    def _alloc(self, size:int, options:BufferSpec):
      check(hip.hipSetDevice(self.dev.device_id))
      return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
    def _free(self, opaque, options:BufferSpec): check(hip.hipFree(opaque))
    def _copyin(self, dest, src: memoryview):
      check(hip.hipSetDevice(self.dev.device_id))
-     check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
+     check(hip.hipMemcpy(dest, mv_address(src), len(src), hip.hipMemcpyHostToDevice))
    def _copyout(self, dest:memoryview, src):
      self.dev.synchronize()
-     check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
+     check(hip.hipMemcpy(mv_address(dest), src, len(dest), hip.hipMemcpyDeviceToHost))

tinygrad/runtime/ops_llvm.py

@@ -1,5 +1,7 @@
- import ctypes, platform
- from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
+ import ctypes, platform, functools, queue
+ from tinygrad.device import Compiler
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQSignal
+ from tinygrad.runtime.ops_cpu import CPUAllocator, CPUProgram, CPUComputeQueue, CPUWorker
  from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
  from tinygrad.renderer.llvmir import LLVMRenderer
  import tinygrad.runtime.autogen.llvm as llvm
@@ -12,15 +14,15 @@ def expect(x, err, ret=None):
    return ret

  class LLVMCompiler(Compiler):
-   def __init__(self, host_arch:str):
-     for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
+   jit = True
+   target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()]
+   def __init__(self, processor:str, feats:str):
+     for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()

-     triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
+     triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch]
      target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
-     # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
-     cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
-     if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
-     self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
+     if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}")
+     self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, processor.encode(), feats.encode(),
                                                         llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)

      self.pbo = llvm.LLVMCreatePassBuilderOptions()
@@ -33,11 +35,21 @@ class LLVMCompiler(Compiler):
      else:
        self.passes = b'default<O0>'

-     super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
+     self.diag_msgs: list[str] = []
+     @ctypes.CFUNCTYPE(None, llvm.LLVMDiagnosticInfoRef, ctypes.c_void_p)
+     def handle_diag(diag_ref, _arg):
+       severity = llvm.LLVMGetDiagInfoSeverity(diag_ref)
+       msg = ctypes.string_at(llvm.LLVMGetDiagInfoDescription(diag_ref)).decode()
+       if severity == llvm.LLVMDSError:
+         self.diag_msgs.append(msg)
+     self.handle_diag = handle_diag
+     llvm.LLVMContextSetDiagnosticHandler(llvm.LLVMGetGlobalContext(), handle_diag, None)
+     super().__init__(f"compile_llvm_{self.target_arch}{'_jit' if self.jit else ''}{'_opt' if opt else ''}")

    def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)

    def compile(self, src:str) -> bytes:
+     self.diag_msgs.clear()
      src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
      mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
      expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
@@ -48,11 +60,19 @@ class LLVMCompiler(Compiler):
      llvm.LLVMDisposeModule(mod)
      obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
      llvm.LLVMDisposeMemoryBuffer(obj_buf)
-     return jit_loader(obj)
+     if self.diag_msgs: raise RuntimeError("llvm diagnostic: " + "\n".join(self.diag_msgs))
+     return jit_loader(obj) if self.jit else obj

    def disassemble(self, lib:bytes): capstone_flatdump(lib)

- class LLVMDevice(Compiled):
-   def __init__(self, device:str):
-     compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
-     super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)
+ class HostLLVMCompiler(LLVMCompiler):
+   def __init__(self):
+     # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
+     cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
+     super().__init__(cpu.decode(), feats.decode())
+
+ class LLVMDevice(HCQCompiled):
+   def __init__(self, device:str=""):
+     self.tasks:queue.Queue = queue.Queue()
+     CPUWorker(self).start()
+     super().__init__(device, CPUAllocator(self), LLVMRenderer(), HostLLVMCompiler(), functools.partial(CPUProgram, self), HCQSignal, CPUComputeQueue)