tinygrad-0.8.0-py3-none-any.whl → tinygrad-0.9.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
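
Note the internal reorganization in this release: mlops.py is renamed to function.py, the old jit.py, realize.py, graph.py, and features/ modules are replaced by the new tinygrad/engine/ package, and the HIP, CPU, and Torch runtimes give way to the new AMD, NV, and PYTHON backends. Downstream code that imported these internals needs updated import paths; a small illustrative sketch (TinyJit is just one hypothetical example of such an import, not something shown in this diff):

    # tinygrad 0.8.0
    from tinygrad.jit import TinyJit
    # tinygrad 0.9.1: the JIT now lives in the engine package
    from tinygrad.engine.jit import TinyJit
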
tinygrad/runtime/ops_amd.py (new file)
@@ -0,0 +1,550 @@
+ from __future__ import annotations
+ from typing import Tuple, List, Any
+ import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
+ from dataclasses import dataclass
+ from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+ from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
+ from tinygrad.renderer.cstyle import AMDRenderer
+ from tinygrad.runtime.driver.hip_comgr import compile_hip
+ import tinygrad.runtime.autogen.kfd as kfd
+ import tinygrad.runtime.autogen.hsa as hsa
+ import tinygrad.runtime.autogen.amd_gpu as amd_gpu
+ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
+
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
+ libc.mmap.restype = ctypes.c_void_p
+ libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+ libc.munmap.restype = ctypes.c_int
+
+ if getenv("MOCKGPU"):
+   import extra.mockgpu.mockgpu # noqa: F401
+   libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
+   libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+
+ def is_usable_gpu(gpu_id):
+   try:
+     with gpu_id.open() as f:
+       return int(f.read()) != 0
+   except OSError:
+     return False
+
+ def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
+   made = made_struct or user_struct(**kwargs)
+   ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
+   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
+   return made
+
+ def ioctls_from_header():
+   #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
+   #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
+   #matches = re.findall(pattern, hdr, re.MULTILINE)
+   # get this from python instead
+   hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
+   pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
+   matches = re.findall(pattern, hdrpy, re.MULTILINE)
+   idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
+   fxns = {name.replace("AMDKFD_IOC_", "").lower():
+           functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
+           for name, idir, nr, sname in matches}
+   return type("KIO", (object, ), fxns)
+ kio = ioctls_from_header()
+
+ SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
+ SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
+
+ regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
+ regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+
+ # VGT_EVENT_TYPE in navi10_enum.h
+ CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
+
+ WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+ WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
+
+ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
+
+ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
+ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
+ def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)
+
+ class AMDCompiler(Compiler):
+   def __init__(self, arch:str):
+     self.arch = arch
+     super().__init__(f"compile_hip_{self.arch}")
+   def compile(self, src:str) -> bytes:
+     try: return compile_hip(src, self.arch)
+     except RuntimeError as e: raise CompileError(e) from e
+
+ class HWQueue:
+   def __init__(self): self.q, self.cmd_offsets = [], [0]
+   def _mark_command_end(self):
+     self.cmd_offsets.append(len(self.q))
+     return self
+   def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
+   def __len__(self): return len(self.cmd_offsets) - 1
+
+ class HWPM4Queue(HWQueue):
+   def __init__(self):
+     self.binded_device, self.ptr_to_dispatch_packet = None, {}
+     super().__init__()
+
+   def __del__(self):
+     if self.binded_device is not None:
+       self.binded_device.synchronize()
+       self.binded_device._gpu_free(self.hw_page)
+
+   def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
+                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
+                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
+                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
+                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
+                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
+
+   def memory_barrier(self):
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
+                amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
+                nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
+     self._invalidate_cache()
+     return self._mark_command_end()
+
+   def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
+     self._invalidate_cache()
+
+     user_data = [*data64_le(kernargs)]
+     if hasattr(prg, 'dispatch_packet_offset'):
+       dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
+       dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+       dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+       dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
+       user_data = [*data64_le(dp_addr)] + user_data
+       self.ptr_to_dispatch_packet[len(self)] = dp
+
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
+                *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
+
+     if signal is not None: self.signal(signal, signal_value)
+     return self._mark_command_end()
+
+   def update_exec(self, cmd_idx, global_size, local_size):
+     # Patch the exec cmd with new launch dims
+     assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
+     self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
+     self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+
+     if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
+       dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+       dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+
+   def wait(self, signal:hsa.amd_signal_t, value=0):
+     addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
+                amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
+                amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
+     return self._mark_command_end()
+
+   def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+     cache_flush_flags = 0
+
+     if cache_flush:
+       cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+         amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+         amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+     # event_index__mec_release_mem__end_of_pipe = 5
+     # event_index__mec_release_mem__shader_done = 6
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+                amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+                amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+                *data64_le(address), *data64_le(value), cst]
+
+   def timestamp(self, sig):
+     self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
+                       address=ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+     return self._mark_command_end()
+
+   def signal(self, signal:hsa.amd_signal_t, value=0):
+     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+     self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
+                       value=value, cache_flush=True)
+     if signal.event_mailbox_ptr != 0:
+       self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
+                         value=signal.event_id, cst=signal.event_id, cache_flush=True)
+     return self._mark_command_end()
+
+   def update_wait(self, cmd_idx, signal=None, value=None):
+     assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
+     if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+     if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
+     return self
+
+   def update_signal(self, cmd_idx, signal=None, value=None):
+     assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
+     if signal is not None:
+       self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+       if signal.event_mailbox_ptr != 0:
+         self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
+     if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
+     return self
+
+   def bind(self, device: AMDDevice):
+     self.binded_device = device
+     self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+     for i, value in enumerate(self.q): hw_view[i] = value
+
+     self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
+                          len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
+     self.q = hw_view # type: ignore
+
+   def submit(self, device: AMDDevice):
+     cmds = self.indirect_cmd if device == self.binded_device else self.q
+
+     for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
+
+     device.compute_queue.put_value += len(cmds)
+     device.compute_queue.write_ptr[0] = device.compute_queue.put_value
+     device.compute_queue.doorbell[0] = device.compute_queue.put_value
+     return self
+
+ SDMA_MAX_COPY_SIZE = 0x400000
+ class HWCopyQueue(HWQueue):
+   def __init__(self):
+     self.internal_cmd_sizes = []
+     super().__init__()
+
+   def _q(self, arr):
+     self.q += arr
+     self.internal_cmd_sizes.append(len(arr))
+
+   def copy(self, dest, src, copy_size):
+     # Invalidate cache inv
+     self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
+              amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
+
+     copied = 0
+     copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+     for _ in range(copy_commands):
+       step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
+
+       self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+                amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
+
+       copied += step_copy_size
+
+     # Invalidate cache wb
+     self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
+
+     return self._mark_command_end()
+
+   def signal(self, signal: hsa.amd_signal_t, value=0):
+     self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
+
+     if signal.event_mailbox_ptr != 0:
+       self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
+       self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
+
+     return self._mark_command_end()
+
+   def wait(self, signal: hsa.amd_signal_t, value=0):
+     self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+              amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
+              amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
+
+     return self._mark_command_end()
+
+   def update_wait(self, cmd_idx, signal=None, value=None):
+     assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
+     if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+     if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
+     return self
+
+   def timestamp(self, sig: hsa.amd_signal_t):
+     self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+              *data64_le(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
+     return self._mark_command_end()
+
+   def submit(self, device: AMDDevice):
+     if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
+
+     tail_blit_dword = 0
+     for cmdsz in self.internal_cmd_sizes:
+       if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
+       tail_blit_dword += cmdsz
+
+     start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
+     device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
+     device.sdma_queue.put_value += tail_blit_dword * 4
+
+     if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
+       zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
+       ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
+       device.sdma_queue.put_value += zero_fill
+
+       device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
+       device.sdma_queue.put_value += rem_packet_cnt * 4
+
+     device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
+     device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
+     return self
+
+ SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
+ class AMDProgram:
+   def __init__(self, device:AMDDevice, name:str, lib:bytes):
+     # TODO: this API needs the type signature of the function and global_size/local_size
+     self.device, self.name, self.lib = device, name, lib
+
+     if DEBUG >= 6:
+       asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
+       print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
+
+     _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
+     sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+
+     lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
+     self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
+     lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+
+     for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
+       if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
+
+     entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
+     self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
+     self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
+     self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
+     self.kernargs_alloc_size = self.kernargs_segment_size
+     self.kernargs_offset = 0
+
+     lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
+     if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
+     if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requested: private_segment_size")
+
+     code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
+     self.rsrc1 = code.compute_pgm_rsrc1
+     self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
+
+     if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
+       # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
+       self.dispatch_packet_offset = self.kernargs_alloc_size
+       self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
+
+     assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
+     assert code.workitem_private_segment_byte_size == 0
+     assert code.max_scratch_backing_memory_byte_size == 0
+     assert code.kernel_code_prefetch_byte_size == 0
+
+     self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+
+     HWPM4Queue().memory_barrier().submit(self.device)
+
+   # NOTE: no programs are ever freed
+   def __del__(self):
+     if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
+       self.device.kernargs_ptr = self.device.kernargs.va_addr
+
+     if not hasattr(self, "args_struct_t"):
+       self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
+                                                  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
+       if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
+         raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+
+     args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
+     for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
+     for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
+
+     sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
+
+     q = HWPM4Queue()
+     q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+     if wait or PROFILE: q.timestamp(sig_st)
+     q.exec(self, self.device.kernargs_ptr, global_size, local_size)
+     if wait or PROFILE: q.timestamp(sig_en)
+     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.kernargs_ptr += self.kernargs_alloc_size
+
+     if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
+     if wait:
+       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
+       return (sig_en.start_ts - sig_st.start_ts) / 1e8
+
+ class AMDAllocator(HCQCompatAllocator):
+   def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
+
+   def _alloc(self, size:int, options:BufferOptions):
+     try:
+       if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+       return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+     except OSError as e:
+       if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
+       raise
+
+   def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)
+
+ MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+
+ @dataclass
+ class AMDQueueDesc:
+   ring: memoryview
+   read_ptr: memoryview
+   write_ptr: memoryview
+   doorbell: memoryview
+   put_value: int = 0
+
+ class AMDDevice(HCQCompatCompiled):
+   kfd:int = -1
+   event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
+   signals_page:Any = None
+   signals_pool:List[hsa.amd_signal_t] = []
+   gpus:List[pathlib.Path] = []
+
+   def _gpu_map(self, mem):
+     if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
+     mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
+     c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
+     stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
+     assert stm.n_success == len(mem.mapped_gpu_ids)
+
+   def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
+     flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
+     if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
+     if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
+     if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
+       buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+     else:
+       buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
+     assert addr != 0xffffffffffffffff
+     mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+     if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
+       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
+       assert addr == buf == mem.va_addr
+     if map_to_gpu: self._gpu_map(mem)
+     return mem
+
+   def _gpu_free(self, mem):
+     if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+       c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
+       stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+       assert stm.n_success == len(gpus)
+     libc.munmap(mem.va_addr, mem.size)
+     kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
+
+   @classmethod
+   def _read_signal(self, sig): return sig.value
+
+   @classmethod
+   def _read_timestamp(self, sig): return sig.start_ts
+
+   @classmethod
+   def _set_signal(self, sig, value): sig.value = value
+
+   @classmethod
+   def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
+     self._set_signal(ret := self.signals_pool.pop(), value)
+     if (sync_event:=kwargs.get('sync_event')) is not None:
+       ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
+       ret.event_id = sync_event.event_id
+     else: ret.event_mailbox_ptr = ret.event_id = 0
+     return ret
+
+   @classmethod
+   def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
+     assert signal.event_id != 0, "can't wait on this signal"
+     evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
+
+     # Wait actively for 5s, then go to sleep.
+     start_time = time.time() * 1000
+     while (time_spent:=time.time() * 1000 - start_time) < timeout:
+       if signal.value >= value: return
+       if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
+     raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
+
+   def __init__(self, device:str=""):
+     if AMDDevice.kfd == -1:
+       AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
+       AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
+     with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
+     self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
+     target = int(self.properties['gfx_target_version'])
+     self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
+     kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+
+     if AMDDevice.event_page is None:
+       AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+       AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+       for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
+         AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
+       sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+     else:
+       self._gpu_map(AMDDevice.signals_page)
+       self._gpu_map(AMDDevice.event_page)
+       sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+
+     self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()
+
+     self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+     self.kernargs_ptr = self.kernargs.va_addr
+
+     # Scratch setup
+     max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
+     max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
+     self.max_private_segment_size = 4096
+     wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
+     self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
+     self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+     engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
+     self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
+
+     self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
+     self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+
+     super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), HWPM4Queue, HWCopyQueue,
+                      timeline_signals=[self._get_signal(sync_event=sync_event), self._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))])
+
+   def _gpu2cpu_time(self, gpu_time, is_copy):
+     if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
+     return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+
+   def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
+     gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
+     eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
+     queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
+                              queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+                              eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
+                              ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
+                              write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
+
+     if not hasattr(self, 'doorbells'):
+       self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+       self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+
+     return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
+                         read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
+                         doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
+
+   def synchronize(self):
+     AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+
+     # reset kernargs
+     self.kernargs_ptr = self.kernargs.va_addr
+     if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
+     if PROFILE: self._prof_process_events()
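
The 550-line hunk above is the new user-space KFD driver behind tinygrad/runtime/ops_amd.py: it opens /dev/kfd and the DRM render node directly, allocates GPU memory through KFD ioctls, and emits PM4 compute and SDMA copy packets from Python. A minimal smoke-test sketch, not part of the package (it assumes a supported RDNA3 GPU, permission to open /dev/kfd, and that the backend is selected by the device name "AMD"):

    from tinygrad import Tensor

    # "AMD" routes the computation through the KFD/PM4 backend added above
    x = Tensor.rand(256, 256, device="AMD")
    y = (x @ x).relu()
    print(y.numpy().sum())
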
tinygrad/runtime/ops_clang.py
@@ -1,19 +1,19 @@
- import ctypes, subprocess, functools, pathlib, tempfile
- from tinygrad.device import Compiled, MallocAllocator
- from tinygrad.helpers import cpu_time_execution
- from tinygrad.codegen.kernel import LinearizerOptions
- from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+ import ctypes, subprocess, pathlib, tempfile
+ from tinygrad.device import Compiled, Compiler, MallocAllocator
+ from tinygrad.helpers import cpu_time_execution, DEBUG, cpu_objdump
+ from tinygrad.renderer.cstyle import ClangRenderer

- CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
-
- def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
-   # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
-   with tempfile.NamedTemporaryFile(delete=True) as output_file:
-     subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8')) # noqa: E501
-     return pathlib.Path(output_file.name).read_bytes()
+ class ClangCompiler(Compiler):
+   def compile(self, src:str) -> bytes:
+     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+     with tempfile.NamedTemporaryFile(delete=True) as output_file:
+       subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
+                                '-o', str(output_file.name)], input=src.encode('utf-8'))
+       return pathlib.Path(output_file.name).read_bytes()

  class ClangProgram:
    def __init__(self, name:str, lib:bytes):
+     if DEBUG >= 6: cpu_objdump(lib)
      self.name, self.lib = name, lib
      # write to disk so we can load it
      with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
@@ -22,5 +22,7 @@ class ClangProgram:

    def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)

- renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict"))
- ClangDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
+ class ClangDevice(Compiled):
+   def __init__(self, device:str):
+     from tinygrad.runtime.graph.clang import ClangGraph
+     super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler("compile_clang"), ClangProgram, ClangGraph)
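
The ops_clang.py rewrite follows the same pattern as the other 0.9 backends: the module-level compile_clang helper and Compiled instance become a ClangCompiler class and a ClangDevice subclass of Compiled, which also passes ClangGraph in as its graph runner. A hedged usage sketch, not taken from the diff (it assumes the backend is selected by the device name "CLANG" and that a system clang is installed):

    from tinygrad import Tensor

    # "CLANG" instantiates ClangDevice on first use and compiles kernels with the system clang
    t = Tensor([1.0, 2.0, 3.0], device="CLANG")
    print((t * 2).numpy())
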