tinygrad-0.8.0-py3-none-any.whl → tinygrad-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/kernel.py +230 -190
  3. tinygrad/codegen/linearizer.py +278 -384
  4. tinygrad/codegen/uops.py +415 -0
  5. tinygrad/device.py +132 -275
  6. tinygrad/dtype.py +53 -37
  7. tinygrad/engine/__init__.py +0 -0
  8. tinygrad/engine/graph.py +100 -0
  9. tinygrad/engine/jit.py +195 -0
  10. tinygrad/engine/realize.py +191 -0
  11. tinygrad/engine/schedule.py +362 -0
  12. tinygrad/engine/search.py +196 -0
  13. tinygrad/{mlops.py → function.py} +28 -14
  14. tinygrad/helpers.py +72 -43
  15. tinygrad/lazy.py +141 -240
  16. tinygrad/multi.py +169 -0
  17. tinygrad/nn/__init__.py +179 -8
  18. tinygrad/nn/datasets.py +7 -0
  19. tinygrad/nn/optim.py +106 -28
  20. tinygrad/nn/state.py +86 -17
  21. tinygrad/ops.py +70 -44
  22. tinygrad/renderer/__init__.py +61 -0
  23. tinygrad/renderer/assembly.py +276 -0
  24. tinygrad/renderer/cstyle.py +299 -206
  25. tinygrad/renderer/llvmir.py +118 -123
  26. tinygrad/runtime/autogen/amd_gpu.py +1900 -0
  27. tinygrad/runtime/autogen/comgr.py +865 -0
  28. tinygrad/runtime/autogen/cuda.py +5923 -0
  29. tinygrad/runtime/autogen/hip.py +5909 -0
  30. tinygrad/runtime/autogen/hsa.py +5761 -0
  31. tinygrad/runtime/autogen/kfd.py +812 -0
  32. tinygrad/runtime/autogen/nv_gpu.py +33328 -0
  33. tinygrad/runtime/autogen/opencl.py +1795 -0
  34. tinygrad/runtime/driver/hip_comgr.py +47 -0
  35. tinygrad/runtime/driver/hsa.py +143 -0
  36. tinygrad/runtime/graph/clang.py +38 -0
  37. tinygrad/runtime/graph/cuda.py +59 -54
  38. tinygrad/runtime/graph/hcq.py +143 -0
  39. tinygrad/runtime/graph/hsa.py +171 -0
  40. tinygrad/runtime/graph/metal.py +37 -41
  41. tinygrad/runtime/ops_amd.py +564 -0
  42. tinygrad/runtime/ops_clang.py +16 -14
  43. tinygrad/runtime/ops_cuda.py +130 -38
  44. tinygrad/runtime/ops_disk.py +45 -42
  45. tinygrad/runtime/ops_gpu.py +52 -50
  46. tinygrad/runtime/ops_hsa.py +278 -0
  47. tinygrad/runtime/ops_llvm.py +36 -56
  48. tinygrad/runtime/ops_metal.py +42 -24
  49. tinygrad/runtime/ops_npy.py +9 -0
  50. tinygrad/runtime/ops_nv.py +630 -0
  51. tinygrad/runtime/ops_python.py +204 -0
  52. tinygrad/shape/shapetracker.py +41 -105
  53. tinygrad/shape/symbolic.py +98 -95
  54. tinygrad/shape/view.py +137 -35
  55. tinygrad/tensor.py +2367 -442
  56. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
  57. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/METADATA +19 -9
  58. tinygrad-0.9.0.dist-info/RECORD +60 -0
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
  60. tinygrad/features/image.py +0 -93
  61. tinygrad/features/multi.py +0 -103
  62. tinygrad/features/search.py +0 -160
  63. tinygrad/graph.py +0 -106
  64. tinygrad/jit.py +0 -152
  65. tinygrad/realize.py +0 -50
  66. tinygrad/runtime/graph/hip.py +0 -24
  67. tinygrad/runtime/ops_cpu.py +0 -45
  68. tinygrad/runtime/ops_hip.py +0 -97
  69. tinygrad/runtime/ops_torch.py +0 -49
  70. tinygrad-0.8.0.dist-info/RECORD +0 -41
  71. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
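
The moves above reshuffle the public module layout: jit.py, realize.py, graph.py and the features/ search code move under engine/, features/multi.py becomes multi.py, and mlops.py becomes function.py. A minimal, hedged sketch of what that implies for imports, assuming each file move maps one-to-one to an import path; TinyJit is used only as an illustrative symbol that lived in tinygrad/jit.py in 0.8.0:

# illustrative import update implied by the file moves listed above
try:
  from tinygrad.engine.jit import TinyJit  # 0.9.0 layout (tinygrad/engine/jit.py)
except ImportError:
  from tinygrad.jit import TinyJit         # 0.8.0 layout (tinygrad/jit.py)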
tinygrad/runtime/ops_amd.py (new file)
@@ -0,0 +1,564 @@
+ from __future__ import annotations
+ from typing import Tuple, List, Any, cast
+ import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
+ from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
+ from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
+ from tinygrad.renderer.cstyle import AMDRenderer
+ from tinygrad.runtime.driver.hip_comgr import compile_hip
+ from tinygrad.runtime.ops_hsa import HSACompiler
+ import tinygrad.runtime.autogen.kfd as kfd
+ import tinygrad.runtime.autogen.hsa as hsa
+ import tinygrad.runtime.autogen.amd_gpu as amd_gpu
+ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
+
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
+ libc.mmap.restype = ctypes.c_void_p
+ libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+ libc.munmap.restype = ctypes.c_int
+
+ if getenv("MOCKGPU"):
+   import extra.mockgpu.mockgpu # noqa: F401
+   libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
+   libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+
+ def is_usable_gpu(gpu_id):
+   try:
+     with gpu_id.open() as f:
+       return int(f.read()) != 0
+   except OSError:
+     return False
+
+ def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
+   made = made_struct or user_struct(**kwargs)
+   ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
+   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
+   return made
+
+ def ioctls_from_header():
+   #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
+   #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
+   #matches = re.findall(pattern, hdr, re.MULTILINE)
+   # get this from python instead
+   hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
+   pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
+   matches = re.findall(pattern, hdrpy, re.MULTILINE)
+   idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
+   fxns = {name.replace("AMDKFD_IOC_", "").lower():
+           functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
+           for name, idir, nr, sname in matches}
+   return type("KIO", (object, ), fxns)
+ kio = ioctls_from_header()
+
+ def create_sdma_packets():
+   # TODO: clean up this, if we want to keep it
+   structs = {}
+   for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
+     names = set()
+     fields = []
+     for pkt_fields in pkt._fields_:
+       if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
+       else:
+         assert pkt_fields[1]._fields_[0][0] == '_0'
+         for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
+           fname = union_fields[0]
+           if fname in names: fname = pkt_fields[0]+fname
+           names.add(fname)
+           # merge together 64-bit fields, otherwise just append them
+           if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
+           else: fields.append(tuple([fname, *union_fields[1:]]))
+     new_name = name[16:-4].lower()
+     structs[new_name] = init_c_struct_t(tuple(fields))
+     assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
+   return type("SDMA_PKTS", (object, ), structs)
+ sdma_pkts = create_sdma_packets()
+
+ class AMDCompiler(Compiler):
+   def __init__(self, arch:str):
+     self.arch = arch
+     super().__init__(f"compile_hip_{self.arch}")
+   def compile(self, src:str) -> bytes:
+     try: return compile_hip(src, self.arch)
+     except RuntimeError as e: raise CompileError(e)
+
+ PAGE_SIZE = 0x1000
+ SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
+ SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
+
+ BASE_ADDR = 0x00001260
+ SUB = amd_gpu.PACKET3_SET_SH_REG_START - BASE_ADDR
+
+ regCOMPUTE_PGM_LO = 0x1bac - SUB
+ regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
+ regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
+ regCOMPUTE_START_X = 0x1ba4 - SUB
+ regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
+ regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
+ regCOMPUTE_RESTART_X = 0x1bbb - SUB
+ regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
+ regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
+ regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
+
+ regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
+ regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+
+ # VGT_EVENT_TYPE in navi10_enum.h
+ CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
+ CS_PARTIAL_FLUSH = 0x7
+
+ WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+ WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
+
+ COMPUTE_SHADER_EN = 1
+ FORCE_START_AT_000 = 1 << 2
+ CS_W32_EN = 1 << 15
+
+ class HWPM4Queue:
+   def __init__(self): self.q = []
+   def ptr(self) -> int: return len(self.q)
+
+   def hdp_flush(self):
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
+       amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | \
+       amd_gpu.WAIT_REG_MEM_ENGINE(0), regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
+
+   def invalidate_cache(self):
+     # overkill?
+     addr=0x0
+     sz=(1 << 64)-1
+     gli=1
+     glv=1
+     glk=1
+     gl1=1
+     gl2=1
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
+       sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
+       amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
+       amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
+       amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
+     return self
+
+   def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
+     self.hdp_flush()
+     self.invalidate_cache()
+
+     code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
+     assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
+     assert code.workitem_private_segment_byte_size == 0
+     assert code.max_scratch_backing_memory_byte_size == 0
+     assert code.kernel_code_prefetch_byte_size == 0
+     rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2
+
+     # this is required
+     lds_size = ((prg.group_segment_size + 511) // 512) & 0x1FF
+     assert lds_size <= 0x80 # larger numbers stall the GPU
+
+     prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
+       (prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
+
+     if signal is not None: self.signal(signal, signal_value)
+     return self
+
+   def update_exec(self, cmd_ptr, global_size, local_size):
+     # Patch the exec cmd with new launch dims
+     assert self.q[cmd_ptr + 67] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),"The pointer does not point to a packet of this type"
+     self.q[cmd_ptr + 59 : cmd_ptr + 62] = local_size
+     self.q[cmd_ptr + 68 : cmd_ptr + 71] = global_size
+
+   def wait(self, signal:hsa.amd_signal_t, value=0):
+     addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
+       amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
+       amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
+     return self
+
+   def timestamp(self, addr):
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+       # event_index__mec_release_mem__end_of_pipe = 5
+       amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5),
+       # * 3 - send 64bit GPU counter value
+       amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(3) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(0) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+       addr&0xFFFFFFFF, addr>>32, 0, 0, 0]
+     return self
+
+   def signal(self, signal:hsa.amd_signal_t, value=0):
+     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+     addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+       # event_index__mec_release_mem__end_of_pipe = 5
+       # event_index__mec_release_mem__shader_done = 6
+       amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
+       amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
+       amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
+       amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
+       amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+       addr&0xFFFFFFFF, addr>>32,
+       value&0xFFFFFFFF, value>>32, 0]
+     if signal.event_mailbox_ptr != 0:
+       self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+         # event_index__mec_release_mem__end_of_pipe = 5
+         # event_index__mec_release_mem__shader_done = 6
+         amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
+         amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
+         amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
+         amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
+         amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+         signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
+         signal.event_id&0xFFFFFFFF, signal.event_id>>32,
+         signal.event_id]
+     return self
+
+   def submit(self, device:AMDDevice):
+     wptr = device.pm4_write_pointer[0]
+     pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
+     for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
+     device.pm4_write_pointer[0] = wptr + len(self.q)
+     device.pm4_doorbell[0] = wptr + len(self.q)
+     return self
+
+ # prebuilt sdma packets
+ sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
+ sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
+                                GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
+                                GCR_CONTROL_GL2_RANGE=0)
+ sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
+                               GCR_CONTROL_GL2_RANGE=0)
+
+ SDMA_MAX_COPY_SIZE = 0x400000
+ class HWCopyQueue:
+   def __init__(self): self.q = []
+
+   def submit(self, device:AMDDevice):
+     read_ptr = device.sdma_read_pointer[0]
+     if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
+     for cmd in self.q:
+       if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
+         ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
+         device.sdma_doorbell_value += fill
+       ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
+       device.sdma_doorbell_value += cmdsz
+     device.sdma_write_pointer[0] = device.sdma_doorbell_value
+     device.sdma_doorbell[0] = device.sdma_doorbell_value
+     return self
+
+   def timestamp(self, addr):
+     self.q.append(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr))
+     return self
+
+   def copy(self, dest, src, copy_size):
+     self.q.append(sdma_flush_hdp_pkt) # TODO: do I need this?
+     self.q.append(sdma_cache_inv)
+     copied = 0
+     copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+     for _ in range(copies_commands):
+       step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
+       self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
+                                           count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
+       copied += step_copy_size
+     self.q.append(sdma_cache_wb)
+     return self
+
+   def signal(self, signal:hsa.amd_signal_t, value=0):
+     self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
+     if signal.event_mailbox_ptr != 0:
+       self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
+       self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
+     return self
+
+   def wait(self, signal:hsa.amd_signal_t, value=0):
+     self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=WAIT_REG_MEM_FUNCTION_GEQ,
+                                         addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
+                                         value=value, mask=0xffffffff, interval=0x04, retry_count=0xfff))
+     return self
+
+ SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
+ class AMDProgram:
+   def __init__(self, device:AMDDevice, name:str, lib:bytes):
+     # TODO; this API needs the type signature of the function and global_size/local_size
+     self.device, self.name, self.lib = device, name, lib
+
+     if DEBUG >= 6:
+       asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
+       print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
+
+     _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
+     sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+
+     lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
+     self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
+     lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+
+     for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
+       if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
+
+     entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
+     self.handle = self.lib_gpu.va_addr + entry_point
+     self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
+     self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
+     self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
+     self.kernargs_offset = 0
+     assert self.private_segment_size <= self.device.max_private_segment_size, \
+       f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
+
+     HWPM4Queue().invalidate_cache().submit(self.device)
+
+   # NOTE: no programs are ever freed
+   def __del__(self):
+     if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     if self.device.kernargs_ptr + self.kernargs_segment_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
+       self.device.kernargs_ptr = self.device.kernargs.va_addr
+     assert self.device.kernargs_ptr + self.kernargs_segment_size <= (self.device.kernargs.va_addr + self.device.kernargs.size), "kernargs overrun"
+     if not hasattr(self, "args_struct_t"):
+       self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
+                                                  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
+       if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
+         raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+     args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
+     for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
+     for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
+
+     q = HWPM4Queue()
+     q.wait(self.device.timeline_signal, self.device.timeline_value - 1)
+     if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+     q.exec(self, self.device.kernargs_ptr, global_size, local_size)
+     if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)
+     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.kernargs_ptr += self.kernargs_segment_size
+
+     if wait:
+       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
+       return (self.device.timeline_signal.end_ts - self.device.timeline_signal.start_ts) / 1e8
+
+ class AMDAllocator(LRUAllocator):
+   def __init__(self, device:AMDDevice):
+     self.device = device
+     # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
+     self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(16)]
+     self.b_timeline = [0] * len(self.b)
+     self.b_next = 0
+     super().__init__()
+
+   def _alloc(self, size:int, options:BufferOptions):
+     try:
+       if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+       else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+     except OSError as e:
+       if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
+       else: raise
+
+   def _free(self, gpumem, options:BufferOptions): self.device._gpu_free(gpumem)
+   #def as_buffer(self, src:Any) -> memoryview:
+   #  self.device.synchronize()
+   #  return to_mv(src.va_addr, src.size)
+
+   #def copy_from_fd(self, dest, fd, offset, size):
+   #  fo = io.FileIO(fd, "a+b", closefd=False)
+   #  fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
+   #  copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
+   #  for i in range(0, size+minor_offset, self.b[0].size):
+   #    local_size = min(self.b[0].size, total_copy_size-i)
+   #    copy_size = min(local_size-minor_offset, size-copied_in)
+   #    if copy_size == 0: break
+
+   #    fo.readinto(to_mv(self.b[1].va_addr, local_size))
+   #    if i != 0: self.device._wait_signal(self.device.signal_sdma)
+   #    self.b = self.b[::-1]
+   #    self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
+
+   #    copied_in += copy_size
+   #    minor_offset = 0 # only on the first
+   #  self.device._wait_signal(self.device.signal_sdma)
+
+   def copyin(self, dest, src: memoryview):
+     for i in range(0, src.nbytes, self.b[0].size):
+       self.b_next = (self.b_next + 1) % len(self.b)
+       AMDDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
+       ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
+       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                    .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
+                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+       self.b_timeline[self.b_next] = self.device.timeline_value
+       self.device.timeline_value += 1
+
+   def copyout(self, dest:memoryview, src):
+     self.device.synchronize()
+     for i in range(0, dest.nbytes, self.b[0].size):
+       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                    .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
+                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+       AMDDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
+       self.device.timeline_value += 1
+
+       ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
+
+   def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
+     src_dev._gpu_map(dest)
+     HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
+                  .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
+                  .copy(dest.va_addr, src.va_addr, sz) \
+                  .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
+     HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
+     src_dev.timeline_value += 1
+
+ MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+ class AMDDevice(Compiled):
+   kfd:int = -1
+   event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
+   signals_page:Any = None
+   signals_pool:List[hsa.amd_signal_t] = []
+   gpus:List[pathlib.Path] = []
+
+   def _gpu_map(self, mem):
+     if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
+     mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
+     c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
+     stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
+     assert stm.n_success == len(mem.mapped_gpu_ids)
+
+   def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
+     flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
+     if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
+     if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
+     if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
+       buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+     else:
+       buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
+     assert addr != 0xffffffffffffffff
+     mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+     if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
+       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
+       assert addr == buf == mem.va_addr
+     if map_to_gpu: self._gpu_map(mem)
+     return mem
+
+   def _gpu_free(self, mem):
+     if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+       c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
+       stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+       assert stm.n_success == len(gpus)
+     libc.munmap(mem.va_addr, mem.size)
+     kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
+
+   @classmethod
+   def _set_signal(self, sig, value): sig.value = value
+
+   @classmethod
+   def _get_signal(self, value=0, sync_event=None) -> hsa.amd_signal_t:
+     self._set_signal(ret := self.signals_pool.pop(), value)
+     if sync_event is not None:
+       ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
+       ret.event_id = sync_event.event_id
+     else: ret.event_mailbox_ptr = ret.event_id = 0
+     return ret
+
+   @classmethod
+   def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
+     assert signal.event_id != 0, "can't wait on this signal"
+     evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
+
+     start_time = time.time() * 1000
+     while (time.time() * 1000 - start_time) < timeout:
+       if signal.value >= value: return
+       kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=100)
+     raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
+
+   def __init__(self, device:str=""):
+     if AMDDevice.kfd == -1:
+       AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
+       AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
+     with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
+     self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
+     target = int(self.properties['gfx_target_version'])
+     self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
+     kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+
+     if AMDDevice.event_page is None:
+       AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+       AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+       for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
+         AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
+       sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+     else:
+       self._gpu_map(AMDDevice.signals_page)
+       self._gpu_map(AMDDevice.event_page)
+       sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+
+     self.timeline_value: int = 1
+     self.timeline_signal = AMDDevice._get_signal(sync_event=sync_event)
+     self._shadow_timeline_signal = AMDDevice._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
+
+     self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+     self.kernargs_ptr = self.kernargs.va_addr
+
+     # scratch setup
+     max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
+     max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
+     self.max_private_segment_size = 4096
+     wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
+     self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
+     self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+
+     # SDMA Queue
+     self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
+       queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+       write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
+
+     # doorbell page
+     self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+     self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+
+     self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
+     self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
+     self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
+     self.sdma_doorbell_value = 0
+
+     # PM4 Queue
+     self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+     self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+     self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+     self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
+       queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+       eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
+       # TODO: are these needed? (i know eop is)
+       ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
+       ctl_stack_size = 0xa000,
+       write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)
+
+     self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
+     self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
+     self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
+
+     from tinygrad.runtime.graph.hcq import HCQGraph
+     super().__init__(device, AMDAllocator(self), AMDRenderer(), HSACompiler(self.arch),
+                      functools.partial(AMDProgram, self),
+                      functools.partial(HCQGraph, AMDDevice, HWPM4Queue, HWCopyQueue))
+
+   def synchronize(self):
+     AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+
+     # reset kernargs
+     self.kernargs_ptr = self.kernargs.va_addr
+     if self.timeline_value > (1 << 31):
+       self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
+       self.timeline_signal.value, self.timeline_value = 0, 1
+       cast(AMDAllocator, self.allocator).b_timeline = [0] * len(cast(AMDAllocator, self.allocator).b)
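
For reference, the ioctl command word that kfd_ioctl() builds above follows the kernel's _IOW/_IOR/_IOWR layout: direction in bits 30-31, struct size in bits 16-29, the 'K' magic in bits 8-15, and the request number in bits 0-7. A small self-contained sketch; the struct and request number are hypothetical stand-ins, not entries from kfd_ioctl.h:

import ctypes

class struct_example_args(ctypes.Structure):  # stand-in struct, 8 bytes
  _fields_ = [("gpu_id", ctypes.c_uint32), ("pad", ctypes.c_uint32)]

def amdkfd_cmd(idir:int, nr:int, struct_t) -> int:
  # same packing as kfd_ioctl() above: dir<<30 | size<<16 | 'K'<<8 | nr
  return (idir<<30) | (ctypes.sizeof(struct_t)<<16) | (ord('K')<<8) | nr

print(hex(amdkfd_cmd(3, 0x01, struct_example_args)))  # 0xc0084b01: IOWR, 8-byte struct, nr 0x01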
tinygrad/runtime/ops_clang.py
@@ -1,19 +1,19 @@
- import ctypes, subprocess, functools, pathlib, tempfile
- from tinygrad.device import Compiled, MallocAllocator
- from tinygrad.helpers import cpu_time_execution
- from tinygrad.codegen.kernel import LinearizerOptions
- from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+ import ctypes, subprocess, pathlib, tempfile
+ from tinygrad.device import Compiled, Compiler, MallocAllocator
+ from tinygrad.helpers import cpu_time_execution, DEBUG, cpu_objdump
+ from tinygrad.renderer.cstyle import ClangRenderer

- CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
-
- def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
-   # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
-   with tempfile.NamedTemporaryFile(delete=True) as output_file:
-     subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8')) # noqa: E501
-     return pathlib.Path(output_file.name).read_bytes()
+ class ClangCompiler(Compiler):
+   def compile(self, src:str) -> bytes:
+     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+     with tempfile.NamedTemporaryFile(delete=True) as output_file:
+       subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
+                                '-o', str(output_file.name)], input=src.encode('utf-8'))
+       return pathlib.Path(output_file.name).read_bytes()

  class ClangProgram:
    def __init__(self, name:str, lib:bytes):
+     if DEBUG >= 6: cpu_objdump(lib)
      self.name, self.lib = name, lib
      # write to disk so we can load it
      with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
@@ -22,5 +22,7 @@ class ClangProgram:

    def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)

- renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict"))
- ClangDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
+ class ClangDevice(Compiled):
+   def __init__(self, device:str):
+     from tinygrad.runtime.graph.clang import ClangGraph
+     super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler("compile_clang"), ClangProgram, ClangGraph)
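
A hedged usage sketch for the 0.9.0 Clang backend shown above: compile a small C function with ClangCompiler and call it through ClangProgram. The C source, function name, and buffers are illustrative choices, not an official tinygrad example:

import ctypes
from tinygrad.runtime.ops_clang import ClangCompiler, ClangProgram

# compile a one-liner C kernel to a shared object, then load and call it
lib = ClangCompiler("compile_clang").compile("void add1(float *out, float *a) { out[0] = a[0] + 1.0f; }")
prg = ClangProgram("add1", lib)
out, a = (ctypes.c_float * 1)(), (ctypes.c_float * 1)(2.0)
prg(out, a)
print(out[0])  # expected: 3.0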