tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. tinygrad/codegen/__init__.py +0 -0
  2. tinygrad/codegen/kernel.py +78 -90
  3. tinygrad/codegen/linearizer.py +237 -169
  4. tinygrad/codegen/uops.py +278 -242
  5. tinygrad/device.py +147 -10
  6. tinygrad/dtype.py +7 -7
  7. tinygrad/engine/graph.py +16 -16
  8. tinygrad/engine/jit.py +39 -36
  9. tinygrad/engine/realize.py +6 -5
  10. tinygrad/engine/schedule.py +15 -7
  11. tinygrad/engine/search.py +6 -3
  12. tinygrad/function.py +17 -23
  13. tinygrad/helpers.py +77 -8
  14. tinygrad/lazy.py +26 -26
  15. tinygrad/multi.py +13 -9
  16. tinygrad/nn/__init__.py +1 -1
  17. tinygrad/nn/datasets.py +2 -1
  18. tinygrad/nn/state.py +3 -4
  19. tinygrad/ops.py +49 -16
  20. tinygrad/renderer/__init__.py +8 -4
  21. tinygrad/renderer/assembly.py +93 -100
  22. tinygrad/renderer/cstyle.py +47 -42
  23. tinygrad/renderer/llvmir.py +30 -30
  24. tinygrad/runtime/__init__.py +0 -0
  25. tinygrad/runtime/autogen/amd_gpu.py +11504 -1
  26. tinygrad/runtime/autogen/comgr.py +36 -10
  27. tinygrad/runtime/autogen/hsa.py +146 -14
  28. tinygrad/runtime/autogen/io_uring.py +1486 -0
  29. tinygrad/runtime/autogen/nv_gpu.py +269 -0
  30. tinygrad/runtime/driver/__init__.py +0 -0
  31. tinygrad/runtime/driver/hip_comgr.py +20 -11
  32. tinygrad/runtime/graph/__init__.py +0 -0
  33. tinygrad/runtime/graph/clang.py +3 -2
  34. tinygrad/runtime/graph/cuda.py +2 -2
  35. tinygrad/runtime/graph/hcq.py +122 -78
  36. tinygrad/runtime/ops_amd.py +302 -316
  37. tinygrad/runtime/ops_cuda.py +3 -3
  38. tinygrad/runtime/ops_disk.py +70 -5
  39. tinygrad/runtime/ops_gpu.py +2 -2
  40. tinygrad/runtime/ops_metal.py +5 -6
  41. tinygrad/runtime/ops_npy.py +1 -1
  42. tinygrad/runtime/ops_nv.py +161 -166
  43. tinygrad/runtime/ops_python.py +20 -16
  44. tinygrad/shape/__init__.py +0 -0
  45. tinygrad/shape/shapetracker.py +5 -2
  46. tinygrad/shape/symbolic.py +1 -3
  47. tinygrad/shape/view.py +34 -19
  48. tinygrad/tensor.py +219 -135
  49. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
  50. tinygrad-0.9.1.dist-info/RECORD +63 -0
  51. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  52. tinygrad/runtime/driver/hsa.py +0 -143
  53. tinygrad/runtime/graph/hsa.py +0 -171
  54. tinygrad/runtime/ops_hsa.py +0 -278
  55. tinygrad-0.9.0.dist-info/RECORD +0 -60
  56. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
  57. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
  from __future__ import annotations
- from typing import Tuple, List, Any, cast
- import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
- from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
- from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
+ from typing import Tuple, List, Any
+ import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
+ from dataclasses import dataclass
+ from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+ from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
  from tinygrad.renderer.cstyle import AMDRenderer
  from tinygrad.runtime.driver.hip_comgr import compile_hip
- from tinygrad.runtime.ops_hsa import HSACompiler
  import tinygrad.runtime.autogen.kfd as kfd
  import tinygrad.runtime.autogen.hsa as hsa
  import tinygrad.runtime.autogen.amd_gpu as amd_gpu
@@ -50,237 +50,254 @@ def ioctls_from_header():
  return type("KIO", (object, ), fxns)
  kio = ioctls_from_header()

- def create_sdma_packets():
- # TODO: clean up this, if we want to keep it
- structs = {}
- for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
- names = set()
- fields = []
- for pkt_fields in pkt._fields_:
- if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
- else:
- assert pkt_fields[1]._fields_[0][0] == '_0'
- for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
- fname = union_fields[0]
- if fname in names: fname = pkt_fields[0]+fname
- names.add(fname)
- # merge together 64-bit fields, otherwise just append them
- if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
- else: fields.append(tuple([fname, *union_fields[1:]]))
- new_name = name[16:-4].lower()
- structs[new_name] = init_c_struct_t(tuple(fields))
- assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
- return type("SDMA_PKTS", (object, ), structs)
- sdma_pkts = create_sdma_packets()
-
- class AMDCompiler(Compiler):
- def __init__(self, arch:str):
- self.arch = arch
- super().__init__(f"compile_hip_{self.arch}")
- def compile(self, src:str) -> bytes:
- try: return compile_hip(src, self.arch)
- except RuntimeError as e: raise CompileError(e)
-
- PAGE_SIZE = 0x1000
- SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
+ SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
  SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset

- BASE_ADDR = 0x00001260
- SUB = amd_gpu.PACKET3_SET_SH_REG_START - BASE_ADDR
-
- regCOMPUTE_PGM_LO = 0x1bac - SUB
- regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
- regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
- regCOMPUTE_START_X = 0x1ba4 - SUB
- regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
- regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
- regCOMPUTE_RESTART_X = 0x1bbb - SUB
- regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
- regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
- regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
-
  regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
  regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107

  # VGT_EVENT_TYPE in navi10_enum.h
  CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
- CS_PARTIAL_FLUSH = 0x7

  WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
  WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

- COMPUTE_SHADER_EN = 1
- FORCE_START_AT_000 = 1 << 2
- CS_W32_EN = 1 << 15
+ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)

- class HWPM4Queue:
- def __init__(self): self.q = []
- def ptr(self) -> int: return len(self.q)
+ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
+ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
+ def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)

- def hdp_flush(self):
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
- amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | \
- amd_gpu.WAIT_REG_MEM_ENGINE(0), regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
-
- def invalidate_cache(self):
- # overkill?
- addr=0x0
- sz=(1 << 64)-1
- gli=1
- glv=1
- glk=1
- gl1=1
- gl2=1
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
- sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
+ class AMDCompiler(Compiler):
+ def __init__(self, arch:str):
+ self.arch = arch
+ super().__init__(f"compile_hip_{self.arch}")
+ def compile(self, src:str) -> bytes:
+ try: return compile_hip(src, self.arch)
+ except RuntimeError as e: raise CompileError(e) from e
+
+ class HWQueue:
+ def __init__(self): self.q, self.cmd_offsets = [], [0]
+ def _mark_command_end(self):
+ self.cmd_offsets.append(len(self.q))
  return self
+ def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
+ def __len__(self): return len(self.cmd_offsets) - 1

- def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
- self.hdp_flush()
- self.invalidate_cache()
+ class HWPM4Queue(HWQueue):
+ def __init__(self):
+ self.binded_device, self.ptr_to_dispatch_packet = None, {}
+ super().__init__()

- code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
- assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
- assert code.workitem_private_segment_byte_size == 0
- assert code.max_scratch_backing_memory_byte_size == 0
- assert code.kernel_code_prefetch_byte_size == 0
- rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2
-
- # this is required
- lds_size = ((prg.group_segment_size + 511) // 512) & 0x1FF
- assert lds_size <= 0x80 # larger numbers stall the GPU
-
- prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
- (prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
+ def __del__(self):
+ if self.binded_device is not None:
+ self.binded_device.synchronize()
+ self.binded_device._gpu_free(self.hw_page)
+
+ def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
+ amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
+ amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
+ amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
+ amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
+ amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
+
+ def memory_barrier(self):
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
+ amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
+ nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
+ self._invalidate_cache()
+ return self._mark_command_end()
+
+ def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
+ self._invalidate_cache()
+
+ user_data = [*data64_le(kernargs)]
+ if hasattr(prg, 'dispatch_packet_offset'):
+ dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
+ dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+ dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+ dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
+ user_data = [*data64_le(dp_addr)] + user_data
+ self.ptr_to_dispatch_packet[len(self)] = dp
+
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
+ *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

  if signal is not None: self.signal(signal, signal_value)
- return self
+ return self._mark_command_end()

- def update_exec(self, cmd_ptr, global_size, local_size):
+ def update_exec(self, cmd_idx, global_size, local_size):
  # Patch the exec cmd with new launch dims
- assert self.q[cmd_ptr + 67] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),"The pointer does not point to a packet of this type"
- self.q[cmd_ptr + 59 : cmd_ptr + 62] = local_size
- self.q[cmd_ptr + 68 : cmd_ptr + 71] = global_size
+ assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
+ self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
+ self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+
+ if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
+ dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+ dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]

  def wait(self, signal:hsa.amd_signal_t, value=0):
  addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
  amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
- return self
+ amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
+ return self._mark_command_end()

- def timestamp(self, addr):
+ def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+ cache_flush_flags = 0
+
+ if cache_flush:
+ cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+ # event_index__mec_release_mem__end_of_pipe = 5
+ # event_index__mec_release_mem__shader_done = 6
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- # event_index__mec_release_mem__end_of_pipe = 5
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5),
- # * 3 - send 64bit GPU counter value
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(3) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(0) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- addr&0xFFFFFFFF, addr>>32, 0, 0, 0]
- return self
+ amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+ amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+ *data64_le(address), *data64_le(value), cst]
+
+ def timestamp(self, sig):
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
+ address=ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+ return self._mark_command_end()

  def signal(self, signal:hsa.amd_signal_t, value=0):
  # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
- addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- # event_index__mec_release_mem__end_of_pipe = 5
- # event_index__mec_release_mem__shader_done = 6
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- addr&0xFFFFFFFF, addr>>32,
- value&0xFFFFFFFF, value>>32, 0]
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
+ value=value, cache_flush=True)
  if signal.event_mailbox_ptr != 0:
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- # event_index__mec_release_mem__end_of_pipe = 5
- # event_index__mec_release_mem__shader_done = 6
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
- signal.event_id&0xFFFFFFFF, signal.event_id>>32,
- signal.event_id]
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
+ value=signal.event_id, cst=signal.event_id, cache_flush=True)
+ return self._mark_command_end()
+
+ def update_wait(self, cmd_idx, signal=None, value=None):
+ assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
+ if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+ if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
  return self

- def submit(self, device:AMDDevice):
- wptr = device.pm4_write_pointer[0]
- pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
- for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
- device.pm4_write_pointer[0] = wptr + len(self.q)
- device.pm4_doorbell[0] = wptr + len(self.q)
+ def update_signal(self, cmd_idx, signal=None, value=None):
+ assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
+ if signal is not None:
+ self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+ if signal.event_mailbox_ptr != 0:
+ self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
+ if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
  return self

- # prebuilt sdma packets
- sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
- sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
- GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
- GCR_CONTROL_GL2_RANGE=0)
- sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
- GCR_CONTROL_GL2_RANGE=0)
+ def bind(self, device: AMDDevice):
+ self.binded_device = device
+ self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+ for i, value in enumerate(self.q): hw_view[i] = value

- SDMA_MAX_COPY_SIZE = 0x400000
- class HWCopyQueue:
- def __init__(self): self.q = []
-
- def submit(self, device:AMDDevice):
- read_ptr = device.sdma_read_pointer[0]
- if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
- for cmd in self.q:
- if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
- ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
- device.sdma_doorbell_value += fill
- ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
- device.sdma_doorbell_value += cmdsz
- device.sdma_write_pointer[0] = device.sdma_doorbell_value
- device.sdma_doorbell[0] = device.sdma_doorbell_value
- return self
+ self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
+ len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
+ self.q = hw_view # type: ignore
+
+ def submit(self, device: AMDDevice):
+ cmds = self.indirect_cmd if device == self.binded_device else self.q

- def timestamp(self, addr):
- self.q.append(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr))
+ for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
+
+ device.compute_queue.put_value += len(cmds)
+ device.compute_queue.write_ptr[0] = device.compute_queue.put_value
+ device.compute_queue.doorbell[0] = device.compute_queue.put_value
  return self

+ SDMA_MAX_COPY_SIZE = 0x400000
+ class HWCopyQueue(HWQueue):
+ def __init__(self):
+ self.internal_cmd_sizes = []
+ super().__init__()
+
+ def _q(self, arr):
+ self.q += arr
+ self.internal_cmd_sizes.append(len(arr))
+
  def copy(self, dest, src, copy_size):
- self.q.append(sdma_flush_hdp_pkt) # TODO: do I need this?
- self.q.append(sdma_cache_inv)
+ # Invalidate cache inv
+ self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
+ amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
+
  copied = 0
- copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
- for _ in range(copies_commands):
+ copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+ for _ in range(copy_commands):
  step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
- self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
- count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
+
+ self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+ amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
+
  copied += step_copy_size
- self.q.append(sdma_cache_wb)
- return self

- def signal(self, signal:hsa.amd_signal_t, value=0):
- self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
+ # Invalidate cache wb
+ self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
+
+ return self._mark_command_end()
+
+ def signal(self, signal: hsa.amd_signal_t, value=0):
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
+
  if signal.event_mailbox_ptr != 0:
- self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
- self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
+ self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
+
+ return self._mark_command_end()
+
+ def wait(self, signal: hsa.amd_signal_t, value=0):
+ self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+ amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
+ amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
+
+ return self._mark_command_end()
+
+ def update_wait(self, cmd_idx, signal=None, value=None):
+ assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
+ if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+ if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
  return self

- def wait(self, signal:hsa.amd_signal_t, value=0):
- self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=WAIT_REG_MEM_FUNCTION_GEQ,
- addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
- value=value, mask=0xffffffff, interval=0x04, retry_count=0xfff))
+ def timestamp(self, sig: hsa.amd_signal_t):
+ self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+ *data64_le(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
+ return self._mark_command_end()
+
+ def submit(self, device: AMDDevice):
+ if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
+
+ tail_blit_dword = 0
+ for cmdsz in self.internal_cmd_sizes:
+ if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
+ tail_blit_dword += cmdsz
+
+ start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
+ device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
+ device.sdma_queue.put_value += tail_blit_dword * 4
+
+ if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
+ zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
+ ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
+ device.sdma_queue.put_value += zero_fill
+
+ device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
+ device.sdma_queue.put_value += rem_packet_cnt * 4
+
+ device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
+ device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
  return self

  SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
@@ -304,119 +321,92 @@ class AMDProgram:
  if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]

  entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
- self.handle = self.lib_gpu.va_addr + entry_point
  self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
  self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
  self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
+ self.kernargs_alloc_size = self.kernargs_segment_size
  self.kernargs_offset = 0
- assert self.private_segment_size <= self.device.max_private_segment_size, \
- f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"

- HWPM4Queue().invalidate_cache().submit(self.device)
+ lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
+ if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
+ if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
+
+ code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
+ self.rsrc1 = code.compute_pgm_rsrc1
+ self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
+
+ if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
+ # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
+ self.dispatch_packet_offset = self.kernargs_alloc_size
+ self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
+
+ assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
+ assert code.workitem_private_segment_byte_size == 0
+ assert code.max_scratch_backing_memory_byte_size == 0
+ assert code.kernel_code_prefetch_byte_size == 0
+
+ self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+
+ HWPM4Queue().memory_barrier().submit(self.device)

  # NOTE: no programs are ever freed
  def __del__(self):
  if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
- if self.device.kernargs_ptr + self.kernargs_segment_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
+ if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
  self.device.kernargs_ptr = self.device.kernargs.va_addr
- assert self.device.kernargs_ptr + self.kernargs_segment_size <= (self.device.kernargs.va_addr + self.device.kernargs.size), "kernargs overrun"
+
  if not hasattr(self, "args_struct_t"):
  self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
  if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
- raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+ raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+
  args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
  for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
  for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

+ sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
+
  q = HWPM4Queue()
- q.wait(self.device.timeline_signal, self.device.timeline_value - 1)
- if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+ q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+ if wait or PROFILE: q.timestamp(sig_st)
  q.exec(self, self.device.kernargs_ptr, global_size, local_size)
- if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)
+ if wait or PROFILE: q.timestamp(sig_en)
  q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
  self.device.timeline_value += 1
- self.device.kernargs_ptr += self.kernargs_segment_size
+ self.device.kernargs_ptr += self.kernargs_alloc_size

+ if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
  if wait:
  self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
- return (self.device.timeline_signal.end_ts - self.device.timeline_signal.start_ts) / 1e8
-
- class AMDAllocator(LRUAllocator):
- def __init__(self, device:AMDDevice):
- self.device = device
- # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
- self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(16)]
- self.b_timeline = [0] * len(self.b)
- self.b_next = 0
- super().__init__()
+ return (sig_en.start_ts - sig_st.start_ts) / 1e8
+
+ class AMDAllocator(HCQCompatAllocator):
+ def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)

  def _alloc(self, size:int, options:BufferOptions):
  try:
  if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
- else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+ return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
  except OSError as e:
  if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
- else: raise
-
- def _free(self, gpumem, options:BufferOptions): self.device._gpu_free(gpumem)
- #def as_buffer(self, src:Any) -> memoryview:
- # self.device.synchronize()
- # return to_mv(src.va_addr, src.size)
-
- #def copy_from_fd(self, dest, fd, offset, size):
- # fo = io.FileIO(fd, "a+b", closefd=False)
- # fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
- # copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
- # for i in range(0, size+minor_offset, self.b[0].size):
- # local_size = min(self.b[0].size, total_copy_size-i)
- # copy_size = min(local_size-minor_offset, size-copied_in)
- # if copy_size == 0: break
-
- # fo.readinto(to_mv(self.b[1].va_addr, local_size))
- # if i != 0: self.device._wait_signal(self.device.signal_sdma)
- # self.b = self.b[::-1]
- # self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
-
- # copied_in += copy_size
- # minor_offset = 0 # only on the first
- # self.device._wait_signal(self.device.signal_sdma)
-
- def copyin(self, dest, src: memoryview):
- for i in range(0, src.nbytes, self.b[0].size):
- self.b_next = (self.b_next + 1) % len(self.b)
- AMDDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
- ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
- HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
- .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
- .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
- self.b_timeline[self.b_next] = self.device.timeline_value
- self.device.timeline_value += 1
-
- def copyout(self, dest:memoryview, src):
- self.device.synchronize()
- for i in range(0, dest.nbytes, self.b[0].size):
- HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
- .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
- .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
- AMDDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
- self.device.timeline_value += 1
-
- ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-
- def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
- src_dev._gpu_map(dest)
- HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
- .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
- .copy(dest.va_addr, src.va_addr, sz) \
- .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
- HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
- src_dev.timeline_value += 1
+ raise
+
+ def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class AMDDevice(Compiled):
+
+ @dataclass
+ class AMDQueueDesc:
+ ring: memoryview
+ read_ptr: memoryview
+ write_ptr: memoryview
+ doorbell: memoryview
+ put_value: int = 0
+
+ class AMDDevice(HCQCompatCompiled):
  kfd:int = -1
  event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None
@@ -439,7 +429,7 @@ class AMDDevice(Compiled):
  else:
  buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
  assert addr != 0xffffffffffffffff
- mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+ mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
  buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
  assert addr == buf == mem.va_addr
@@ -454,13 +444,19 @@ class AMDDevice(Compiled):
  libc.munmap(mem.va_addr, mem.size)
  kio.free_memory_of_gpu(self.kfd, handle=mem.handle)

+ @classmethod
+ def _read_signal(self, sig): return sig.value
+
+ @classmethod
+ def _read_timestamp(self, sig): return sig.start_ts
+
  @classmethod
  def _set_signal(self, sig, value): sig.value = value

  @classmethod
- def _get_signal(self, value=0, sync_event=None) -> hsa.amd_signal_t:
+ def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
  self._set_signal(ret := self.signals_pool.pop(), value)
- if sync_event is not None:
+ if (sync_event:=kwargs.get('sync_event')) is not None:
  ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
  ret.event_id = sync_event.event_id
  else: ret.event_mailbox_ptr = ret.event_id = 0
@@ -471,10 +467,11 @@ class AMDDevice(Compiled):
  assert signal.event_id != 0, "can't wait on this signal"
  evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)

+ # Wait active for 5s, then going to sleep.
  start_time = time.time() * 1000
- while (time.time() * 1000 - start_time) < timeout:
+ while (time_spent:=time.time() * 1000 - start_time) < timeout:
  if signal.value >= value: return
- kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=100)
+ if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
  raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")

  def __init__(self, device:str=""):
@@ -500,65 +497,54 @@ class AMDDevice(Compiled):
  self._gpu_map(AMDDevice.event_page)
  sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)

- self.timeline_value: int = 1
- self.timeline_signal = AMDDevice._get_signal(sync_event=sync_event)
- self._shadow_timeline_signal = AMDDevice._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
+ self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()

  self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  self.kernargs_ptr = self.kernargs.va_addr

- # scratch setup
+ # Scratch setup
  max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
  max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
  self.max_private_segment_size = 4096
  wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
  self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
  self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
-
- # SDMA Queue
- self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
- queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
- write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
-
- # doorbell page
- self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
- self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
-
- self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
- self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
- self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
- self.sdma_doorbell_value = 0
-
- # PM4 Queue
- self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
- self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
- self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
- queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
- eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
- # TODO: are these needed? (i know eop is)
- ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
- ctl_stack_size = 0xa000,
- write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)
-
- self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
- self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
- self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
-
- from tinygrad.runtime.graph.hcq import HCQGraph
- super().__init__(device, AMDAllocator(self), AMDRenderer(), HSACompiler(self.arch),
- functools.partial(AMDProgram, self),
- functools.partial(HCQGraph, AMDDevice, HWPM4Queue, HWCopyQueue))
+ engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
+ self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
+
+ self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
+ self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+
+ super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), HWPM4Queue, HWCopyQueue,
+ timeline_signals=[self._get_signal(sync_event=sync_event), self._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))])
+
+ def _gpu2cpu_time(self, gpu_time, is_copy):
+ if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
+ return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+
+ def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
+ gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
+ eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
+ queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
+ queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+ eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
+ ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
+ write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
+
+ if not hasattr(self, 'doorbells'):
+ self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+ self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+
+ return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
+ read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
+ doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))

  def synchronize(self):
  AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)

  # reset kernargs
  self.kernargs_ptr = self.kernargs.va_addr
- if self.timeline_value > (1 << 31):
- self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
- self.timeline_signal.value, self.timeline_value = 0, 1
- cast(AMDAllocator, self.allocator).b_timeline = [0] * len(cast(AMDAllocator, self.allocator).b)
+ if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
+ if PROFILE: self._prof_process_events()