tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. tinygrad/codegen/kernel.py +248 -115
  2. tinygrad/codegen/lowerer.py +215 -0
  3. tinygrad/codegen/transcendental.py +310 -0
  4. tinygrad/codegen/uopgraph.py +622 -0
  5. tinygrad/codegen/uops.py +235 -393
  6. tinygrad/device.py +428 -69
  7. tinygrad/dtype.py +18 -4
  8. tinygrad/engine/graph.py +19 -32
  9. tinygrad/engine/jit.py +148 -70
  10. tinygrad/engine/realize.py +127 -51
  11. tinygrad/engine/schedule.py +259 -216
  12. tinygrad/engine/search.py +29 -22
  13. tinygrad/function.py +9 -0
  14. tinygrad/helpers.py +87 -49
  15. tinygrad/lazy.py +34 -35
  16. tinygrad/multi.py +41 -36
  17. tinygrad/nn/__init__.py +39 -22
  18. tinygrad/nn/state.py +3 -3
  19. tinygrad/ops.py +63 -62
  20. tinygrad/renderer/__init__.py +43 -21
  21. tinygrad/renderer/assembly.py +104 -106
  22. tinygrad/renderer/cstyle.py +87 -60
  23. tinygrad/renderer/llvmir.py +21 -30
  24. tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
  25. tinygrad/runtime/autogen/cuda.py +6 -162
  26. tinygrad/runtime/autogen/kfd.py +32 -0
  27. tinygrad/runtime/autogen/libc.py +4260 -0
  28. tinygrad/runtime/autogen/nvrtc.py +579 -0
  29. tinygrad/runtime/graph/clang.py +2 -2
  30. tinygrad/runtime/graph/cuda.py +8 -11
  31. tinygrad/runtime/graph/hcq.py +120 -107
  32. tinygrad/runtime/graph/metal.py +18 -15
  33. tinygrad/runtime/ops_amd.py +197 -305
  34. tinygrad/runtime/ops_clang.py +2 -2
  35. tinygrad/runtime/ops_cuda.py +36 -94
  36. tinygrad/runtime/ops_disk.py +3 -7
  37. tinygrad/runtime/ops_gpu.py +4 -2
  38. tinygrad/runtime/ops_hip.py +70 -0
  39. tinygrad/runtime/ops_metal.py +38 -27
  40. tinygrad/runtime/ops_nv.py +283 -363
  41. tinygrad/runtime/ops_python.py +26 -30
  42. tinygrad/runtime/support/compiler_cuda.py +78 -0
  43. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
  44. tinygrad/runtime/support/elf.py +38 -0
  45. tinygrad/shape/shapetracker.py +5 -14
  46. tinygrad/shape/symbolic.py +4 -8
  47. tinygrad/shape/view.py +34 -22
  48. tinygrad/tensor.py +399 -97
  49. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
  50. tinygrad-0.9.2.dist-info/RECORD +70 -0
  51. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
  52. tinygrad/codegen/linearizer.py +0 -528
  53. tinygrad-0.9.1.dist-info/RECORD +0 -63
  54. /tinygrad/runtime/{driver → support}/__init__.py +0 -0
  55. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
  56. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
@@ -1,60 +1,30 @@
  from __future__ import annotations
- from typing import Tuple, List, Any
- import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
+ from typing import Tuple, List, Any, cast
+ import os, fcntl, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
  from dataclasses import dataclass
- from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
- from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
+ from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
+ HCQSignal, HCQProgram, BufferOptions
+ from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
  from tinygrad.renderer.cstyle import AMDRenderer
- from tinygrad.runtime.driver.hip_comgr import compile_hip
- import tinygrad.runtime.autogen.kfd as kfd
- import tinygrad.runtime.autogen.hsa as hsa
- import tinygrad.runtime.autogen.amd_gpu as amd_gpu
- if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
- libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
- libc.munmap.restype = ctypes.c_int
-
- if getenv("MOCKGPU"):
- import extra.mockgpu.mockgpu # noqa: F401
- libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
- libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+ from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+ from tinygrad.runtime.support.compiler_hip import AMDCompiler, disasm
+ from tinygrad.runtime.support.elf import elf_loader
+ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
+ if getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

  def is_usable_gpu(gpu_id):
- try:
- with gpu_id.open() as f:
- return int(f.read()) != 0
- except OSError:
- return False
-
- def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
- made = made_struct or user_struct(**kwargs)
- ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
+ with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
+ return False
+
+ def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
+ ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made := user_struct(**kwargs))<<16) | (ord('K')<<8) | nr, made)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  return made

- def ioctls_from_header():
- #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
- #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
- #matches = re.findall(pattern, hdr, re.MULTILINE)
- # get this from python instead
- hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
- pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
- matches = re.findall(pattern, hdrpy, re.MULTILINE)
- idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
- fxns = {name.replace("AMDKFD_IOC_", "").lower():
- functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
- for name, idir, nr, sname in matches}
- return type("KIO", (object, ), fxns)
- kio = ioctls_from_header()
-
- SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
- SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
-
- regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
- regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+ kio:Any = type("KIO", (object,), {name[11:].lower(): functools.partial(kfd_ioctl, {"IOW": 1, "IOR": 2, "IOWR": 3}[p[0]], p[1], p[2])
+ for name,p in kfd.__dict__.items() if name.startswith("AMDKFD_IOC_")})
+
+ regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107

  # VGT_EVENT_TYPE in navi10_enum.h
  CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
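
Aside on the hunk above: the new one-liner builds the kio namespace directly from the AMDKFD_IOC_* tuples exposed by the regenerated autogen/kfd.py instead of regex-scraping it, but the ioctl request number is packed the same way in both versions. A minimal standalone sketch of that packing (the struct below is an illustrative stand-in; real calls pass the matching kfd.struct_* ctypes type to fcntl.ioctl):

import ctypes

class ExampleArgs(ctypes.Structure):  # stand-in for a kfd.struct_* ioctl argument struct
    _fields_ = [("gpu_id", ctypes.c_uint32), ("pad", ctypes.c_uint32)]

def amdkfd_request(idir: int, nr: int, args: ctypes.Structure) -> int:
    # bits 30-31: direction (IOW=1, IOR=2, IOWR=3); bits 16-29: argument struct size;
    # bits 8-15: the 'K' magic byte; bits 0-7: command number
    return (idir << 30) | (ctypes.sizeof(args) << 16) | (ord('K') << 8) | nr

req = amdkfd_request(3, 0x01, ExampleArgs(gpu_id=1))  # this value would be passed as the request to fcntl.ioctl(fd, req, args)
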
@@ -66,27 +36,35 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)

  def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
  def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
- def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)
-
- class AMDCompiler(Compiler):
- def __init__(self, arch:str):
- self.arch = arch
- super().__init__(f"compile_hip_{self.arch}")
- def compile(self, src:str) -> bytes:
- try: return compile_hip(src, self.arch)
- except RuntimeError as e: raise CompileError(e) from e
-
- class HWQueue:
- def __init__(self): self.q, self.cmd_offsets = [], [0]
- def _mark_command_end(self):
- self.cmd_offsets.append(len(self.q))
- return self
- def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
- def __len__(self): return len(self.cmd_offsets) - 1
-
- class HWPM4Queue(HWQueue):
+
+ class AMDSignal(HCQSignal):
+ def __init__(self, value=0, alloc_event=False):
+ self._signal = AMDDevice.signals_pool.pop()
+ self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
+ if alloc_event:
+ sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+ self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
+ self._event_id = sync_event.event_id
+ self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event_id)
+ else: self._event_mailbox_ptr = self._event_id = 0
+ super().__init__(value)
+ def __del__(self): AMDDevice.signals_pool.append(self._signal)
+ def _get_value(self) -> int: return self._signal[0]
+ def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
+ def _set_value(self, new_value:int): self._signal[0] = new_value
+ def wait(self, value:int, timeout:int=10000):
+ start_time = time.time() * 1000
+ while (time_spent:=time.time() * 1000 - start_time) < timeout:
+ if self._signal[0] >= value: return
+
+ # Wait active for 5s, then going to sleep.
+ if time_spent > 5000 and self._event_id != 0:
+ kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
+ raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
+
+ class AMDComputeQueue(HWComputeQueue):
  def __init__(self):
- self.binded_device, self.ptr_to_dispatch_packet = None, {}
+ self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
  super().__init__()

  def __del__(self):
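
The data64_le helper removed in the hunk above (now imported from tinygrad.helpers) is used by every packet builder in this file: PM4 and SDMA packets carry 64-bit addresses and values as two consecutive 32-bit dwords, low word first. A quick worked example of the same arithmetic:

def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)

lo, hi = data64_le(0x1122334455667788)
assert (lo, hi) == (0x55667788, 0x11223344)  # low dword first, then high dword
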
@@ -94,7 +72,7 @@ class HWPM4Queue(HWQueue):
  self.binded_device.synchronize()
  self.binded_device._gpu_free(self.hw_page)

- def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+ def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
@@ -102,24 +80,39 @@ class HWPM4Queue(HWQueue):
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]

- def memory_barrier(self):
+ def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+ cache_flush_flags = 0
+
+ if cache_flush:
+ cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+ # event_index__mec_release_mem__end_of_pipe = 5
+ # event_index__mec_release_mem__shader_done = 6
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+ amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+ amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+ *data64_le(address), *data64_le(value), cst]
+
+ def _memory_barrier(self):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
  amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
  nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
- self._invalidate_cache()
- return self._mark_command_end()
+ self._acquire_mem()

- def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
- self._invalidate_cache()
+ def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
+ self._acquire_mem(gli=0, gl2=0)

- user_data = [*data64_le(kernargs)]
- if hasattr(prg, 'dispatch_packet_offset'):
- dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
+ user_regs, cmd_idx = [], len(self) - 1
+ if prg.enable_dispatch_ptr:
+ dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
  dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
  dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
- dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
- user_data = [*data64_le(dp_addr)] + user_data
- self.ptr_to_dispatch_packet[len(self)] = dp
+ dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
+ user_regs += [*data64_le(dp_addr)]
+ self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
+ user_regs += [*data64_le(args_state.ptr)]

  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
  *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
@@ -129,79 +122,54 @@ class HWPM4Queue(HWQueue):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
+
+ self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
+
+ self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

- if signal is not None: self.signal(signal, signal_value)
- return self._mark_command_end()
+ def _update_exec(self, cmd_idx, global_size, local_size):
+ if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
+ if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)

- def update_exec(self, cmd_idx, global_size, local_size):
- # Patch the exec cmd with new launch dims
- assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
- self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
- self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+ if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
+ if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+ if global_size is not None:
+ dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]

- if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
- dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
- dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
-
- def wait(self, signal:hsa.amd_signal_t, value=0):
- addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+ def _wait(self, signal, value=0):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
  amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
- return self._mark_command_end()
+ amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]

- def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
- cache_flush_flags = 0
+ def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)

- if cache_flush:
- cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+ def _signal(self, signal, value=0):
+ # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
+ if signal._event_mailbox_ptr != 0:
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
+ value=signal._event_id, cst=signal._event_id, cache_flush=False)

- # event_index__mec_release_mem__end_of_pipe = 5
- # event_index__mec_release_mem__shader_done = 6
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- *data64_le(address), *data64_le(value), cst]
+ def _update_wait(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=4, data=[value])

- def timestamp(self, sig):
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
- address=ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)
- return self._mark_command_end()
+ def _update_signal(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))

- def signal(self, signal:hsa.amd_signal_t, value=0):
- # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
- value=value, cache_flush=True)
- if signal.event_mailbox_ptr != 0:
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
- value=signal.event_id, cst=signal.event_id, cache_flush=True)
- return self._mark_command_end()
-
- def update_wait(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
- if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
- return self
-
- def update_signal(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
- if signal is not None:
- self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if signal.event_mailbox_ptr != 0:
- self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
- if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
- return self
-
- def bind(self, device: AMDDevice):
+ # Check if the signal command has mailptr part
+ if signal is not None and self.cmds_len[cmd_idx] > 8:
+ self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event_id), signal._event_id])
+
+ def bind(self, device):
  self.binded_device = device
- self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ self.hw_page = cast(AMDDevice, device)._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
  for i, value in enumerate(self.q): hw_view[i] = value

@@ -209,7 +177,7 @@ class HWPM4Queue(HWQueue):
  len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
  self.q = hw_view # type: ignore

- def submit(self, device: AMDDevice):
+ def _submit(self, device):
  cmds = self.indirect_cmd if device == self.binded_device else self.q

  for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
@@ -217,25 +185,20 @@ class HWPM4Queue(HWQueue):
  device.compute_queue.put_value += len(cmds)
  device.compute_queue.write_ptr[0] = device.compute_queue.put_value
  device.compute_queue.doorbell[0] = device.compute_queue.put_value
- return self

  SDMA_MAX_COPY_SIZE = 0x400000
- class HWCopyQueue(HWQueue):
+ class AMDCopyQueue(HWCopyQueue):
  def __init__(self):
- self.internal_cmd_sizes = []
+ self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
  super().__init__()

  def _q(self, arr):
  self.q += arr
  self.internal_cmd_sizes.append(len(arr))

- def copy(self, dest, src, copy_size):
- # Invalidate cache inv
- self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
- amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
-
- copied = 0
- copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+ def _copy(self, dest, src, copy_size):
+ copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+ self.copy_cmds_per_copy[len(self) - 1] = copy_commands
  for _ in range(copy_commands):
  step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)

@@ -244,39 +207,33 @@ class HWCopyQueue(HWQueue):

  copied += step_copy_size

- # Invalidate cache wb
- self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
-
- return self._mark_command_end()
+ def _update_copy(self, cmd_idx, dest=None, src=None):
+ for i in range(self.copy_cmds_per_copy[cmd_idx]):
+ if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
+ if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])

- def signal(self, signal: hsa.amd_signal_t, value=0):
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
+ def _signal(self, signal, value=0):
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])

- if signal.event_mailbox_ptr != 0:
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
- self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
+ if signal._event_mailbox_ptr != 0:
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event_id])
+ self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event_id)])

- return self._mark_command_end()
-
- def wait(self, signal: hsa.amd_signal_t, value=0):
+ def _wait(self, signal, value=0):
  self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
+ amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
  amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])

- return self._mark_command_end()
-
- def update_wait(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
- if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
- return self
+ def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
+ def _update_wait(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=3, data=[value])

- def timestamp(self, sig: hsa.amd_signal_t):
+ def _timestamp(self, signal):
  self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
- *data64_le(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
- return self._mark_command_end()
+ *data64_le(signal._timestamp_addr)])

- def submit(self, device: AMDDevice):
+ def _submit(self, device):
  if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

  tail_blit_dword = 0
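
The _copy path above chunks each transfer into SDMA_MAX_COPY_SIZE (0x400000-byte) pieces and records how many packets the copy produced so _update_copy can later re-patch every chunk's source and destination. A minimal sketch of just the chunking arithmetic, under the same constant:

SDMA_MAX_COPY_SIZE = 0x400000  # 4 MiB per SDMA copy packet

def copy_chunks(copy_size):
    # ceiling division gives the packet count; only the last chunk may be short
    copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
    copied, chunks = 0, []
    for _ in range(copy_commands):
        step = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
        chunks.append(step)
        copied += step
    return chunks

assert copy_chunks(10 * 1024 * 1024) == [0x400000, 0x400000, 0x200000]  # 4 MiB + 4 MiB + 2 MiB
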
@@ -298,104 +255,68 @@ class HWCopyQueue(HWQueue):

  device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
  device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
- return self

- SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
- class AMDProgram:
+ class AMDArgsState(HCQArgsState):
+ def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+ super().__init__(ptr, prg, bufs, vals=vals)
+
+ self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
+ self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
+
+ self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
+ self.vals[:] = array.array('I', vals)
+
+ def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+ def update_var(self, index:int, val:int): self.vals[index] = val
+
+ class AMDProgram(HCQProgram):
  def __init__(self, device:AMDDevice, name:str, lib:bytes):
  # TODO; this API needs the type signature of the function and global_size/local_size
  self.device, self.name, self.lib = device, name, lib

- if DEBUG >= 6:
- asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
- print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
-
- _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
- sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
-
- lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
- self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
- lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+ if DEBUG >= 6: print(disasm(lib))

- for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
- if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
+ image, sections, _ = elf_loader(self.lib)
+ self.lib_gpu = self.device._gpu_alloc(round_up(image.nbytes, 0x1000), kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
+ ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

- entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
- self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
- self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
- self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
- self.kernargs_alloc_size = self.kernargs_segment_size
- self.kernargs_offset = 0
+ entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
+ self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
+ self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
+ self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]

  lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
  if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
  if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")

  code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
- self.rsrc1 = code.compute_pgm_rsrc1
- self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-
- if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
- # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
- self.dispatch_packet_offset = self.kernargs_alloc_size
- self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
-
  assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
- assert code.workitem_private_segment_byte_size == 0
- assert code.max_scratch_backing_memory_byte_size == 0
- assert code.kernel_code_prefetch_byte_size == 0

+ self.rsrc1 = code.compute_pgm_rsrc1
+ self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
  self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset

- HWPM4Queue().memory_barrier().submit(self.device)
+ # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
+ # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
+ self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+ additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
+
+ super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)

- # NOTE: no programs are ever freed
  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
-
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
- if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
- self.device.kernargs_ptr = self.device.kernargs.va_addr
-
- if not hasattr(self, "args_struct_t"):
- self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
- [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
- if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
- raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
-
- args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
- for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
- for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
-
- sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
- q = HWPM4Queue()
- q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
- if wait or PROFILE: q.timestamp(sig_st)
- q.exec(self, self.device.kernargs_ptr, global_size, local_size)
- if wait or PROFILE: q.timestamp(sig_en)
- q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
- self.device.timeline_value += 1
- self.device.kernargs_ptr += self.kernargs_alloc_size
-
- if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
- if wait:
- self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
- return (sig_en.start_ts - sig_st.start_ts) / 1e8
-
- class AMDAllocator(HCQCompatAllocator):
+ if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
+
+ class AMDAllocator(HCQAllocator):
  def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)

- def _alloc(self, size:int, options:BufferOptions):
- try:
- if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
- return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
- except OSError as e:
- if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
- raise
+ def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+ if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+ return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)

  def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)

+ def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+
  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400

  @dataclass
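
The new AMDArgsState in the hunk above writes kernel arguments straight into the kernarg buffer: one native-endian 8-byte pointer per buffer, followed by one 4-byte integer per value. A self-contained sketch of the same packing into a plain bytes object (the addresses below are made up for illustration):

import array

def pack_kernargs(buf_addrs, vals):
    # 8-byte pointers first, then 4-byte ints, mirroring the AMDArgsState layout
    return array.array('Q', buf_addrs).tobytes() + array.array('I', vals).tobytes()

payload = pack_kernargs([0x7F00_0000_1000, 0x7F00_0000_2000], [42])
assert len(payload) == 2 * 8 + 1 * 4
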
@@ -406,11 +327,11 @@ class AMDQueueDesc:
  doorbell: memoryview
  put_value: int = 0

- class AMDDevice(HCQCompatCompiled):
+ class AMDDevice(HCQCompiled):
  kfd:int = -1
  event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None
- signals_pool:List[hsa.amd_signal_t] = []
+ signals_pool:List[memoryview] = []
  gpus:List[pathlib.Path] = []

  def _gpu_map(self, mem):
@@ -429,7 +350,14 @@ class AMDDevice(HCQCompatCompiled):
  else:
  buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
  assert addr != 0xffffffffffffffff
- mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+
+ try: mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+ except OSError as e:
+ if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
+ raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
+ if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
+ raise
+
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
  buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
  assert addr == buf == mem.va_addr
@@ -444,63 +372,34 @@ class AMDDevice(HCQCompatCompiled):
  libc.munmap(mem.va_addr, mem.size)
  kio.free_memory_of_gpu(self.kfd, handle=mem.handle)

- @classmethod
- def _read_signal(self, sig): return sig.value
-
- @classmethod
- def _read_timestamp(self, sig): return sig.start_ts
-
- @classmethod
- def _set_signal(self, sig, value): sig.value = value
-
- @classmethod
- def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
- self._set_signal(ret := self.signals_pool.pop(), value)
- if (sync_event:=kwargs.get('sync_event')) is not None:
- ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
- ret.event_id = sync_event.event_id
- else: ret.event_mailbox_ptr = ret.event_id = 0
- return ret
-
- @classmethod
- def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
- assert signal.event_id != 0, "can't wait on this signal"
- evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
-
- # Wait active for 5s, then going to sleep.
- start_time = time.time() * 1000
- while (time_spent:=time.time() * 1000 - start_time) < timeout:
- if signal.value >= value: return
- if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
- raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
-
  def __init__(self, device:str=""):
  if AMDDevice.kfd == -1:
  AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
- AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+ gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+ gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+ AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
+
  self.device_id = int(device.split(":")[1]) if ":" in device else 0
+ if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
  with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
  with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
  self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
  target = int(self.properties['gfx_target_version'])
  self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
+ if target < 110000 or target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+
  kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)

  if AMDDevice.event_page is None:
- AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
- AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
- sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+ AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
+ kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
  else:
  self._gpu_map(AMDDevice.signals_page)
  self._gpu_map(AMDDevice.event_page)
- sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
-
- self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()
-
- self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
- self.kernargs_ptr = self.kernargs.va_addr

  # Scratch setup
  max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
@@ -515,12 +414,8 @@ class AMDDevice(HCQCompatCompiled):
  self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
  self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)

- super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), HWPM4Queue, HWCopyQueue,
- timeline_signals=[self._get_signal(sync_event=sync_event), self._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))])
-
- def _gpu2cpu_time(self, gpu_time, is_copy):
- if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
- return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+ super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
+ AMDSignal, AMDComputeQueue, AMDCopyQueue, (AMDSignal(alloc_event=True), AMDSignal(alloc_event=True)))

  def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
  gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
@@ -541,10 +436,7 @@ class AMDDevice(HCQCompatCompiled):
  read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
  doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))

- def synchronize(self):
- AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
-
- # reset kernargs
- self.kernargs_ptr = self.kernargs.va_addr
- if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
- if PROFILE: self._prof_process_events()
+ def invalidate_caches(self):
+ AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
+ self.timeline_value += 1
+ self.synchronize()