tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +11 -6
  2. tinygrad/codegen/kernel.py +308 -175
  3. tinygrad/codegen/linearize.py +95 -0
  4. tinygrad/codegen/lowerer.py +143 -0
  5. tinygrad/codegen/transcendental.py +257 -0
  6. tinygrad/codegen/uopgraph.py +506 -0
  7. tinygrad/device.py +72 -171
  8. tinygrad/dtype.py +122 -47
  9. tinygrad/engine/jit.py +184 -87
  10. tinygrad/{lazy.py → engine/lazy.py} +74 -66
  11. tinygrad/engine/memory.py +51 -0
  12. tinygrad/engine/realize.py +86 -61
  13. tinygrad/engine/schedule.py +366 -317
  14. tinygrad/engine/search.py +58 -47
  15. tinygrad/function.py +59 -58
  16. tinygrad/helpers.py +120 -102
  17. tinygrad/multi.py +82 -78
  18. tinygrad/nn/__init__.py +116 -67
  19. tinygrad/nn/datasets.py +12 -5
  20. tinygrad/nn/optim.py +1 -1
  21. tinygrad/nn/state.py +91 -6
  22. tinygrad/ops.py +1126 -143
  23. tinygrad/renderer/__init__.py +47 -23
  24. tinygrad/renderer/cstyle.py +338 -265
  25. tinygrad/renderer/llvmir.py +125 -143
  26. tinygrad/renderer/ptx.py +225 -0
  27. tinygrad/runtime/autogen/adreno.py +17904 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
  29. tinygrad/runtime/autogen/cuda.py +6 -162
  30. tinygrad/runtime/autogen/io_uring.py +97 -63
  31. tinygrad/runtime/autogen/kfd.py +60 -47
  32. tinygrad/runtime/autogen/kgsl.py +1386 -0
  33. tinygrad/runtime/autogen/libc.py +5462 -0
  34. tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  35. tinygrad/runtime/autogen/nvrtc.py +579 -0
  36. tinygrad/runtime/autogen/opencl.py +11 -11
  37. tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  38. tinygrad/runtime/graph/clang.py +3 -3
  39. tinygrad/runtime/graph/cuda.py +11 -15
  40. tinygrad/runtime/graph/hcq.py +120 -107
  41. tinygrad/runtime/graph/metal.py +71 -43
  42. tinygrad/runtime/ops_amd.py +244 -323
  43. tinygrad/runtime/ops_clang.py +12 -5
  44. tinygrad/runtime/ops_cloud.py +220 -0
  45. tinygrad/runtime/ops_cuda.py +42 -99
  46. tinygrad/runtime/ops_disk.py +25 -26
  47. tinygrad/runtime/ops_dsp.py +181 -0
  48. tinygrad/runtime/ops_gpu.py +29 -16
  49. tinygrad/runtime/ops_hip.py +68 -0
  50. tinygrad/runtime/ops_llvm.py +15 -10
  51. tinygrad/runtime/ops_metal.py +147 -64
  52. tinygrad/runtime/ops_nv.py +356 -397
  53. tinygrad/runtime/ops_python.py +78 -79
  54. tinygrad/runtime/ops_qcom.py +405 -0
  55. tinygrad/runtime/support/__init__.py +0 -0
  56. tinygrad/runtime/support/compiler_cuda.py +77 -0
  57. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
  58. tinygrad/runtime/support/elf.py +38 -0
  59. tinygrad/runtime/support/hcq.py +539 -0
  60. tinygrad/shape/shapetracker.py +40 -50
  61. tinygrad/shape/view.py +102 -63
  62. tinygrad/tensor.py +1109 -365
  63. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
  64. tinygrad-0.10.0.dist-info/RECORD +77 -0
  65. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
  66. tinygrad/codegen/linearizer.py +0 -528
  67. tinygrad/codegen/uops.py +0 -451
  68. tinygrad/engine/graph.py +0 -100
  69. tinygrad/renderer/assembly.py +0 -269
  70. tinygrad/shape/symbolic.py +0 -327
  71. tinygrad-0.9.1.dist-info/RECORD +0 -63
  72. /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
  73. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
  74. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
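Note on the renames above: several modules moved under tinygrad/engine/ and tinygrad/runtime/support/, which changes import paths for anything that reached into them directly. A minimal before/after sketch, taken from the ops_amd.py hunks below (the 0.9.1 line is one of the removed imports, the 0.10.0 line one of the added imports; other moved modules follow the same pattern):

    # 0.9.1
    from tinygrad.runtime.driver.hip_comgr import compile_hip
    # 0.10.0
    from tinygrad.runtime.support.compiler_hip import AMDCompiler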
tinygrad/runtime/ops_amd.py
@@ -1,60 +1,23 @@
  from __future__ import annotations
  from typing import Tuple, List, Any
- import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
+ import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
+ assert sys.platform != 'win32'
  from dataclasses import dataclass
- from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
- from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
+ from tinygrad.device import BufferOptions
+ from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
  from tinygrad.renderer.cstyle import AMDRenderer
- from tinygrad.runtime.driver.hip_comgr import compile_hip
- import tinygrad.runtime.autogen.kfd as kfd
- import tinygrad.runtime.autogen.hsa as hsa
- import tinygrad.runtime.autogen.amd_gpu as amd_gpu
- if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
- libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
- libc.munmap.restype = ctypes.c_int
-
- if getenv("MOCKGPU"):
- import extra.mockgpu.mockgpu # noqa: F401
- libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
- libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+ from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+ from tinygrad.runtime.support.compiler_hip import AMDCompiler
+ from tinygrad.runtime.support.elf import elf_loader
+ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
+ if getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

  def is_usable_gpu(gpu_id):
- try:
- with gpu_id.open() as f:
- return int(f.read()) != 0
- except OSError:
- return False
-
- def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
- made = made_struct or user_struct(**kwargs)
- ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
- if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
- return made
-
- def ioctls_from_header():
- #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
- #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
- #matches = re.findall(pattern, hdr, re.MULTILINE)
- # get this from python instead
- hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
- pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
- matches = re.findall(pattern, hdrpy, re.MULTILINE)
- idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
- fxns = {name.replace("AMDKFD_IOC_", "").lower():
- functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
- for name, idir, nr, sname in matches}
- return type("KIO", (object, ), fxns)
- kio = ioctls_from_header()
-
- SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
- SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
-
- regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
- regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+ with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
+ return False
+
+ regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107

  # VGT_EVENT_TYPE in navi10_enum.h
  CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
@@ -66,35 +29,41 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)

  def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
  def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
- def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)
-
- class AMDCompiler(Compiler):
- def __init__(self, arch:str):
- self.arch = arch
- super().__init__(f"compile_hip_{self.arch}")
- def compile(self, src:str) -> bytes:
- try: return compile_hip(src, self.arch)
- except RuntimeError as e: raise CompileError(e) from e
-
- class HWQueue:
- def __init__(self): self.q, self.cmd_offsets = [], [0]
- def _mark_command_end(self):
- self.cmd_offsets.append(len(self.q))
- return self
- def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
- def __len__(self): return len(self.cmd_offsets) - 1
-
- class HWPM4Queue(HWQueue):
+
+ class AMDSignal(HCQSignal):
+ def __init__(self, value=0, is_timeline=False):
+ self._signal = AMDDevice.signals_pool.pop()
+ self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
+ if is_timeline:
+ self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
+ self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
+ self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
+ else: self._event_mailbox_ptr = 0
+ super().__init__(value)
+ def __del__(self): AMDDevice.signals_pool.append(self._signal)
+ def _get_value(self) -> int: return self._signal[0]
+ def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
+ def _set_value(self, new_value:int): self._signal[0] = new_value
+ def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
+ start_time = time.time() * 1000
+ while (time_spent:=time.time() * 1000 - start_time) < timeout:
+ if self._signal[0] >= value: return
+
+ # Wait active for 5s, then going to sleep.
+ if time_spent > 5000 and self._event_mailbox_ptr != 0:
+ kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
+ raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
+
+ class AMDComputeQueue(HWComputeQueue):
  def __init__(self):
- self.binded_device, self.ptr_to_dispatch_packet = None, {}
+ self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
  super().__init__()

  def __del__(self):
  if self.binded_device is not None:
- self.binded_device.synchronize()
- self.binded_device._gpu_free(self.hw_page)
+ self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True))

- def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+ def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
@@ -102,106 +71,101 @@ class HWPM4Queue(HWQueue):
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]

- def memory_barrier(self):
+ def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+ cache_flush_flags = 0
+
+ if cache_flush:
+ cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+ amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+ # event_index__mec_release_mem__end_of_pipe = 5
+ # event_index__mec_release_mem__shader_done = 6
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+ amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+ amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+ *data64_le(address), *data64_le(value), cst]
+
+ def _memory_barrier(self):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
  amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
  nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
- self._invalidate_cache()
- return self._mark_command_end()
+ self._acquire_mem()

- def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
- self._invalidate_cache()
+ def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
+ self._acquire_mem(gli=0, gl2=0)

- user_data = [*data64_le(kernargs)]
- if hasattr(prg, 'dispatch_packet_offset'):
- dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
+ cmd_idx = self._cur_cmd_idx()
+ user_regs = [*data64_le(prg.device.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
+ if prg.enable_dispatch_ptr:
+ dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
  dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
  dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
- dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
- user_data = [*data64_le(dp_addr)] + user_data
- self.ptr_to_dispatch_packet[len(self)] = dp
+ dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
+ user_regs += [*data64_le(dp_addr)]
+ self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
+ user_regs += [*data64_le(args_state.ptr)]

- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
- *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
+ if prg.device.has_scratch_base_registers:
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
+ gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.device.scratch.va_addr >> 8)]
+ if prg.device.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
+ self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
+
+ self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
+
+ self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

- if signal is not None: self.signal(signal, signal_value)
- return self._mark_command_end()
-
- def update_exec(self, cmd_idx, global_size, local_size):
- # Patch the exec cmd with new launch dims
- assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
- self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
- self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+ def _update_exec(self, cmd_idx, global_size, local_size):
+ if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
+ if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)

- if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
- dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
- dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+ if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
+ if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+ if global_size is not None:
+ dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]

- def wait(self, signal:hsa.amd_signal_t, value=0):
- addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+ def _wait(self, signal, value=0):
  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
  amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
- return self._mark_command_end()
+ amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]

- def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
- cache_flush_flags = 0
+ def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)

- if cache_flush:
- cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+ def _signal(self, signal, value=0):
+ # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
+ if signal._event_mailbox_ptr != 0:
+ self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
+ value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)

- # event_index__mec_release_mem__end_of_pipe = 5
- # event_index__mec_release_mem__shader_done = 6
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- *data64_le(address), *data64_le(value), cst]
+ def _update_wait(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=4, data=[value])

- def timestamp(self, sig):
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
- address=ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)
- return self._mark_command_end()
+ def _update_signal(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))

- def signal(self, signal:hsa.amd_signal_t, value=0):
- # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
- value=value, cache_flush=True)
- if signal.event_mailbox_ptr != 0:
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
- value=signal.event_id, cst=signal.event_id, cache_flush=True)
- return self._mark_command_end()
-
- def update_wait(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
- if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
- return self
-
- def update_signal(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
- if signal is not None:
- self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if signal.event_mailbox_ptr != 0:
- self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
- if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
- return self
-
- def bind(self, device: AMDDevice):
+ # Check if the signal command has mailptr part
+ if signal is not None and self.cmds_len[cmd_idx] > 8:
+ self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
+
+ def bind(self, device):
  self.binded_device = device
- self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
  hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
  for i, value in enumerate(self.q): hw_view[i] = value
@@ -209,7 +173,7 @@ class HWPM4Queue(HWQueue):
  len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
  self.q = hw_view # type: ignore

- def submit(self, device: AMDDevice):
+ def _submit(self, device):
  cmds = self.indirect_cmd if device == self.binded_device else self.q

  for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
@@ -217,25 +181,20 @@ class HWPM4Queue(HWQueue):
  device.compute_queue.put_value += len(cmds)
  device.compute_queue.write_ptr[0] = device.compute_queue.put_value
  device.compute_queue.doorbell[0] = device.compute_queue.put_value
- return self

  SDMA_MAX_COPY_SIZE = 0x400000
- class HWCopyQueue(HWQueue):
+ class AMDCopyQueue(HWCopyQueue):
  def __init__(self):
- self.internal_cmd_sizes = []
+ self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
  super().__init__()

  def _q(self, arr):
  self.q += arr
  self.internal_cmd_sizes.append(len(arr))

- def copy(self, dest, src, copy_size):
- # Invalidate cache inv
- self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
- amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
-
- copied = 0
- copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+ def _copy(self, dest, src, copy_size):
+ copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+ self.copy_cmds_per_copy[len(self) - 1] = copy_commands
  for _ in range(copy_commands):
  step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)

@@ -244,39 +203,33 @@ class HWCopyQueue(HWQueue):

  copied += step_copy_size

- # Invalidate cache wb
- self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
-
- return self._mark_command_end()
-
- def signal(self, signal: hsa.amd_signal_t, value=0):
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
+ def _update_copy(self, cmd_idx, dest=None, src=None):
+ for i in range(self.copy_cmds_per_copy[cmd_idx]):
+ if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
+ if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])

- if signal.event_mailbox_ptr != 0:
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
- self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
+ def _signal(self, signal, value=0):
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])

- return self._mark_command_end()
+ if signal._event_mailbox_ptr != 0:
+ self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
+ self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])

- def wait(self, signal: hsa.amd_signal_t, value=0):
+ def _wait(self, signal, value=0):
  self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
+ amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
  amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])

- return self._mark_command_end()
+ def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
+ def _update_wait(self, cmd_idx, signal=None, value=None):
+ if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
+ if value is not None: self._patch(cmd_idx, offset=3, data=[value])

- def update_wait(self, cmd_idx, signal=None, value=None):
- assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
- if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
- if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
- return self
-
- def timestamp(self, sig: hsa.amd_signal_t):
+ def _timestamp(self, signal):
  self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
- *data64_le(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
- return self._mark_command_end()
+ *data64_le(signal._timestamp_addr)])

- def submit(self, device: AMDDevice):
+ def _submit(self, device):
  if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

  tail_blit_dword = 0
@@ -298,103 +251,69 @@ class HWCopyQueue(HWQueue):

  device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
  device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
- return self

- SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
- class AMDProgram:
- def __init__(self, device:AMDDevice, name:str, lib:bytes):
- # TODO; this API needs the type signature of the function and global_size/local_size
- self.device, self.name, self.lib = device, name, lib
+ class AMDArgsState(HCQArgsState):
+ def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+ super().__init__(ptr, prg, bufs, vals=vals)

- if DEBUG >= 6:
- asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
- print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
+ self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
+ self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')

- _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
- sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+ self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
+ self.vals[:] = array.array('I', vals)

- lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
- self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
- lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+ def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+ def update_var(self, index:int, val:int): self.vals[index] = val

- for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
- if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
+ class AMDProgram(HCQProgram):
+ def __init__(self, device:AMDDevice, name:str, lib:bytes):
+ # TODO; this API needs the type signature of the function and global_size/local_size
+ self.device, self.name, self.lib = device, name, lib
+ image, sections, _ = elf_loader(self.lib)
+ self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000), BufferOptions(cpu_access=True, nolru=True))
+ ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

- entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
- self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
- self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
- self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
- self.kernargs_alloc_size = self.kernargs_segment_size
- self.kernargs_offset = 0
+ entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
+ self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
+ self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
+ self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]

  lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
  if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
  if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")

  code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
- self.rsrc1 = code.compute_pgm_rsrc1
- self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-
- if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
- # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
- self.dispatch_packet_offset = self.kernargs_alloc_size
- self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
-
  assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
- assert code.workitem_private_segment_byte_size == 0
- assert code.max_scratch_backing_memory_byte_size == 0
- assert code.kernel_code_prefetch_byte_size == 0

+ # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
+ self.rsrc1 = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.device.target < 120000 else 0)
+ self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
  self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset

- HWPM4Queue().memory_barrier().submit(self.device)
+ # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
+ # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
+ self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+ self.enable_private_segment_sgpr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
+ additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
+
+ super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)

- # NOTE: no programs are ever freed
  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
-
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
- if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
- self.device.kernargs_ptr = self.device.kernargs.va_addr
-
- if not hasattr(self, "args_struct_t"):
- self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
- [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
- if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
- raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
-
- args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
- for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
- for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
-
- sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
- q = HWPM4Queue()
- q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
- if wait or PROFILE: q.timestamp(sig_st)
- q.exec(self, self.device.kernargs_ptr, global_size, local_size)
- if wait or PROFILE: q.timestamp(sig_en)
- q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
- self.device.timeline_value += 1
- self.device.kernargs_ptr += self.kernargs_alloc_size
-
- if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
- if wait:
- self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
- return (sig_en.start_ts - sig_st.start_ts) / 1e8
-
- class AMDAllocator(HCQCompatAllocator):
+ if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True, nolru=True))
+
+ class AMDAllocator(HCQAllocator):
  def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)

- def _alloc(self, size:int, options:BufferOptions):
- try:
- if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
- return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
- except OSError as e:
- if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
- raise
+ def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+ if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+ if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)

- def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)
+ def _free(self, opaque, options:BufferOptions):
+ self.device.synchronize()
+ self.device._gpu_free(opaque)
+
+ def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400

@@ -406,18 +325,19 @@ class AMDQueueDesc:
  doorbell: memoryview
  put_value: int = 0

- class AMDDevice(HCQCompatCompiled):
+ class AMDDevice(HCQCompiled):
  kfd:int = -1
  event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None
- signals_pool:List[hsa.amd_signal_t] = []
+ signals_pool:List[memoryview] = []
  gpus:List[pathlib.Path] = []

  def _gpu_map(self, mem):
  if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
  mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
  c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
- stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
+ stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
+ n_devices=len(mem.mapped_gpu_ids))
  assert stm.n_success == len(mem.mapped_gpu_ids)

  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
@@ -429,7 +349,15 @@ class AMDDevice(HCQCompatCompiled):
  else:
  buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
  assert addr != 0xffffffffffffffff
- mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+
+ try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
+ flags=flags, mmap_offset=buf)
+ except OSError as e:
+ if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
+ raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
+ if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
+ raise
+
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
  buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
  assert addr == buf == mem.va_addr
@@ -439,68 +367,39 @@ class AMDDevice(HCQCompatCompiled):
  def _gpu_free(self, mem):
  if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
  c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
- stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+ stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
  assert stm.n_success == len(gpus)
  libc.munmap(mem.va_addr, mem.size)
- kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
-
- @classmethod
- def _read_signal(self, sig): return sig.value
-
- @classmethod
- def _read_timestamp(self, sig): return sig.start_ts
-
- @classmethod
- def _set_signal(self, sig, value): sig.value = value
-
- @classmethod
- def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
- self._set_signal(ret := self.signals_pool.pop(), value)
- if (sync_event:=kwargs.get('sync_event')) is not None:
- ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
- ret.event_id = sync_event.event_id
- else: ret.event_mailbox_ptr = ret.event_id = 0
- return ret
-
- @classmethod
- def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
- assert signal.event_id != 0, "can't wait on this signal"
- evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
-
- # Wait active for 5s, then going to sleep.
- start_time = time.time() * 1000
- while (time_spent:=time.time() * 1000 - start_time) < timeout:
- if signal.value >= value: return
- if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
- raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
+ kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)

  def __init__(self, device:str=""):
  if AMDDevice.kfd == -1:
  AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
- AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+ gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+ gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+ AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
+
  self.device_id = int(device.split(":")[1]) if ":" in device else 0
+ if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
  with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
  with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
  self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
- target = int(self.properties['gfx_target_version'])
- self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
- kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+ self.target = int(self.properties['gfx_target_version'])
+ self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
+ if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+
+ kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)

  if AMDDevice.event_page is None:
- AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+ AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
- AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
- sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+ AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
+ kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
  else:
  self._gpu_map(AMDDevice.signals_page)
  self._gpu_map(AMDDevice.event_page)
- sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
-
- self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()
-
- self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
- self.kernargs_ptr = self.kernargs.va_addr

  # Scratch setup
  max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
@@ -509,28 +408,37 @@ class AMDDevice(HCQCompatCompiled):
  wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
  self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
  self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+ self.has_scratch_base_registers = self.target >= 110000
  engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
  self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))

- self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
+ # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
+ sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
+ vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
+ wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (max_cu_id + 1), mmap.PAGESIZE)
+ ctl_stack_size = round_up(12 * (max_cu_id + 1) * (max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
+ self.debug_memory_size = round_up((max_cu_id + 1) * (max_wave_id + 1) * 32, 64)
+
+ self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
+ eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
  self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)

- super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), HWPM4Queue, HWCopyQueue,
- timeline_signals=[self._get_signal(sync_event=sync_event), self._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))])
+ self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
+ self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)

- def _gpu2cpu_time(self, gpu_time, is_copy):
- if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
- return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+ super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
+ AMDSignal, AMDComputeQueue, AMDCopyQueue)

- def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
+ def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None, ctl_stack_size=0) -> AMDQueueDesc:
  gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
+ cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE),
+ kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
  eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
- queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
+ queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
  queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
- eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
- ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
+ eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
+ ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size if cwsr_ctx else 0,
  write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)

  if not hasattr(self, 'doorbells'):
@@ -541,10 +449,23 @@ class AMDDevice(HCQCompatCompiled):
  read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
  doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))

- def synchronize(self):
- AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+ def invalidate_caches(self):
+ AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
+ self.timeline_value += 1
+ self.synchronize()
+
+ def on_device_hang(self):
+ report = []
+
+ ev = (kfd.struct_kfd_event_data)(event_id=self.mem_fault_event.event_id)
+ kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
+ if ev.memory_exception_data.gpu_id:
+ pfstatus = ' '.join(f'{k[0]}={getattr(ev.memory_exception_data.failure, k[0])}' for k in ev.memory_exception_data.failure._fields_)
+ report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {pfstatus}"]
+
+ ev = (kfd.struct_kfd_event_data)(event_id=self.hw_fault_event.event_id)
+ kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
+ if ev.hw_exception_data.gpu_id:
+ report += [f"HW fault: {' '.join(f'{k[0]}={getattr(ev.hw_exception_data, k[0])}' for k in ev.hw_exception_data._fields_)}"]

- # reset kernargs
- self.kernargs_ptr = self.kernargs.va_addr
- if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
- if PROFILE: self._prof_process_events()
+ raise RuntimeError("\n".join(report))
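The AMDSignal.wait method added in this diff polls the signal value in user space and, after five seconds, falls back to sleeping in the kernel via AMDKFD_IOC_WAIT_EVENTS. A minimal standalone sketch of that wait pattern, with read_value and kernel_wait as illustrative stand-ins rather than tinygrad API:

    import time

    def wait_signal(read_value, target, timeout_ms=30000, kernel_wait=None):
      # Poll until the signal reaches the target value or the timeout expires.
      start = time.time() * 1000
      while (spent := time.time() * 1000 - start) < timeout_ms:
        if read_value() >= target: return
        # Spin for the first 5 seconds, then let the kernel block us for up to 1s at a time.
        if spent > 5000 and kernel_wait is not None: kernel_wait(1000)
      raise RuntimeError(f"signal not set to {target} within {timeout_ms} ms")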