tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. tinygrad/codegen/kernel.py +114 -172
  2. tinygrad/codegen/linearize.py +211 -81
  3. tinygrad/codegen/lowerer.py +30 -35
  4. tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
  5. tinygrad/codegen/transcendental.py +12 -13
  6. tinygrad/device.py +170 -47
  7. tinygrad/dtype.py +28 -26
  8. tinygrad/engine/jit.py +80 -63
  9. tinygrad/engine/memory.py +4 -5
  10. tinygrad/engine/multi.py +162 -0
  11. tinygrad/engine/realize.py +58 -107
  12. tinygrad/engine/schedule.py +381 -314
  13. tinygrad/engine/search.py +40 -44
  14. tinygrad/gradient.py +70 -0
  15. tinygrad/helpers.py +77 -58
  16. tinygrad/nn/__init__.py +30 -32
  17. tinygrad/nn/datasets.py +1 -2
  18. tinygrad/nn/optim.py +22 -26
  19. tinygrad/nn/state.py +89 -64
  20. tinygrad/ops.py +562 -446
  21. tinygrad/renderer/__init__.py +79 -36
  22. tinygrad/renderer/cstyle.py +70 -84
  23. tinygrad/renderer/llvmir.py +32 -20
  24. tinygrad/renderer/ptx.py +79 -99
  25. tinygrad/renderer/wgsl.py +87 -0
  26. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  27. tinygrad/runtime/autogen/comgr.py +2 -0
  28. tinygrad/runtime/autogen/kfd.py +4 -3
  29. tinygrad/runtime/autogen/kgsl.py +1 -1
  30. tinygrad/runtime/autogen/libpciaccess.py +2023 -0
  31. tinygrad/runtime/autogen/llvm.py +11379 -0
  32. tinygrad/runtime/autogen/vfio.py +891 -0
  33. tinygrad/runtime/graph/cuda.py +8 -9
  34. tinygrad/runtime/graph/hcq.py +84 -79
  35. tinygrad/runtime/graph/metal.py +19 -21
  36. tinygrad/runtime/ops_amd.py +488 -327
  37. tinygrad/runtime/ops_clang.py +15 -28
  38. tinygrad/runtime/ops_cloud.py +34 -34
  39. tinygrad/runtime/ops_cuda.py +30 -27
  40. tinygrad/runtime/ops_disk.py +62 -63
  41. tinygrad/runtime/ops_dsp.py +129 -38
  42. tinygrad/runtime/ops_gpu.py +30 -30
  43. tinygrad/runtime/ops_hip.py +29 -31
  44. tinygrad/runtime/ops_llvm.py +45 -40
  45. tinygrad/runtime/ops_metal.py +93 -73
  46. tinygrad/runtime/ops_npy.py +2 -2
  47. tinygrad/runtime/ops_nv.py +232 -270
  48. tinygrad/runtime/ops_python.py +51 -46
  49. tinygrad/runtime/ops_qcom.py +129 -157
  50. tinygrad/runtime/ops_webgpu.py +63 -0
  51. tinygrad/runtime/support/allocator.py +94 -0
  52. tinygrad/runtime/support/am/__init__.py +0 -0
  53. tinygrad/runtime/support/am/amdev.py +384 -0
  54. tinygrad/runtime/support/am/ip.py +463 -0
  55. tinygrad/runtime/support/compiler_cuda.py +4 -2
  56. tinygrad/runtime/support/elf.py +26 -4
  57. tinygrad/runtime/support/hcq.py +254 -324
  58. tinygrad/runtime/support/llvm.py +32 -0
  59. tinygrad/shape/shapetracker.py +84 -53
  60. tinygrad/shape/view.py +103 -138
  61. tinygrad/spec.py +154 -0
  62. tinygrad/tensor.py +744 -496
  63. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
  64. tinygrad-0.10.1.dist-info/RECORD +86 -0
  65. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
  66. tinygrad/engine/lazy.py +0 -228
  67. tinygrad/function.py +0 -212
  68. tinygrad/multi.py +0 -177
  69. tinygrad/runtime/graph/clang.py +0 -39
  70. tinygrad-0.10.0.dist-info/RECORD +0 -77
  71. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
  72. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
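The bulk of the diff below is the AMD runtime rewrite in tinygrad/runtime/ops_amd.py. As a reading aid only, here is a minimal, hypothetical sketch of one rename that recurs throughout the hunks (BufferOptions → BufferSpec); the identifier names and keyword arguments are taken from the removed/added lines below, while the stand-alone call site itself is an assumption, not code from either package:

    # tinygrad 0.10.0 (old name, as removed in the hunks below)
    from tinygrad.device import BufferOptions
    opts = BufferOptions(cpu_access=True, nolru=True, uncached=True)

    # tinygrad 0.10.1 (new name, as added in the hunks below)
    from tinygrad.device import BufferSpec
    spec = BufferSpec(cpu_access=True, nolru=True, uncached=True)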
@@ -1,28 +1,23 @@
  from __future__ import annotations
- from typing import Tuple, List, Any
- import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
+ from typing import Any, cast
+ import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, atexit
  assert sys.platform != 'win32'
  from dataclasses import dataclass
- from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
- from tinygrad.device import BufferOptions
- from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
+ from tinygrad.ops import sint
+ from tinygrad.device import BufferSpec
+ from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
  from tinygrad.renderer.cstyle import AMDRenderer
- from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+ from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, libpciaccess, vfio
+ from tinygrad.runtime.autogen.am import am
  from tinygrad.runtime.support.compiler_hip import AMDCompiler
  from tinygrad.runtime.support.elf import elf_loader
+ from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
  if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
- if getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
-
- def is_usable_gpu(gpu_id):
- with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
- return False

  regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107

- # VGT_EVENT_TYPE in navi10_enum.h
- CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
-
- WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+ EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
  WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

  COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
@@ -31,246 +26,227 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
  def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2

  class AMDSignal(HCQSignal):
- def __init__(self, value=0, is_timeline=False):
- self._signal = AMDDevice.signals_pool.pop()
- self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
- if is_timeline:
- self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
- self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
- self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
- else: self._event_mailbox_ptr = 0
- super().__init__(value)
- def __del__(self): AMDDevice.signals_pool.append(self._signal)
- def _get_value(self) -> int: return self._signal[0]
- def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
- def _set_value(self, new_value:int): self._signal[0] = new_value
- def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
- start_time = time.time() * 1000
- while (time_spent:=time.time() * 1000 - start_time) < timeout:
- if self._signal[0] >= value: return
-
- # Wait active for 5s, then going to sleep.
- if time_spent > 5000 and self._event_mailbox_ptr != 0:
- kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
- raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
-
- class AMDComputeQueue(HWComputeQueue):
- def __init__(self):
- self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
- super().__init__()
+ def __init__(self, base_addr:int|None=None, **kwargs):
+ super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
+
+ def __del__(self):
+ if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
+
+ def _sleep(self, time_spent_waiting_ms:int):
+ # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
+ if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)

+ class AMDComputeQueue(HWQueue):
  def __del__(self):
  if self.binded_device is not None:
- self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True))
-
- def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
- amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
-
- def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
- cache_flush_flags = 0
-
- if cache_flush:
- cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
- amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
-
- # event_index__mec_release_mem__end_of_pipe = 5
- # event_index__mec_release_mem__shader_done = 6
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
- amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
- amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
- *data64_le(address), *data64_le(value), cst]
-
- def _memory_barrier(self):
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
- amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
- nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
- self._acquire_mem()
-
- def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
- self._acquire_mem(gli=0, gl2=0)
-
- cmd_idx = self._cur_cmd_idx()
- user_regs = [*data64_le(prg.device.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
+ self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+
+ def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
+
+ def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
+ wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
+ | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
+
+ self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
+
+ def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+ cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
+ | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
+ | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
+ | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
+ | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
+
+ self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
+
+ def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
+ cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
+ | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
+ | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
+
+ event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
+ | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
+
+ memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
+
+ self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
+
+ def memory_barrier(self):
+ self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
+ self.acquire_mem()
+ return self
+
+ def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+ self.bind_args_state(args_state)
+
+ self.acquire_mem(gli=0, gl2=0)
+
+ if prg.enable_private_segment_sgpr:
+ scratch_hilo = data64_le(prg.dev.scratch.va_addr)
+ # sgpr word1 bit31 enables swizzle
+ # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
+ user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000] if prg.enable_private_segment_sgpr else []
+ else: user_regs = []
  if prg.enable_dispatch_ptr:
  dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
- dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
- dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+
+ self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
+ self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
  dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
  user_regs += [*data64_le(dp_addr)]
- self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
+
  user_regs += [*data64_le(args_state.ptr)]

- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
- if prg.device.has_scratch_base_registers:
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
- gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.device.scratch.va_addr >> 8)]
- if prg.device.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
-
- self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
-
- self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
-
- def _update_exec(self, cmd_idx, global_size, local_size):
- if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
- if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)
-
- if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
- if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
- if global_size is not None:
- dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
-
- def _wait(self, signal, value=0):
- self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
- amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
-
- def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
-
- def _signal(self, signal, value=0):
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
+ if prg.dev.has_scratch_base_registers:
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
+ if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
+
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
+ self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
+
+ self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
+ self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
+ return self
+
+ def wait(self, signal:AMDSignal, value:sint=0):
+ self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
+ return self
+
+ def timestamp(self, signal:AMDSignal):
+ self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
+ return self
+
+ def signal(self, signal:AMDSignal, value:sint=0):
  # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
- if signal._event_mailbox_ptr != 0:
- self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
- value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)
-
- def _update_wait(self, cmd_idx, signal=None, value=None):
- if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
- if value is not None: self._patch(cmd_idx, offset=4, data=[value])
-
- def _update_signal(self, cmd_idx, signal=None, value=None):
- if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
- if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
-
- # Check if the signal command has mailptr part
- if signal is not None and self.cmds_len[cmd_idx] > 8:
- self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
-
- def bind(self, device):
- self.binded_device = device
- self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
+ self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+ amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
+
+ if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+ self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+ amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
+ return self
+
+ def bind(self, dev:AMDDevice):
+ self.binded_device = dev
+ self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
  hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
- for i, value in enumerate(self.q): hw_view[i] = value
+ for i, value in enumerate(self._q): hw_view[i] = value

  self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
- len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
- self.q = hw_view # type: ignore
+ len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
+ self._q = hw_view
+ return self

- def _submit(self, device):
- cmds = self.indirect_cmd if device == self.binded_device else self.q
+ def _submit(self, dev:AMDDevice):
+ cmds = self.indirect_cmd if dev == self.binded_device else self._q

- for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
+ for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value

- device.compute_queue.put_value += len(cmds)
- device.compute_queue.write_ptr[0] = device.compute_queue.put_value
- device.compute_queue.doorbell[0] = device.compute_queue.put_value
+ dev.compute_queue.put_value += len(cmds)
+ dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
+ dev.compute_queue.doorbell[0] = dev.compute_queue.put_value

  SDMA_MAX_COPY_SIZE = 0x400000
- class AMDCopyQueue(HWCopyQueue):
+ class AMDCopyQueue(HWQueue):
  def __init__(self):
- self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
+ self.internal_cmd_sizes = []
  super().__init__()

- def _q(self, arr):
- self.q += arr
+ def q(self, *arr):
+ super().q(*arr)
  self.internal_cmd_sizes.append(len(arr))

- def _copy(self, dest, src, copy_size):
+ def copy(self, dest:sint, src:sint, copy_size:int):
  copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
- self.copy_cmds_per_copy[len(self) - 1] = copy_commands
+
  for _ in range(copy_commands):
  step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)

- self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
- amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
+ self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+ amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))

  copied += step_copy_size
+ return self

- def _update_copy(self, cmd_idx, dest=None, src=None):
- for i in range(self.copy_cmds_per_copy[cmd_idx]):
- if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
- if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
+ def signal(self, signal:AMDSignal, value:sint=0):
+ self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)

- def _signal(self, signal, value=0):
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
+ if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+ self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
+ self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
+ elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))

- if signal._event_mailbox_ptr != 0:
- self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
- self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
+ return self

- def _wait(self, signal, value=0):
- self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
- amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
- amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
+ def wait(self, signal:AMDSignal, value:sint=0):
+ self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+ amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
+ amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
+ return self

- def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
- def _update_wait(self, cmd_idx, signal=None, value=None):
- if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
- if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+ def timestamp(self, signal:AMDSignal):
+ self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+ *data64_le(signal.timestamp_addr))
+ return self

- def _timestamp(self, signal):
- self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
- *data64_le(signal._timestamp_addr)])
+ def bind(self, dev:AMDDevice):
+ if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return

- def _submit(self, device):
- if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
+ self.binded_device = dev
+ self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+ hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+ for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0

- tail_blit_dword = 0
- for cmdsz in self.internal_cmd_sizes:
- if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
- tail_blit_dword += cmdsz
+ self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
+ self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]

- start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
- device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
- device.sdma_queue.put_value += tail_blit_dword * 4
+ def _submit(self, dev:AMDDevice):
+ if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

- if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
- zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
- ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
- device.sdma_queue.put_value += zero_fill
+ if self.binded_device == dev:
+ # An IB packet must end on a 8 DW boundary.
+ add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
+ cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]

- device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
- device.sdma_queue.put_value += rem_packet_cnt * 4
+ if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
+ cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
+ else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes

- device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
- device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
+ tail_blit_dword = 0
+ for cmdsz in cmd_sizes:
+ if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
+ tail_blit_dword += cmdsz

- class AMDArgsState(HCQArgsState):
- def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
- super().__init__(ptr, prg, bufs, vals=vals)
+ start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
+ dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
+ dev.sdma_queue.put_value += tail_blit_dword * 4

- self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
- self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
+ if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
+ zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
+ ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
+ dev.sdma_queue.put_value += zero_fill

- self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
- self.vals[:] = array.array('I', vals)
+ dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
+ dev.sdma_queue.put_value += rem_packet_cnt * 4

- def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
- def update_var(self, index:int, val:int): self.vals[index] = val
+ dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
+ dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value

  class AMDProgram(HCQProgram):
- def __init__(self, device:AMDDevice, name:str, lib:bytes):
+ def __init__(self, dev:AMDDevice, name:str, lib:bytes):
  # TODO; this API needs the type signature of the function and global_size/local_size
- self.device, self.name, self.lib = device, name, lib
+ self.dev: AMDDevice = dev
+ self.name, self.lib = name, lib
  image, sections, _ = elf_loader(self.lib)
- self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000), BufferOptions(cpu_access=True, nolru=True))
+ self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
  ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

  entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
@@ -279,43 +255,43 @@ class AMDProgram(HCQProgram):
  self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]

  lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
- if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
- if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
+ if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
+
+ # Ensure scratch size
+ self.dev._ensure_has_local_memory(self.private_segment_size)

  code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
  assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32

  # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
- self.rsrc1 = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.device.target < 120000 else 0)
- self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
- self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+ self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
+ self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
+ self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset

  # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
  # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
- self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
- self.enable_private_segment_sgpr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
+ self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+ self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
  additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

- super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
+ super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)

  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True, nolru=True))
+ if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))

- class AMDAllocator(HCQAllocator):
- def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
+ class AMDAllocator(HCQAllocator['AMDDevice']):
+ def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)

- def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
- if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
- if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+ def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+ return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)

- def _free(self, opaque, options:BufferOptions):
- self.device.synchronize()
- self.device._gpu_free(opaque)
+ def _free(self, opaque, options:BufferSpec):
+ self.dev.synchronize()
+ self.dev.dev_iface.free(opaque)

- def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+ def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)

- MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+ MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000

  @dataclass
  class AMDQueueDesc:
@@ -325,147 +301,332 @@ class AMDQueueDesc:
325
301
  doorbell: memoryview
326
302
  put_value: int = 0
327
303
 
328
- class AMDDevice(HCQCompiled):
329
- kfd:int = -1
330
- event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
331
- signals_page:Any = None
332
- signals_pool:List[memoryview] = []
333
- gpus:List[pathlib.Path] = []
334
-
335
- def _gpu_map(self, mem):
336
- if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
337
- mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
338
- c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
339
- stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
340
- n_devices=len(mem.mapped_gpu_ids))
341
- assert stm.n_success == len(mem.mapped_gpu_ids)
342
-
343
- def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
344
- flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
345
- if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
346
- if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
304
+ class KFDIface:
305
+ kfd:HWInterface|None = None
306
+ event_page:HCQBuffer|None = None
307
+ gpus:list[HWInterface] = []
308
+
309
+ def _is_usable_gpu(self, gpu_id):
310
+ with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
311
+ return False
312
+
313
+ def __init__(self, dev, device_id):
314
+ self.dev = dev
315
+
316
+ kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"
317
+
318
+ # Initialize KFD interface during first run
319
+ if KFDIface.kfd is None:
320
+ KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
321
+ gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
322
+ gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
323
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
324
+ KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
325
+
326
+ if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
327
+
328
+ self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
329
+ self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
330
+ self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
331
+
332
+ kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
333
+
334
+ # Set these for our device.
335
+ if KFDIface.event_page is None:
336
+ KFDIface.event_page = self.alloc(0x8000, uncached=True)
337
+ kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_page_offset=KFDIface.event_page.meta.handle)
338
+ else: self.map(KFDIface.event_page)
339
+
340
+ # Event to wait for queues completion
341
+ self.dev.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
342
+ self.dev.queue_event_mailbox_ptr = KFDIface.event_page.va_addr + self.dev.queue_event.event_slot_index * 8
343
+ self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.dev.queue_event.event_id)
344
+ self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)
345
+
346
+ # OS events to collect memory and hardware faults
347
+ self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
348
+ self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
349
+
350
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
351
+ flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
352
+
353
+ if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
354
+ else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
355
+
356
+ if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
357
+
347
358
  if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
348
- buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
349
- else:
350
- buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
359
+ buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
360
+ else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
351
361
  assert addr != 0xffffffffffffffff
352
362
 
353
363
  try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
354
364
  flags=flags, mmap_offset=buf)
355
365
  except OSError as e:
356
- if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
366
+ if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
357
367
  raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
358
368
  if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
359
369
  raise
360
370
 
361
371
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
362
- buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
372
+ buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
363
373
  assert addr == buf == mem.va_addr
364
- if map_to_gpu: self._gpu_map(mem)
365
- return mem
366
374
 
367
- def _gpu_free(self, mem):
368
- if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
375
+ self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
376
+ return hcqbuf
377
+
378
+ def free(self, mem):
379
+ if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
369
380
  c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
370
- stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
381
+ stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
371
382
  assert stm.n_success == len(gpus)
372
- libc.munmap(mem.va_addr, mem.size)
373
- kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
383
+ if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
384
+ kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
385
+
386
+ def map(self, mem):
387
+ if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
388
+ mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
389
+ c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
390
+ stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
391
+ n_devices=len(mem.meta.mapped_gpu_ids))
392
+ assert stm.n_success == len(mem.meta.mapped_gpu_ids)
393
+
394
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
395
+ cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
396
+ queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
397
+ queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
398
+ eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
399
+ ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
400
+ write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
374
401
 
375
- def __init__(self, device:str=""):
376
- if AMDDevice.kfd == -1:
377
- AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
378
- gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
379
- gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
402
+ if not hasattr(self, 'doorbells'):
403
+ self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
404
+ self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
405
+
406
+ return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
407
+ read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
408
+ doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
409
+
410
+ def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
411
+
412
+ def on_device_hang(self):
413
+ def _collect_str(st): return ' '.join(f'{k[0]}={getattr(st, k[0])}' for k in st._fields_)
414
+
415
+ report = []
416
+ for evnt in [self.mem_fault_event, self.hw_fault_event]:
417
+ ev = (kfd.struct_kfd_event_data)(event_id=evnt.event_id)
418
+ kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
419
+ if evnt == self.mem_fault_event and ev.memory_exception_data.gpu_id:
420
+ report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {_collect_str(ev.memory_exception_data.failure)}"]
421
+ if evnt == self.hw_fault_event and ev.hw_exception_data.gpu_id: report += [f"HW fault: {_collect_str(ev.hw_exception_data)}"]
422
+
423
+ raise RuntimeError("\n".join(report))
424
+
425
+ @dataclass
426
+ class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
427
+
428
+ class PCIIface:
429
+ vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
430
+ vfio_fd:HWInterface
431
+ gpus:list[Any] = []
432
+
433
+ def __init__(self, dev, dev_id):
434
+ self.dev = dev
435
+
436
+ if first_dev:=len(PCIIface.gpus) == 0:
437
+ libpciaccess.pci_system_init()
438
+ pci_iter = libpciaccess.pci_id_match_iterator_create(None)
439
+ while pcidev:=libpciaccess.pci_device_next(pci_iter):
440
+ if pcidev.contents.vendor_id == 0x1002 and pcidev.contents.device_id == 0x744c: PCIIface.gpus.append(pcidev.contents)
441
+
442
+ # TODO: visible_devices should be handled layer above this?
380
443
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
381
- AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
444
+ PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
445
+
446
+ self.pcidev = PCIIface.gpus[dev_id]
447
+ self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
448
+
449
+ # Unbind the device from the kernel driver
450
+ if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
451
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
452
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
453
+
454
+ # Probe device
455
+ libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
456
+
457
+ # Try to init vfio. Use it if success.
458
+ if PCIIface.vfio:
459
+ try:
460
+ if first_dev:
461
+ HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
462
+ PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
463
+ vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
464
+
465
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
466
+ HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
467
+
468
+ iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
469
+ except OSError:
470
+ if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
471
+ PCIIface.vfio = False
472
+
473
+ # Init vfio for the device
474
+ if PCIIface.vfio:
475
+ self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
476
+ vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
477
+
478
+ if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
479
+ self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
480
+
481
+ self.irq_fd = HWInterface.eventfd(0, 0)
482
+ self.irq_poller = select.poll()
483
+ self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
484
+
485
+ irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
486
+ argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
487
+ vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
488
+ else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
489
+
490
+ self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
491
+ self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
492
+
493
+ self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
494
+ self.doorbell_cpu_addr = mv_address(dbell)
495
+
496
+ libpciaccess.pci_device_cfg_read_u16(self.pcidev, ctypes.byref(val:=ctypes.c_uint16()), libpciaccess.PCI_COMMAND)
497
+ libpciaccess.pci_device_cfg_write_u16(self.pcidev, val.value | libpciaccess.PCI_COMMAND_MASTER, libpciaccess.PCI_COMMAND)
498
+
499
+ array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
500
+ simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
501
+ self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': self.adev.ip_versions[am.GC_HWIP],
502
+ 'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
503
+ 'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
504
+
505
+ def _map_pci_range(self, bar, off=0, addr=0, size=None):
506
+ fd, sz = self.bar_fds[bar], size or self.pcidev.regions[bar].size
507
+ return to_mv(fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz)
508
+
509
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
510
+ if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
511
+ vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
512
+ va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
513
+
514
+ # Read pagemap to get the physical address of each page. The pages are locked.
515
+ self.pagemap.seek(va // mmap.PAGESIZE * 8)
516
+ paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
517
+ am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
518
+ return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
519
+
520
+ am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
521
+ if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
522
+ return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
523
+
524
+ def free(self, mem):
525
+ for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
526
+ if not mem.meta.mapping.system: self.adev.mm.vfree(mem.meta.mapping)
527
+
528
+ def map(self, mem):
529
+ # Check if the memory is already mapped on this device
530
+ if self.dev in mem.meta.mapped_devs: return
531
+ mem.meta.mapped_devs.append(self.dev)
532
+
533
+ owner_sys_base = mem.meta.owner.dev_iface.pcidev.regions[0].base_addr
534
+ paddrs = [(paddr if mem.meta.mapping.system else (paddr + owner_sys_base), size) for paddr, size in mem.meta.mapping.paddrs]
535
+ self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
536
+
537
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
538
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
539
+ self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
540
+ doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
541
+ else:
542
+ self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
543
+ eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

-    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
+      read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
+
+  def sleep(self, timeout):
+    if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
+      self.irq_fd.read(8 * events_cnt)
+      self.adev.ih.interrupt_handler()
+
+  def on_device_hang(self):
+    for d in self.dev.devices: d.dev_iface.adev.gmc.on_interrupt()
+    raise RuntimeError("Device hang detected")
+
+  def device_fini(self): self.adev.fini()
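The AMDQueueDesc built above is nothing more than four memoryview windows (ring, read_ptr, write_ptr, doorbell). Roughly, a submission then amounts to copying encoded command words into the ring, publishing the new write pointer, and ringing the doorbell; the sketch below is illustrative only (no ring-full handling, and the helper name submit_words is not tinygrad API):

def submit_words(q, words: list[int]):
  wptr = q.write_ptr[0]
  for i, w in enumerate(words): q.ring[(wptr + i) % len(q.ring)] = w   # ring is a u32 view, it wraps
  q.write_ptr[0] = wptr + len(words)   # publish the new write pointer in the gart page
  q.doorbell[0] = wptr + len(words)    # the doorbell write tells the GPU to fetch up to that offset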

-    with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
-    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
-    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
-    self.target = int(self.properties['gfx_target_version'])
+class AMDDevice(HCQCompiled):
+  driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
+  signals_page:Any = None
+  signals_pool:list[int] = []
+
+  def __init__(self, device:str=""):
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
+    self.target = int(self.dev_iface.props['gfx_target_version'])
     self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
     if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
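As a quick check of the arch-string decoding above, plain arithmetic on the format string (nothing tinygrad-specific is assumed):

for target in (100300, 110000, 110002):
  print("gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100))
# -> gfx1030, gfx1100, gfx1102; targets below 100300 or at/above 120000 hit the RuntimeError above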

-    kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
-
-    if AMDDevice.event_page is None:
-      AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
-      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
-    else:
-      self._gpu_map(AMDDevice.signals_page)
-      self._gpu_map(AMDDevice.event_page)
+    if AMDDevice.signals_page is None:
+      AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
+      AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
+    else: self.dev_iface.map(AMDDevice.signals_page)
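The signals page is shared class-wide: a single 16 * 65536-byte host allocation carved into 16-byte strides, and the pool now stores plain addresses rather than the memoryviews the old KFD path kept. Slot count, with base as an assumed example address:

base, page_size, stride = 0x200000, 16 * 65536, 16   # base is illustrative only
pool = [base + off for off in range(0, page_size, stride)]
assert len(pool) == 65536 and pool[1] - pool[0] == 16   # 64K 16-byte signal slots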

-    # Scratch setup
-    max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
-    max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
-    self.max_private_segment_size = 4096
-    wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
-    self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
-    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+    self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
+    self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
     self.has_scratch_base_registers = self.target >= 110000
-    engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
-    self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))

     # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
     sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
     vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
-    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (max_cu_id + 1), mmap.PAGESIZE)
-    ctl_stack_size = round_up(12 * (max_cu_id + 1) * (max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
-    self.debug_memory_size = round_up((max_cu_id + 1) * (max_wave_id + 1) * 32, 64)
+    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
+    ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
+    debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)

-    self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
-                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
-    self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+    self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
+                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

-    self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
-    self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
+    self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)

     super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
                      AMDSignal, AMDComputeQueue, AMDCopyQueue)

-  def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None, ctl_stack_size=0) -> AMDQueueDesc:
-    gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-    ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-    cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE),
-                               kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
-    eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
-    queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
-      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
-      eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
-      ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size if cwsr_ctx else 0,
-      write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
+    # Scratch setup
+    self.max_private_segment_size = 0
+    self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

-    if not hasattr(self, 'doorbells'):
-      self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
-      self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+    atexit.register(self.device_fini)

-    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
-                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
-                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
+  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
+    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
+    gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
+    eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
+    return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
+                                       ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
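Both queue flavours reuse the same tiny GART page allocated above; the offsets below are the ones wired into setup_ring and AMDQueueDesc in this diff (how the rest of the 0x1000 page is used, if at all, is not shown here):

# gart buffer layout (0x1000 bytes, uncached, CPU-visible):
#   +0x00  read pointer  (u64) -> AMDQueueDesc.read_ptr, advanced as the GPU consumes the ring
#   +0x10  write pointer (u64) -> AMDQueueDesc.write_ptr, advanced by the host before the doorbell write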
+
+  def _ensure_has_local_memory(self, required):
+    if self.max_private_segment_size >= required: return
+
+    # <gfx103 requires alignment of 1024, >=gfx11 requires 256
+    wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)
+
+    self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
+    if ok:
+      engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
+      waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
+      # >=gfx11 wavesize is per SE
+      wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
+      self.tmpring_size = waves << 12 | wavesize
+      self.max_private_segment_size = required
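A worked example of the sizing above, using made-up but plausible properties (32 waves per CU, 48 CUs, 32 scratch slots per CU, 3 shader engines) on a gfx11 target; only the arithmetic is taken from the code, and the packed value is presumably what ends up in the dispatch's tmpring-size register:

required, waves_per_cu, cus, slots, engines = 128, 32, 48, 32, 3   # assumed example values
wave_scratch_len = (waves_per_cu * required + 255) // 256 * 256    # round_up(..., 256) on >=gfx11 -> 4096
scratch_size = cus * slots * wave_scratch_len                      # 6 MiB of scratch backing
waves = wave_scratch_len // 256                                    # 16
wavesize = scratch_size // (wave_scratch_len * engines)            # 512, per-SE on >=gfx11
tmpring_size = waves << 12 | wavesize                              # 0x10200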

   def invalidate_caches(self):
     AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
     self.timeline_value += 1
     self.synchronize()

-  def on_device_hang(self):
-    report = []
-
-    ev = (kfd.struct_kfd_event_data)(event_id=self.mem_fault_event.event_id)
-    kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
-    if ev.memory_exception_data.gpu_id:
-      pfstatus = ' '.join(f'{k[0]}={getattr(ev.memory_exception_data.failure, k[0])}' for k in ev.memory_exception_data.failure._fields_)
-      report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {pfstatus}"]
+  def on_device_hang(self): self.dev_iface.on_device_hang()

-    ev = (kfd.struct_kfd_event_data)(event_id=self.hw_fault_event.event_id)
-    kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
-    if ev.hw_exception_data.gpu_id:
-      report += [f"HW fault: {' '.join(f'{k[0]}={getattr(ev.hw_exception_data, k[0])}' for k in ev.hw_exception_data._fields_)}"]
-
-    raise RuntimeError("\n".join(report))
+  def device_fini(self):
+    self.synchronize()
+    if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()