tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py

@@ -1,28 +1,23 @@
 from __future__ import annotations
-from typing import Tuple, List, Any
-import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
+from typing import Any, cast
+import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
 assert sys.platform != 'win32'
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
-from tinygrad.device import BufferOptions
-from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
+from tinygrad.ops import sint
+from tinygrad.device import BufferSpec, CPUProgram
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
 from tinygrad.renderer.cstyle import AMDRenderer
-from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
+from tinygrad.runtime.autogen.am import am
 from tinygrad.runtime.support.compiler_hip import AMDCompiler
 from tinygrad.runtime.support.elf import elf_loader
+from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
 if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
-if getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
-
-def is_usable_gpu(gpu_id):
-  with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
-  return False

 regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107

-# VGT_EVENT_TYPE in navi10_enum.h
-CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
-
-WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

 COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
@@ -31,246 +26,224 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
 def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2

 class AMDSignal(HCQSignal):
-  def __init__(self, value=0, is_timeline=False):
-    self._signal = AMDDevice.signals_pool.pop()
-    self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
-    if is_timeline:
-      self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
-      self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
-      self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
-    else: self._event_mailbox_ptr = 0
-    super().__init__(value)
-  def __del__(self): AMDDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
-  def _set_value(self, new_value:int): self._signal[0] = new_value
-  def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
-    start_time = time.time() * 1000
-    while (time_spent:=time.time() * 1000 - start_time) < timeout:
-      if self._signal[0] >= value: return
-
-      # Wait active for 5s, then going to sleep.
-      if time_spent > 5000 and self._event_mailbox_ptr != 0:
-        kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
-    raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
-
-class AMDComputeQueue(HWComputeQueue):
-  def __init__(self):
-    self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
-    super().__init__()
+  def __init__(self, base_addr:int|None=None, **kwargs):
+    super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)

+  def __del__(self):
+    if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
+
+  def _sleep(self, time_spent_waiting_ms:int):
+    # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
+    if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
+
+class AMDComputeQueue(HWQueue):
   def __del__(self):
     if self.binded_device is not None:
-      self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True))
-
-  def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
-
-  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
-    cache_flush_flags = 0
-
-    if cache_flush:
-      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
-
-    # event_index__mec_release_mem__end_of_pipe = 5
-    # event_index__mec_release_mem__shader_done = 6
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
-               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
-               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
-               *data64_le(address), *data64_le(value), cst]
-
-  def _memory_barrier(self):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
-               amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
-               nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
-    self._acquire_mem()
-
-  def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
-    self._acquire_mem(gli=0, gl2=0)
-
-    cmd_idx = self._cur_cmd_idx()
-    user_regs = [*data64_le(prg.device.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
+      self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+
+  def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
+
+  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
+    wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
+                | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
+
+    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
+
+  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+    cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
+                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
+                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
+                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
+                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
+
+    self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
+
+  def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
+    cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
+                   | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
+                   | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
+
+    event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
+             | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
+
+    memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
+
+    self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
+
+  def memory_barrier(self):
+    self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
+    self.acquire_mem()
+    return self
+
+  def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+    self.bind_args_state(args_state)
+
+    self.acquire_mem(gli=0, gl2=0)
+
+    if prg.enable_private_segment_sgpr:
+      scratch_hilo = data64_le(prg.dev.scratch.va_addr)
+      # sgpr word1 bit31 enables swizzle
+      # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
+      user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000] if prg.enable_private_segment_sgpr else []
+    else: user_regs = []
     if prg.enable_dispatch_ptr:
       dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
-      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
-      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+
+      self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
+      self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
       dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
       user_regs += [*data64_le(dp_addr)]
-      self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
+
     user_regs += [*data64_le(args_state.ptr)]

-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
-    if prg.device.has_scratch_base_registers:
-      self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
-                 gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.device.scratch.va_addr >> 8)]
-    if prg.device.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
-
-    self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
-
-    self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
-
-  def _update_exec(self, cmd_idx, global_size, local_size):
-    if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
-    if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)
-
-    if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
-      if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
-      if global_size is not None:
-        dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
-
-  def _wait(self, signal, value=0):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
-               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
-
-  def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
-
-  def _signal(self, signal, value=0):
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
+    if prg.dev.has_scratch_base_registers:
+      self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
+    if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
+
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
+
+    self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
+    self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
+    return self
+
+  def wait(self, signal:AMDSignal, value:sint=0):
+    self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
+    return self
+
+  def timestamp(self, signal:AMDSignal):
+    self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
+    return self
+
+  def signal(self, signal:AMDSignal, value:sint=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
-    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
-    if signal._event_mailbox_ptr != 0:
-      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
-                        value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)
-
-  def _update_wait(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
-    if value is not None: self._patch(cmd_idx, offset=4, data=[value])
-
-  def _update_signal(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
-    if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
-
-    # Check if the signal command has mailptr part
-    if signal is not None and self.cmds_len[cmd_idx] > 8:
-      self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
-
-  def bind(self, device):
-    self.binded_device = device
-    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
+    self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                     amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
+
+    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+      self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                       amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
+    return self
+
+  def bind(self, dev:AMDDevice):
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
-    for i, value in enumerate(self.q): hw_view[i] = value
+    for i, value in enumerate(self._q): hw_view[i] = value

     self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
-                         len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
-    self.q = hw_view # type: ignore
+                         len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
+    self._q = hw_view
+    return self

-  def _submit(self, device):
-    cmds = self.indirect_cmd if device == self.binded_device else self.q
+  def _submit(self, dev:AMDDevice):
+    cmds = self.indirect_cmd if dev == self.binded_device else self._q

-    for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
+    for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value

-    device.compute_queue.put_value += len(cmds)
-    device.compute_queue.write_ptr[0] = device.compute_queue.put_value
-    device.compute_queue.doorbell[0] = device.compute_queue.put_value
+    dev.compute_queue.put_value += len(cmds)
+    dev.compute_queue.signal_doorbell()

-SDMA_MAX_COPY_SIZE = 0x400000
-class AMDCopyQueue(HWCopyQueue):
-  def __init__(self):
-    self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
+class AMDCopyQueue(HWQueue):
+  def __init__(self, max_copy_size=0x40000000):
+    self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
     super().__init__()

-  def _q(self, arr):
-    self.q += arr
+  def q(self, *arr):
+    super().q(*arr)
     self.internal_cmd_sizes.append(len(arr))

-  def _copy(self, dest, src, copy_size):
-    copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
-    self.copy_cmds_per_copy[len(self) - 1] = copy_commands
+  def copy(self, dest:sint, src:sint, copy_size:int):
+    copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
+
     for _ in range(copy_commands):
-      step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
+      step_copy_size = min(copy_size - copied, self.max_copy_size)

-      self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
-               amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
+      self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+             amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))

       copied += step_copy_size
+    return self

-  def _update_copy(self, cmd_idx, dest=None, src=None):
-    for i in range(self.copy_cmds_per_copy[cmd_idx]):
-      if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
-      if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
+  def signal(self, signal:AMDSignal, value:sint=0):
+    self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)

-  def _signal(self, signal, value=0):
-    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
+    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+      self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
+      self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
+    elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))

-    if signal._event_mailbox_ptr != 0:
-      self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
-      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
+    return self

-  def _wait(self, signal, value=0):
-    self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
-             amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
-             amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
+  def wait(self, signal:AMDSignal, value:sint=0):
+    self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+           amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
+           amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
+    return self

-  def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
-  def _update_wait(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
-    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+  def timestamp(self, signal:AMDSignal):
+    self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+           *data64_le(signal.timestamp_addr))
+    return self

-  def _timestamp(self, signal):
-    self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
-             *data64_le(signal._timestamp_addr)])
+  def bind(self, dev:AMDDevice):
+    if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return

-  def _submit(self, device):
-    if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+    for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0

-    tail_blit_dword = 0
-    for cmdsz in self.internal_cmd_sizes:
-      if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
-      tail_blit_dword += cmdsz
+    self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
+    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]

-    start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
-    device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
-    device.sdma_queue.put_value += tail_blit_dword * 4
+  def _submit(self, dev:AMDDevice):
+    if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

-    if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
-      zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
-      ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
-      device.sdma_queue.put_value += zero_fill
+    if self.binded_device == dev:
+      # An IB packet must end on a 8 DW boundary.
+      add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
+      cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]

-      device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
-      device.sdma_queue.put_value += rem_packet_cnt * 4
+      if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
+        cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
+    else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes

-    device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
-    device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
+    tail_blit_dword = 0
+    for cmdsz in cmd_sizes:
+      if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
+      tail_blit_dword += cmdsz

-class AMDArgsState(HCQArgsState):
-  def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
-    super().__init__(ptr, prg, bufs, vals=vals)
+    start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
+    dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
+    dev.sdma_queue.put_value += tail_blit_dword * 4

-    self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
-    self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
+    if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
+      zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
+      ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
+      dev.sdma_queue.put_value += zero_fill

-    self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
-    self.vals[:] = array.array('I', vals)
+      dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
+      dev.sdma_queue.put_value += rem_packet_cnt * 4

-  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
-  def update_var(self, index:int, val:int): self.vals[index] = val
+    dev.sdma_queue.signal_doorbell()

 class AMDProgram(HCQProgram):
-  def __init__(self, device:AMDDevice, name:str, lib:bytes):
+  def __init__(self, dev:AMDDevice, name:str, lib:bytes):
     # TODO; this API needs the type signature of the function and global_size/local_size
-    self.device, self.name, self.lib = device, name, lib
+    self.dev: AMDDevice = dev
+    self.name, self.lib = name, lib
     image, sections, _ = elf_loader(self.lib)
-    self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000), BufferOptions(cpu_access=True, nolru=True))
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

     entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
@@ -279,43 +252,41 @@ class AMDProgram(HCQProgram):
     self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]

     lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
-    if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
-    if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
+    if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
+
+    # Ensure scratch size
+    self.dev._ensure_has_local_memory(self.private_segment_size)

     code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
     assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32

     # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
-    self.rsrc1 = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.device.target < 120000 else 0)
-    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-    self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+    self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
+    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
+    self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset

     # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
     # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
-    self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
-    self.enable_private_segment_sgpr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
+    self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+    self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
     additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

-    super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
+    super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)

   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True, nolru=True))
+    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))

-class AMDAllocator(HCQAllocator):
-  def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
+class AMDAllocator(HCQAllocator['AMDDevice']):
+  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)

-  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
-    if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
-    if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-    return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+  def _free(self, opaque, options:BufferSpec):
+    self.dev.synchronize()
+    self.dev.dev_iface.free(opaque)

-  def _free(self, opaque, options:BufferOptions):
-    self.device.synchronize()
-    self.device._gpu_free(opaque)
+  def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)

-  def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
-
-MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000

 @dataclass
 class AMDQueueDesc:
@@ -325,147 +296,340 @@ class AMDQueueDesc:
325
296
  doorbell: memoryview
326
297
  put_value: int = 0
327
298
 
328
- class AMDDevice(HCQCompiled):
329
- kfd:int = -1
330
- event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
331
- signals_page:Any = None
332
- signals_pool:List[memoryview] = []
333
- gpus:List[pathlib.Path] = []
334
-
335
- def _gpu_map(self, mem):
336
- if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
337
- mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
338
- c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
339
- stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
340
- n_devices=len(mem.mapped_gpu_ids))
341
- assert stm.n_success == len(mem.mapped_gpu_ids)
342
-
343
- def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
344
- flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
345
- if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
346
- if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
299
+ def signal_doorbell(self):
300
+ self.write_ptr[0] = self.put_value
301
+
302
+ # Ensure all prior writes are visible to the GPU.
303
+ if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
304
+ self.doorbell[0] = self.put_value
305
+
306
+ class KFDIface:
307
+ kfd:HWInterface|None = None
308
+ event_page:HCQBuffer|None = None
309
+ gpus:list[HWInterface] = []
310
+
311
+ def _is_usable_gpu(self, gpu_id):
312
+ with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
313
+ return False
314
+
315
+ def __init__(self, dev, device_id):
316
+ self.dev = dev
317
+
318
+ kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"
319
+
320
+ # Initialize KFD interface during first run
321
+ if KFDIface.kfd is None:
322
+ KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
323
+ gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
324
+ gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
325
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
326
+ KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
327
+
328
+ if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
329
+
330
+ self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
331
+ self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
332
+ self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
333
+
334
+ kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
335
+
336
+ # Set these for our device.
337
+ if KFDIface.event_page is None:
338
+ KFDIface.event_page = self.alloc(0x8000, uncached=True)
339
+ kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_page_offset=KFDIface.event_page.meta.handle)
340
+ else: self.map(KFDIface.event_page)
341
+
342
+ # Event to wait for queues completion
343
+ self.dev.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
344
+ self.dev.queue_event_mailbox_ptr = KFDIface.event_page.va_addr + self.dev.queue_event.event_slot_index * 8
345
+ self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.dev.queue_event.event_id)
346
+ self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)
347
+
348
+ # OS events to collect memory and hardware faults
349
+ self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
350
+ self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
351
+
352
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
353
+ flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
354
+
355
+ if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
356
+ else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
357
+
358
+ if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
359
+
347
360
  if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
348
- buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
349
- else:
350
- buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
361
+ buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
362
+ else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
351
363
  assert addr != 0xffffffffffffffff
352
364
 
353
365
  try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
354
366
  flags=flags, mmap_offset=buf)
355
367
  except OSError as e:
356
- if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
368
+ if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
357
369
  raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
358
370
  if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
359
371
  raise
360
372
 
361
373
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
362
- buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
374
+ buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
363
375
  assert addr == buf == mem.va_addr
364
- if map_to_gpu: self._gpu_map(mem)
365
- return mem
366
376
 
367
- def _gpu_free(self, mem):
368
- if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
377
+ self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
378
+ return hcqbuf
379
+
380
+ def free(self, mem):
381
+ if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
369
382
  c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
370
- stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
383
+ stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
371
384
  assert stm.n_success == len(gpus)
372
- libc.munmap(mem.va_addr, mem.size)
373
- kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
385
+ if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
386
+ kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
387
+
388
+ def map(self, mem):
389
+ if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
390
+ mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
391
+ c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
392
+ stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
393
+ n_devices=len(mem.meta.mapped_gpu_ids))
394
+ assert stm.n_success == len(mem.meta.mapped_gpu_ids)
395
+
396
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
397
+ cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
398
+ queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
399
+ queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
400
+ eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
401
+ ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
402
+ write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
374
403
 
375
- def __init__(self, device:str=""):
376
- if AMDDevice.kfd == -1:
377
- AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
378
- gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
379
- gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
404
+ if not hasattr(self, 'doorbells'):
405
+ self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
406
+ self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
407
+
408
+ return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
409
+ read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
410
+ doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
411
+
412
+ def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
413
+
414
+ def on_device_hang(self):
415
+ def _collect_str(st): return ' '.join(f'{k[0]}={getattr(st, k[0])}' for k in st._fields_)
416
+
417
+ report = []
418
+ for evnt in [self.mem_fault_event, self.hw_fault_event]:
419
+ ev = (kfd.struct_kfd_event_data)(event_id=evnt.event_id)
420
+ kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
421
+ if evnt == self.mem_fault_event and ev.memory_exception_data.gpu_id:
422
+ report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {_collect_str(ev.memory_exception_data.failure)}"]
423
+ if evnt == self.hw_fault_event and ev.hw_exception_data.gpu_id: report += [f"HW fault: {_collect_str(ev.hw_exception_data)}"]
424
+
425
+ raise RuntimeError("\n".join(report))
426
+
427
+ @dataclass
428
+ class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
429
+
430
+ class PCIIface:
431
+ supported_devs:list[int] = [0x744c, 0x7480]
432
+ vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
433
+ vfio_fd:HWInterface
434
+ gpus:list[Any] = []
435
+
436
+ def __init__(self, dev, dev_id):
437
+ self.dev = dev
438
+
439
+ if first_dev:=len(PCIIface.gpus) == 0:
440
+ for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
441
+ vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
442
+ device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
443
+ if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
444
+
445
+ # TODO: visible_devices should be handled layer above this?
380
446
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
381
- AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
447
+ PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
448
+
449
+ self.pcibus = PCIIface.gpus[dev_id]
450
+
451
+ # Unbind the device from the kernel driver
452
+ if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
453
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
454
+
455
+ supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
456
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
457
+
458
+ # Try to init vfio. Use it if success.
459
+ if PCIIface.vfio:
460
+ try:
461
+ if first_dev:
462
+ HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
463
+ PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
464
+ vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
465
+
466
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
467
+ HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
468
+
469
+ iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
470
+ except OSError:
471
+ if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
472
+ PCIIface.vfio = False
473
+
474
+ # Init vfio for the device
475
+ if PCIIface.vfio:
476
+ self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
477
+ vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
478
+
479
+ if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
480
+ self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
481
+
482
+ self.irq_fd = HWInterface.eventfd(0, 0)
483
+ self.irq_poller = select.poll()
484
+ self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
485
+
486
+ irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
487
+ argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
488
+ vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
489
+ else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
490
+
491
+ self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
492
+ self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
493
+ self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
494
+
495
+ bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
496
+ self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
497
+
498
+ self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
499
+ self.doorbell_cpu_addr = mv_address(dbell)
500
+
501
+ pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
502
+ self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
503
+
504
+ array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
505
+ simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
506
+ self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': self.adev.ip_versions[am.GC_HWIP],
507
+ 'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
508
+ 'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
509
+
510
+ def _map_pci_range(self, bar, off=0, addr=0, size=None):
511
+ fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
512
+ libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
513
+ return to_mv(loc, sz)
+
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
+ if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
+ vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
+ va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
+
+ # Read pagemap to get the physical address of each page. The pages are locked.
+ self.pagemap.seek(va // mmap.PAGESIZE * 8)
+ paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
+ am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
+ return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+
+ am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
+ if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
+ return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
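The host path above leans on the Linux pagemap interface: /proc/self/pagemap holds one 64-bit entry per virtual page, and bits 0-54 carry the page frame number for present pages, which is why the code masks with (1<<55)-1. A self-contained sketch of the same lookup (pages must be resident, and recent kernels report zero PFNs without CAP_SYS_ADMIN):

import mmap, struct

def virt_to_phys(vaddr: int) -> int:
  # one 8-byte pagemap entry per virtual page
  with open("/proc/self/pagemap", "rb") as pm:
    pm.seek(vaddr // mmap.PAGESIZE * 8)
    entry = struct.unpack("<Q", pm.read(8))[0]
  assert entry & (1 << 63), "page not present"   # bit 63: page present
  pfn = entry & ((1 << 55) - 1)                  # bits 0-54: page frame number
  return pfn * mmap.PAGESIZE + (vaddr % mmap.PAGESIZE)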
529
+
530
+ def free(self, mem):
531
+ for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
532
+ if not mem.meta.mapping.system: self.adev.mm.vfree(mem.meta.mapping)
533
+
534
+ def map(self, mem):
535
+ # Check if the memory is already mapped on this device
536
+ if self.dev in mem.meta.mapped_devs: return
537
+ mem.meta.mapped_devs.append(self.dev)
538
+
539
+ paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
540
+ self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
541
+
542
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
543
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
544
+ self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
545
+ doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
546
+ else:
547
+ self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
548
+ eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
382
549
 
383
- self.device_id = int(device.split(":")[1]) if ":" in device else 0
384
- if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
550
+ return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
551
+ read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
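The AMDQueueDesc returned here bundles the ring memory, the read/write pointers in the GART page, and the queue's 64-bit doorbell slot (doorbell_index * 8 bytes into the mapped doorbell BAR). As a rough sketch of how such a descriptor is typically driven, not tinygrad's actual submission code: packets are copied into the ring at the write pointer, then the updated pointer is published through the doorbell so the engine fetches them.

# Rough sketch only; tinygrad's real submission path lives in its HCQ queue classes.
def kick(q, packets: list[int]):
  for dw in packets:
    q.ring[q.write_ptr[0] % len(q.ring)] = dw  # ring is a memoryview of 32-bit dwords
    q.write_ptr[0] += 1
  q.doorbell[0] = q.write_ptr[0]               # 64-bit doorbell write tells the engine new work is ready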
+
+ def sleep(self, timeout):
+ if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
+ self.irq_fd.read(8 * events_cnt)
+ self.adev.ih.interrupt_handler()
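sleep() follows the standard VFIO eventfd pattern: MSI interrupts bump an eventfd counter, so a wait is poll() plus an 8-byte read that drains the counter before the interrupt handler runs. A self-contained sketch of that pattern, assuming irq_fd is an eventfd registered with VFIO as in the setup above:

import os, select

def wait_for_irq(irq_fd: int, timeout_ms: int) -> bool:
  poller = select.poll()
  poller.register(irq_fd, select.POLLIN)
  if not poller.poll(timeout_ms): return False  # timed out, no interrupt fired
  os.read(irq_fd, 8)                            # eventfd reads are always 8 bytes (a u64 counter)
  return True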
+
+ def on_device_hang(self):
+ for d in self.dev.devices: d.dev_iface.adev.gmc.on_interrupt()
+ raise RuntimeError("Device hang detected")
+
+ def device_fini(self): self.adev.fini()
 
- with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
- with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
- self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
- self.target = int(self.properties['gfx_target_version'])
+ class AMDDevice(HCQCompiled):
+ driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
+ signals_page:Any = None
+ signals_pool:list[int] = []
+
+ def __init__(self, device:str=""):
+ self.device_id = int(device.split(":")[1]) if ":" in device else 0
+ self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
+ self.target = int(self.dev_iface.props['gfx_target_version'])
  self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
  if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
 
- kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+ if AMDDevice.signals_page is None:
+ AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
+ AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
+ else: self.dev_iface.map(AMDDevice.signals_page)
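The arch string a few lines above is decoded from gfx_target_version, which packs major*10000 + minor*100 + stepping, with minor and stepping rendered in hex. A quick check of that formula as written:

def arch_from_target(target: int) -> str:
  # same format string as in AMDDevice.__init__ above
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert arch_from_target(110000) == "gfx1100"
assert arch_from_target(100301) == "gfx1031"  # minor=3, stepping=1
assert arch_from_target(110002) == "gfx1102"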
 
- if AMDDevice.event_page is None:
- AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
- kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
- else:
- self._gpu_map(AMDDevice.signals_page)
- self._gpu_map(AMDDevice.event_page)
-
- # Scratch setup
- max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
- max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
- self.max_private_segment_size = 4096
- wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
- self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
- self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+ self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
+ self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
  self.has_scratch_base_registers = self.target >= 110000
- engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
- self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
 
  # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
  sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
  vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
- wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (max_cu_id + 1), mmap.PAGESIZE)
- ctl_stack_size = round_up(12 * (max_cu_id + 1) * (max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
- self.debug_memory_size = round_up((max_cu_id + 1) * (max_wave_id + 1) * 32, 64)
+ wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
+ ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
+ debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
 
- self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
- eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
- self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+ self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
+ eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
 
- self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
- self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
+ self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
 
  super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
  AMDSignal, AMDComputeQueue, AMDCopyQueue)
 
- def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None, ctl_stack_size=0) -> AMDQueueDesc:
- gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
- cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE),
- kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
- eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
- queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
- queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
- eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
- ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size if cwsr_ctx else 0,
- write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
-
- if not hasattr(self, 'doorbells'):
- self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
- self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
-
- return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
- read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
- doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
+ # Scratch setup
+ self.max_private_segment_size = 0
+ self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
+
+ def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
+ ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
+ gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
+ eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
+ return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
+ ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
+
+ def _ensure_has_local_memory(self, required):
+ if self.max_private_segment_size >= required: return
+
+ # <gfx103 requires alignment of 1024, >=gfx11 requires 256
+ wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)
+
+ self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
+ if ok:
+ engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
+ waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
+ # >=gfx11 wavesize is per SE
+ wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
+ self.tmpring_size = waves << 12 | wavesize
+ self.max_private_segment_size = required
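For a concrete feel of the packing at the end of _ensure_has_local_memory, here is the same arithmetic run with hypothetical gfx11-style numbers (all values made up for illustration, so the 256-byte granule and per-SE divisor apply):

# Hypothetical inputs, for illustration only.
max_wave_id, required = 31, 256                 # 32 wave slots, 256 bytes of scratch requested per thread
wave_scratch_len = 32 * 256                     # round_up(32 * 256, 256) == 8192
max_cu_id, max_slots_scratch_cu, engines = 39, 32, 2
scratch_size = (max_cu_id + 1) * max_slots_scratch_cu * wave_scratch_len  # 10,485,760 bytes

waves = wave_scratch_len // 256                           # 32
wavesize = scratch_size // (wave_scratch_len * engines)   # 640 (per SE on >=gfx11)
tmpring_size = waves << 12 | wavesize                     # 0x20280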
 
  def invalidate_caches(self):
  AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
  self.timeline_value += 1
  self.synchronize()
 
- def on_device_hang(self):
- report = []
-
- ev = (kfd.struct_kfd_event_data)(event_id=self.mem_fault_event.event_id)
- kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
- if ev.memory_exception_data.gpu_id:
- pfstatus = ' '.join(f'{k[0]}={getattr(ev.memory_exception_data.failure, k[0])}' for k in ev.memory_exception_data.failure._fields_)
- report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {pfstatus}"]
+ def on_device_hang(self): self.dev_iface.on_device_hang()
 
- ev = (kfd.struct_kfd_event_data)(event_id=self.hw_fault_event.event_id)
- kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
- if ev.hw_exception_data.gpu_id:
- report += [f"HW fault: {' '.join(f'{k[0]}={getattr(ev.hw_exception_data, k[0])}' for k in ev.hw_exception_data._fields_)}"]
-
- raise RuntimeError("\n".join(report))
+ def finalize(self):
+ self.synchronize()
+ if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()