tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. tinygrad/__init__.py +1 -1
  2. tinygrad/apps/llm.py +206 -0
  3. tinygrad/codegen/__init__.py +116 -0
  4. tinygrad/codegen/devectorizer.py +315 -172
  5. tinygrad/codegen/expander.py +8 -16
  6. tinygrad/codegen/gpudims.py +89 -0
  7. tinygrad/codegen/linearize.py +205 -203
  8. tinygrad/codegen/lowerer.py +92 -139
  9. tinygrad/codegen/opt/__init__.py +38 -0
  10. tinygrad/codegen/opt/heuristic.py +125 -0
  11. tinygrad/codegen/opt/kernel.py +510 -0
  12. tinygrad/{engine → codegen/opt}/search.py +51 -35
  13. tinygrad/codegen/opt/swizzler.py +134 -0
  14. tinygrad/codegen/opt/tc.py +127 -0
  15. tinygrad/codegen/quantize.py +67 -0
  16. tinygrad/device.py +122 -132
  17. tinygrad/dtype.py +152 -35
  18. tinygrad/engine/jit.py +81 -54
  19. tinygrad/engine/memory.py +46 -27
  20. tinygrad/engine/realize.py +82 -41
  21. tinygrad/engine/schedule.py +70 -445
  22. tinygrad/frontend/__init__.py +0 -0
  23. tinygrad/frontend/onnx.py +1253 -0
  24. tinygrad/frontend/torch.py +5 -0
  25. tinygrad/gradient.py +19 -27
  26. tinygrad/helpers.py +95 -47
  27. tinygrad/nn/__init__.py +7 -8
  28. tinygrad/nn/optim.py +72 -41
  29. tinygrad/nn/state.py +37 -23
  30. tinygrad/renderer/__init__.py +40 -60
  31. tinygrad/renderer/cstyle.py +143 -128
  32. tinygrad/renderer/llvmir.py +113 -62
  33. tinygrad/renderer/ptx.py +50 -32
  34. tinygrad/renderer/wgsl.py +27 -23
  35. tinygrad/runtime/autogen/am/am.py +5861 -0
  36. tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  37. tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  38. tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  39. tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  40. tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  41. tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  42. tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  43. tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  44. tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
  45. tinygrad/runtime/autogen/comgr.py +35 -9
  46. tinygrad/runtime/autogen/comgr_3.py +906 -0
  47. tinygrad/runtime/autogen/cuda.py +2419 -494
  48. tinygrad/runtime/autogen/hsa.py +57 -16
  49. tinygrad/runtime/autogen/ib.py +7171 -0
  50. tinygrad/runtime/autogen/io_uring.py +917 -118
  51. tinygrad/runtime/autogen/kfd.py +748 -26
  52. tinygrad/runtime/autogen/libc.py +613 -218
  53. tinygrad/runtime/autogen/libusb.py +1643 -0
  54. tinygrad/runtime/autogen/nv/nv.py +8602 -0
  55. tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  56. tinygrad/runtime/autogen/opencl.py +2 -4
  57. tinygrad/runtime/autogen/sqtt.py +1789 -0
  58. tinygrad/runtime/autogen/vfio.py +3 -3
  59. tinygrad/runtime/autogen/webgpu.py +273 -264
  60. tinygrad/runtime/graph/cuda.py +3 -3
  61. tinygrad/runtime/graph/hcq.py +68 -29
  62. tinygrad/runtime/graph/metal.py +29 -13
  63. tinygrad/runtime/graph/remote.py +114 -0
  64. tinygrad/runtime/ops_amd.py +537 -320
  65. tinygrad/runtime/ops_cpu.py +108 -7
  66. tinygrad/runtime/ops_cuda.py +12 -14
  67. tinygrad/runtime/ops_disk.py +13 -10
  68. tinygrad/runtime/ops_dsp.py +47 -40
  69. tinygrad/runtime/ops_gpu.py +13 -11
  70. tinygrad/runtime/ops_hip.py +6 -9
  71. tinygrad/runtime/ops_llvm.py +35 -15
  72. tinygrad/runtime/ops_metal.py +29 -19
  73. tinygrad/runtime/ops_npy.py +5 -3
  74. tinygrad/runtime/ops_null.py +28 -0
  75. tinygrad/runtime/ops_nv.py +306 -234
  76. tinygrad/runtime/ops_python.py +62 -52
  77. tinygrad/runtime/ops_qcom.py +28 -39
  78. tinygrad/runtime/ops_remote.py +482 -0
  79. tinygrad/runtime/ops_webgpu.py +28 -28
  80. tinygrad/runtime/support/am/amdev.py +114 -249
  81. tinygrad/runtime/support/am/ip.py +211 -172
  82. tinygrad/runtime/support/amd.py +138 -0
  83. tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
  84. tinygrad/runtime/support/compiler_cuda.py +8 -11
  85. tinygrad/runtime/support/elf.py +2 -1
  86. tinygrad/runtime/support/hcq.py +184 -97
  87. tinygrad/runtime/support/ib.py +172 -0
  88. tinygrad/runtime/support/llvm.py +3 -4
  89. tinygrad/runtime/support/memory.py +251 -0
  90. tinygrad/runtime/support/nv/__init__.py +0 -0
  91. tinygrad/runtime/support/nv/ip.py +581 -0
  92. tinygrad/runtime/support/nv/nvdev.py +183 -0
  93. tinygrad/runtime/support/system.py +170 -0
  94. tinygrad/runtime/support/usb.py +268 -0
  95. tinygrad/runtime/support/webgpu.py +18 -0
  96. tinygrad/schedule/__init__.py +0 -0
  97. tinygrad/schedule/grouper.py +119 -0
  98. tinygrad/schedule/kernelize.py +368 -0
  99. tinygrad/schedule/multi.py +231 -0
  100. tinygrad/shape/shapetracker.py +40 -46
  101. tinygrad/shape/view.py +88 -52
  102. tinygrad/tensor.py +968 -542
  103. tinygrad/uop/__init__.py +117 -0
  104. tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
  105. tinygrad/uop/mathtraits.py +169 -0
  106. tinygrad/uop/ops.py +1021 -0
  107. tinygrad/uop/spec.py +228 -0
  108. tinygrad/{codegen → uop}/symbolic.py +239 -216
  109. tinygrad/uop/upat.py +163 -0
  110. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  111. tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  112. tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  113. tinygrad/viz/index.html +203 -403
  114. tinygrad/viz/js/index.js +718 -0
  115. tinygrad/viz/js/worker.js +29 -0
  116. tinygrad/viz/serve.py +224 -102
  117. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
  118. tinygrad-0.11.0.dist-info/RECORD +141 -0
  119. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
  120. tinygrad/codegen/kernel.py +0 -693
  121. tinygrad/engine/multi.py +0 -161
  122. tinygrad/ops.py +0 -1003
  123. tinygrad/runtime/ops_cloud.py +0 -220
  124. tinygrad/runtime/support/allocator.py +0 -94
  125. tinygrad/spec.py +0 -155
  126. tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  127. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  128. tinygrad/viz/perfetto.html +0 -178
  129. tinygrad-0.10.2.dist-info/RECORD +0 -99
  130. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
  131. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
@@ -1,161 +1,337 @@
1
1
  from __future__ import annotations
2
- from typing import Any, cast
3
- import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
2
+ from typing import cast, ClassVar
3
+ import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, weakref
4
4
  assert sys.platform != 'win32'
5
5
  from dataclasses import dataclass
6
- from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
7
- from tinygrad.ops import sint
8
- from tinygrad.device import BufferSpec, CPUProgram
9
- from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
6
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
7
+ from tinygrad.runtime.support.hcq import MMIOInterface
8
+ from tinygrad.uop.ops import sint
9
+ from tinygrad.device import Compiled, DMAFdRef, BufferSpec
10
+ from tinygrad.helpers import getenv, to_mv, round_up, data64_le, all_same, flatten, DEBUG, AMD_LLVM, PROFILE, ProfileEvent, suppress_finalizing
10
11
  from tinygrad.renderer.cstyle import AMDRenderer
11
- from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
12
+ from tinygrad.renderer.llvmir import AMDLLVMRenderer
13
+ from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
12
14
  from tinygrad.runtime.autogen.am import am
13
- from tinygrad.runtime.support.compiler_hip import AMDCompiler
15
+ from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
14
16
  from tinygrad.runtime.support.elf import elf_loader
15
- from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
17
+ from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager
18
+ from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, setup_pci_bars
19
+ from tinygrad.runtime.support.system import System, PCIIfaceBase, PCIAllocationMeta, MAP_FIXED, MAP_NORESERVE
20
+ from tinygrad.runtime.support.usb import ASM24Controller, USBMMIOInterface
16
21
  if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
17
22
 
18
- regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
19
-
20
23
  EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
24
+ WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
25
+ WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
21
26
  WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
22
27
 
23
- COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
24
-
25
- def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
26
- def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
27
-
28
28
  class AMDSignal(HCQSignal):
29
- def __init__(self, base_addr:int|None=None, **kwargs):
30
- super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
31
-
32
- def __del__(self):
33
- if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
29
+ def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
34
30
 
35
31
  def _sleep(self, time_spent_waiting_ms:int):
36
32
  # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
37
- if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
33
+ if time_spent_waiting_ms > 2000 and self.is_timeline and self.owner is not None: self.owner.iface.sleep(200)
38
34
 
39
35
  class AMDComputeQueue(HWQueue):
36
+ def __init__(self, dev:AMDDevice):
37
+ self.dev, self.soc, self.pm4, self.gc, self.nbio = dev, dev.soc, dev.pm4, dev.gc, dev.nbio
38
+ super().__init__()
39
+
40
40
  def __del__(self):
41
41
  if self.binded_device is not None:
42
42
  self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
43
43
 
44
- def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
44
+ def pkt3(self, cmd, *vals): self.q(self.pm4.PACKET3(cmd, len(vals) - 1), *vals)
45
+
46
+ def wreg(self, reg:AMDReg, *args:sint, **kwargs:int):
47
+ if bool(args) == bool(kwargs): raise RuntimeError('One (and only one) of *args or **kwargs must be specified')
48
+ if self.pm4.PACKET3_SET_SH_REG_START <= reg.addr[0] < self.pm4.PACKET3_SET_SH_REG_END:
49
+ set_packet, set_packet_start = self.pm4.PACKET3_SET_SH_REG, self.pm4.PACKET3_SET_SH_REG_START
50
+ elif self.pm4.PACKET3_SET_UCONFIG_REG_START <= reg.addr[0] < self.pm4.PACKET3_SET_UCONFIG_REG_START + 2**16-1:
51
+ set_packet, set_packet_start = self.pm4.PACKET3_SET_UCONFIG_REG, self.pm4.PACKET3_SET_UCONFIG_REG_START
52
+ else: raise RuntimeError(f'Cannot set {reg.name} ({reg.addr[0]}) via pm4 packet')
53
+ self.pkt3(set_packet, reg.addr[0] - set_packet_start, *(args or (reg.encode(**kwargs),)))
54
+
55
+ @contextlib.contextmanager
56
+ def pred_exec(self, xcc_mask:int):
57
+ if self.dev.xccs > 1:
58
+ self.pkt3(self.pm4.PACKET3_PRED_EXEC, xcc_mask << 24)
59
+ prev_len = len(self._q)
60
+ yield
61
+ if self.dev.xccs > 1:
62
+ self._q[prev_len-1] |= (len(self._q) - prev_len)
45
63
 
46
64
  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
47
- wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
48
- | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
65
+ wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None)) \
66
+ | self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | self.pm4.WAIT_REG_MEM_ENGINE(0)
49
67
 
50
- self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
68
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
51
69
 
52
70
  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
53
- cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
54
- | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
55
- | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
56
- | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
57
- | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
71
+ if self.dev.target >= (10,0,0):
72
+ cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
73
+ | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
74
+ | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
75
+ | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
76
+ | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
77
+
78
+ self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
79
+ else:
80
+ cp_coher_cntl = self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(gli) | \
81
+ self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(glk) | \
82
+ self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(gl2) | \
83
+ self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(gl1) | \
84
+ self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(gl2)
85
+ self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, cp_coher_cntl, *data64_le(sz), *data64_le(addr), 0x0000000A)
86
+
87
+ def release_mem(self, address=0x0, value=0, data_sel=0, int_sel=2, ctxid=0, cache_flush=False):
88
+ if self.dev.target >= (10,0,0):
89
+ cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
90
+ | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
91
+ | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
92
+
93
+ event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
94
+ | self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
95
+
96
+ memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
97
+ | self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
98
+ else:
99
+ cache_flags_dw = 0 if not cache_flush else (self.pm4.EOP_TC_WB_ACTION_EN | self.pm4.EOP_TC_NC_ACTION_EN)
58
100
 
59
- self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
101
+ event_dw = self.pm4.EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) | self.pm4.EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
60
102
 
61
- def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
62
- cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
63
- | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
64
- | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
103
+ memsel_dw = self.pm4.DATA_SEL(data_sel) | self.pm4.INT_SEL(int_sel)
65
104
 
66
- event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
67
- | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
105
+ ctxid = 0
68
106
 
69
- memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
107
+ self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
70
108
 
71
- self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
109
+ def xcc_barrier(self):
110
+ if self.dev.xcc_sync is None: return self
111
+ assert self.dev.xccs == 8, 'only 8 XCCs supported'
112
+ a, b = self.dev.xcc_sync
113
+ mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
114
+ self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 0x10) # a += 1
115
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 0x80) # a == 0 (mod 8) via bitmask
116
+ self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 0x10) # b += 1
117
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 0x80) # b == 0 (mod 8) via bitmask
118
+ return self
72
119
 
73
120
  def memory_barrier(self):
74
- self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
121
+ pf = '' if self.nbio.version[0] == 2 else '0' if self.nbio.version[:2] != (7, 11) else '1'
122
+ self.wait_reg_mem(reg_req=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_REQ').addr[0],
123
+ reg_done=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_DONE').addr[0], value=0xffffffff)
75
124
  self.acquire_mem()
76
125
  return self
77
126
 
127
+ def xcc_config(self):
128
+ self.wreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE, 1)
129
+ for xcc_id in range(self.dev.xccs):
130
+ with self.pred_exec(xcc_mask=1 << xcc_id):
131
+ self.wreg(self.dev.regCOMPUTE_CURRENT_LOGIC_XCC_ID, xcc_id)
132
+ return self
133
+
134
+ def spi_config(self, tracing:bool):
135
+ self.wreg(self.gc.regSPI_CONFIG_CNTL, ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
136
+ enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
137
+
138
+ ### SQTT ###
139
+
140
+ def sqtt_userdata(self, data, *extra_dwords):
141
+ data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
142
+ for i in range(0, len(data_ints), 2):
143
+ self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2])
144
+
145
+ def sqtt_config(self, tracing:bool):
146
+ self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
147
+ rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
148
+
149
+ # Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
150
+ def sqtt_start(self, buf0s:list[HCQBuffer], se_mask:int):
151
+ self.memory_barrier()
152
+ self.spi_config(tracing=True)
153
+ # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
154
+ for se in range(len(buf0s)):
155
+ self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
156
+ buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
157
+ self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size>>12)
158
+ self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
159
+ # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
160
+ # For RGP to display instruction trace it has to see it on first SE. Howerver ACE/MEC/whatever does the dispatching starting with second se,
161
+ # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
162
+ # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
163
+ # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
164
+ # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
165
+ self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)
166
+ REG_INCLUDE = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
167
+ self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
168
+ TOKEN_EXCLUDE = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
169
+ if not (se_mask >> se) & 0b1:
170
+ TOKEN_EXCLUDE |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
171
+ 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
172
+ 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
173
+ self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1)
174
+ # Enable SQTT
175
+ self.sqtt_config(tracing=True)
176
+ # Restore global broadcasting
177
+ self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
178
+ self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 1)
179
+ self.memory_barrier()
180
+ return self
181
+
182
+ # Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait
183
+ def sqtt_stop(self, ses: int, wptrs: HCQBuffer):
184
+ self.memory_barrier()
185
+ # Start shutting everything down
186
+ self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 0)
187
+ self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_FINISH) | self.pm4.EVENT_INDEX(0))
188
+ # For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
189
+ for se in range(ses):
190
+ self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
191
+ # Wait for FINISH_PENDING==0
192
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
193
+ self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), 4)
194
+ # Wait for FINISH_DONE!=0
195
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
196
+ self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_done'), 4)
197
+ # Disable SQTT
198
+ self.sqtt_config(tracing=False)
199
+ # Wait for BUSY==0
200
+ self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
201
+ self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('busy'), 4)
202
+ # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
203
+ self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr[0], 0, *data64_le(wptrs.va_addr+(se*4)))
204
+ # Restore global broadcasting
205
+ self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
206
+ self.spi_config(tracing=False)
207
+ self.memory_barrier()
208
+ return self
209
+
210
+ def sqtt_prg_marker(self, prg:AMDProgram, global_size:tuple[sint, ...]):
211
+ BIND_POINT_COMPUTE = 1
212
+
213
+ self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
214
+ _0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
215
+ identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE, bind_point=BIND_POINT_COMPUTE)),
216
+ _1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0]))))
217
+
218
+ self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
219
+ _0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
220
+ _2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id)), *global_size)
221
+
222
+ prg.dev.cmd_id += 1
223
+
78
224
  def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
79
225
  self.bind_args_state(args_state)
80
226
 
81
227
  self.acquire_mem(gli=0, gl2=0)
82
228
 
229
+ user_regs = []
83
230
  if prg.enable_private_segment_sgpr:
231
+ assert self.dev.xccs == 1, "Only architected flat scratch is suppored on multi-xcc"
84
232
  scratch_hilo = data64_le(prg.dev.scratch.va_addr)
85
233
  # sgpr word1 bit31 enables swizzle
86
234
  # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
87
- user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000] if prg.enable_private_segment_sgpr else []
88
- else: user_regs = []
235
+ user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
236
+
89
237
  if prg.enable_dispatch_ptr:
90
- dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
238
+ dp = (dp_t:=hsa.hsa_kernel_dispatch_packet_t).from_address(cast(int, (disp_buf:=args_state.buf.offset(prg.kernargs_segment_size)).va_addr))
239
+
240
+ self.bind_sints(*local_size, mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='workgroup_size_x', fmt='H')
241
+ self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='grid_size_x', fmt='I')
242
+ dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.buf.va_addr
243
+ user_regs += [*data64_le(disp_buf.va_addr)]
244
+
245
+ user_regs += [*data64_le(args_state.buf.va_addr)]
91
246
 
92
- self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
93
- self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
94
- dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
95
- user_regs += [*data64_le(dp_addr)]
247
+ if prg.dev.sqtt_enabled: self.sqtt_prg_marker(prg, global_size)
96
248
 
97
- user_regs += [*data64_le(args_state.ptr)]
249
+ self.wreg(self.gc.regCOMPUTE_PGM_LO, *data64_le(prg.prog_addr >> 8))
250
+ self.wreg(self.gc.regCOMPUTE_PGM_RSRC1, prg.rsrc1, prg.rsrc2)
251
+ self.wreg(self.gc.regCOMPUTE_PGM_RSRC3, prg.rsrc3)
252
+ self.wreg(self.gc.regCOMPUTE_TMPRING_SIZE, prg.dev.tmpring_size)
98
253
 
99
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
100
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
101
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
102
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
103
254
  if prg.dev.has_scratch_base_registers:
104
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
105
- if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
106
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
107
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
108
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
109
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
110
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
111
-
112
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
113
- self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
114
-
115
- self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
116
- self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
255
+ for xcc_id in range(self.dev.xccs):
256
+ with self.pred_exec(xcc_mask=1<<xcc_id):
257
+ scratch_base = prg.dev.scratch.va_addr + (prg.dev.scratch.size // self.dev.xccs * xcc_id)
258
+ self.wreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO, *data64_le(scratch_base >> 8))
259
+
260
+ if (10,0,0) <= prg.dev.target < (11,0,0): self.wreg(self.gc.mmCP_COHER_START_DELAY, 0x20)
261
+
262
+ self.wreg(self.gc.regCOMPUTE_RESTART_X, 0, 0, 0)
263
+ self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF, 0xFFFFFFFF)
264
+ self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF, 0xFFFFFFFF)
265
+ if prg.dev.target >= (11,0,0): self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
266
+
267
+ self.wreg(self.gc.regCOMPUTE_USER_DATA_0, *user_regs)
268
+ self.wreg(self.gc.regCOMPUTE_RESOURCE_LIMITS, 0)
269
+
270
+ self.wreg(self.gc.regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0)
271
+
272
+ gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= (10,0,0) else {}
273
+ DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1)
274
+ self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
275
+
276
+ if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
277
+ self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
278
+
279
+ if self.dev.xccs > 1:
280
+ self.release_mem(cache_flush=True)
281
+ self.acquire_mem(gli=0)
282
+ self.xcc_barrier()
117
283
  return self
118
284
 
119
285
  def wait(self, signal:AMDSignal, value:sint=0):
120
286
  self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
287
+ if self.dev.xccs > 1: self.xcc_barrier()
121
288
  return self
122
289
 
123
290
  def timestamp(self, signal:AMDSignal):
124
- self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
291
+ with self.pred_exec(xcc_mask=0b1):
292
+ self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
125
293
  return self
126
294
 
127
295
  def signal(self, signal:AMDSignal, value:sint=0):
128
- # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
129
- self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
130
- amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
131
-
132
- if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
133
- self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
134
- amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
296
+ with self.pred_exec(xcc_mask=0b1):
297
+ # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
298
+ self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
299
+ self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
300
+
301
+ if (dev:=signal.owner) is not None and signal.is_timeline and not dev.is_am():
302
+ self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
303
+ self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
135
304
  return self
136
305
 
137
306
  def bind(self, dev:AMDDevice):
138
307
  self.binded_device = dev
139
308
  self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
140
- hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
309
+ hw_view = self.hw_page.cpu_view().view(fmt='I')
141
310
  for i, value in enumerate(self._q): hw_view[i] = value
142
311
 
143
- self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
144
- len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
312
+ self.indirect_cmd = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
313
+ len(self._q) | self.pm4.INDIRECT_BUFFER_VALID]
145
314
  self._q = hw_view
146
315
  return self
147
316
 
148
317
  def _submit(self, dev:AMDDevice):
149
318
  cmds = self.indirect_cmd if dev == self.binded_device else self._q
319
+ # WORKAROUND: PACKET3_PRED_EXEC doesn't work in rings, only in IBs, create a fake IB inside a ring to work around that
320
+ if self.dev.xccs > 1 and dev != self.binded_device:
321
+ ib_end = ((dev.compute_queue.put_value + 5) % len(dev.compute_queue.ring)) + len(cmds)
322
+ ib_pad = len(dev.compute_queue.ring) - (ib_end - len(cmds)) if ib_end > len(dev.compute_queue.ring) else 0
323
+ ib_ptr = dev.compute_queue.ring.addr + ((dev.compute_queue.put_value + 5 + ib_pad) % len(dev.compute_queue.ring)) * 4
324
+ cmds = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(ib_ptr), len(cmds) | self.pm4.INDIRECT_BUFFER_VALID,
325
+ self.pm4.PACKET3(self.pm4.PACKET3_NOP, ib_pad + len(cmds) - 1), *((0,) * ib_pad), *cmds]
150
326
 
151
327
  for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
152
328
 
153
329
  dev.compute_queue.put_value += len(cmds)
154
- dev.compute_queue.signal_doorbell()
330
+ dev.compute_queue.signal_doorbell(dev)
155
331
 
156
332
  class AMDCopyQueue(HWQueue):
157
- def __init__(self, max_copy_size=0x40000000):
158
- self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
333
+ def __init__(self, dev, max_copy_size=0x40000000):
334
+ self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size
159
335
  super().__init__()
160
336
 
161
337
  def q(self, *arr):
@@ -168,47 +344,47 @@ class AMDCopyQueue(HWQueue):
168
344
  for _ in range(copy_commands):
169
345
  step_copy_size = min(copy_size - copied, self.max_copy_size)
170
346
 
171
- self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
172
- amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
347
+ self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
348
+ self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
173
349
 
174
350
  copied += step_copy_size
175
351
  return self
176
352
 
177
353
  def signal(self, signal:AMDSignal, value:sint=0):
178
- self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
354
+ fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.target >= (10,0,0) else 0
355
+ self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)
179
356
 
180
- if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
181
- self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
182
- self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
183
- elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
357
+ if (dev:=signal.owner) is not None and signal.is_timeline and not dev.is_am():
358
+ self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
359
+ self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
360
+ elif dev is not None and dev.is_am(): self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
184
361
 
185
362
  return self
186
363
 
187
364
  def wait(self, signal:AMDSignal, value:sint=0):
188
- self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
189
- amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
190
- amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
365
+ self.q(self.sdma.SDMA_OP_POLL_REGMEM | self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
366
+ self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
367
+ self.sdma.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | self.sdma.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
191
368
  return self
192
369
 
193
370
  def timestamp(self, signal:AMDSignal):
194
- self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
371
+ self.q(self.sdma.SDMA_OP_TIMESTAMP | self.sdma.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
195
372
  *data64_le(signal.timestamp_addr))
196
373
  return self
197
374
 
198
375
  def bind(self, dev:AMDDevice):
199
- if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return
376
+ if not getenv("AMD_SDMA_BIND", 0) or not dev.is_am(): return
200
377
 
201
378
  self.binded_device = dev
202
379
  self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
203
- hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
380
+ hw_view = self.hw_page.cpu_view().view(fmt='I')
204
381
  for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0
205
382
 
206
- self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
383
+ self.indirect_cmd = [self.sdma.SDMA_OP_INDIRECT | self.sdma.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz,
384
+ *data64_le(0)]
207
385
  self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
208
386
 
209
387
  def _submit(self, dev:AMDDevice):
210
- if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
211
-
212
388
  if self.binded_device == dev:
213
389
  # An IB packet must end on a 8 DW boundary.
214
390
  add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
@@ -223,90 +399,122 @@ class AMDCopyQueue(HWQueue):
223
399
  if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
224
400
  tail_blit_dword += cmdsz
225
401
 
402
+ # Force align of submits to hit our usb layer write cache.
403
+ if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0
404
+
405
+ # USB devices run in single-step mode, so they can't overrun the queue.
406
+ total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4
407
+ assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun"
408
+ while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass
409
+
226
410
  start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
227
411
  dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
228
412
  dev.sdma_queue.put_value += tail_blit_dword * 4
229
413
 
230
414
  if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
231
415
  zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
232
- ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
416
+ dev.sdma_queue.ring.view(dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes, zero_fill, fmt='B')[:] = bytes(zero_fill)
233
417
  dev.sdma_queue.put_value += zero_fill
234
418
 
235
419
  dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
236
420
  dev.sdma_queue.put_value += rem_packet_cnt * 4
237
421
 
238
- dev.sdma_queue.signal_doorbell()
422
+ dev.sdma_queue.signal_doorbell(dev)
239
423
 
240
424
  class AMDProgram(HCQProgram):
241
425
  def __init__(self, dev:AMDDevice, name:str, lib:bytes):
242
426
  # TODO; this API needs the type signature of the function and global_size/local_size
243
- self.dev: AMDDevice = dev
244
- self.name, self.lib = name, lib
245
- image, sections, _ = elf_loader(self.lib)
246
- self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
247
- ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
427
+ self.dev, self.name, self.lib = dev, name, lib
248
428
 
249
- entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
250
- self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
251
- self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
252
- self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
429
+ image, sections, _ = elf_loader(self.lib)
430
+ self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
431
+ self.dev.allocator._copyin(self.lib_gpu, image)
432
+ self.dev.synchronize()
253
433
 
434
+ rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
435
+ text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
436
+ assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
437
+ self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
438
+ self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
439
+ self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
254
440
  lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
255
- if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
441
+ if lds_size > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
256
442
 
257
443
  # Ensure scratch size
258
444
  self.dev._ensure_has_local_memory(self.private_segment_size)
259
445
 
260
- code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
261
- assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
446
+ # NOTE: this is wrong, it's not this object. pad it, since it might be smaller than the struct
447
+ code = hsa.amd_kernel_code_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+256]) + b'\x00'*256)
448
+ self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400
262
449
 
263
450
  # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
264
- self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
451
+ self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if (11,0,0) <= self.dev.target < (12,0,0) else 0)
265
452
  self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
266
- self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
267
-
453
+ self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct
454
+ self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
455
+ if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
268
456
  # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
269
457
  # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
270
458
  self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
271
459
  self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
272
460
  additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
273
461
 
274
- super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
462
+ if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2
275
463
 
276
- def __del__(self):
277
- if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
464
+ super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
465
+ base=self.lib_gpu.va_addr)
466
+ weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
278
467
 
279
468
  class AMDAllocator(HCQAllocator['AMDDevice']):
469
+ def __init__(self, dev:AMDDevice):
470
+ super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None)
471
+ if hasattr(dev.iface, "as_dmaref"): self._as_dmaref = dev.iface.as_dmaref
472
+ self.supports_copy_from_disk = not dev.is_usb()
473
+
280
474
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
281
- return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
475
+ return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
282
476
 
477
+ @suppress_finalizing
283
478
  def _free(self, opaque, options:BufferSpec):
284
479
  self.dev.synchronize()
285
- self.dev.dev_iface.free(opaque)
480
+ self.dev.iface.free(opaque)
286
481
 
287
- def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)
482
+ def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)
288
483
 
289
- MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000
484
+ @dataclass(frozen=True)
485
+ class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
290
486
 
291
487
  @dataclass
292
488
  class AMDQueueDesc:
293
- ring: memoryview
294
- read_ptr: memoryview
295
- write_ptr: memoryview
296
- doorbell: memoryview
489
+ ring: MMIOInterface
490
+ read_ptrs: list[MMIOInterface]
491
+ write_ptrs: list[MMIOInterface]
492
+ doorbells: list[MMIOInterface]
297
493
  put_value: int = 0
298
494
 
299
- def signal_doorbell(self):
300
- self.write_ptr[0] = self.put_value
495
+ @property
496
+ def read_ptr(self): return min(p[0] for p in self.read_ptrs)
497
+
498
+ @classmethod
499
+ def multi(cls, *queues: AMDQueueDesc):
500
+ assert all_same([(q.ring.addr, q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
501
+ return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
502
+ read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))
503
+
504
+ def signal_doorbell(self, dev):
505
+ for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value
301
506
 
302
507
  # Ensure all prior writes are visible to the GPU.
303
- if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
304
- self.doorbell[0] = self.put_value
508
+ System.memory_barrier()
509
+
510
+ # Flush hdp if queue is in dev mem.
511
+ if dev.is_am() and not dev.is_usb(): dev.iface.dev_impl.gmc.flush_hdp()
512
+ for doorbell in self.doorbells: doorbell[0] = self.put_value
305
513
 
306
514
  class KFDIface:
307
- kfd:HWInterface|None = None
515
+ kfd:FileIOInterface|None = None
308
516
  event_page:HCQBuffer|None = None
309
- gpus:list[HWInterface] = []
517
+ gpus:list[FileIOInterface] = []
310
518
 
311
519
  def _is_usable_gpu(self, gpu_id):
312
520
  with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
@@ -319,17 +527,23 @@ class KFDIface:
319
527
 
320
528
  # Initialize KFD interface during first run
321
529
  if KFDIface.kfd is None:
322
- KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
323
- gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
530
+ KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR)
531
+ gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
324
532
  gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
325
533
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
326
534
  KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
327
535
 
328
536
  if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
329
537
 
330
- self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
331
- self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
332
- self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
538
+ self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
539
+ self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
540
+ ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
541
+ id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
542
+ ip_hw = [(id2ip[int(hwid)], int(hwid)) for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip]
543
+ self.ip_versions = {ip:tuple(int(FileIOInterface(f'{ip_base}/{hw}/0/{part}').read()) for part in ['major','minor','revision']) for ip,hw in ip_hw}
544
+ self.ip_offsets = {ip:{int(i):tuple(int(x, 16) for x in FileIOInterface(f'{ip_base}/{hw}/{i}/base_addr').read().splitlines())
545
+ for i in FileIOInterface(f'{ip_base}/{hw}').listdir()} for ip,hw in ip_hw }
546
+ self.drm_fd = FileIOInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
333
547
 
334
548
  kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
335
549
 
@@ -349,7 +563,7 @@ class KFDIface:
349
563
  self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
350
564
  self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
351
565
 
352
- def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
566
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, cpu_addr=None) -> HCQBuffer:
353
567
  flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
354
568
 
355
569
  if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
@@ -358,56 +572,60 @@ class KFDIface:
358
572
  if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
359
573
 
360
574
  if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
361
- buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
362
- else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
363
- assert addr != 0xffffffffffffffff
575
+ buf = addr = cpu_addr or FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
576
+ else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
364
577
 
365
578
  try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
366
579
  flags=flags, mmap_offset=buf)
367
580
  except OSError as e:
368
581
  if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
369
582
  raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
370
- if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
583
+ if e.errno == errno.ENOMEM: raise MemoryError(f"Cannot allocate {size} bytes: no memory is available.") from e
371
584
  raise
372
585
 
373
586
  if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
374
587
  buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
375
588
  assert addr == buf == mem.va_addr
376
589
 
377
- self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
590
+ view = MMIOInterface(mem.va_addr, mem.size, fmt='B') if cpu_access or host else None
591
+ self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem, view=view, owner=self.dev))
378
592
  return hcqbuf
379
593
 
380
594
  def free(self, mem):
381
- if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
382
- c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
383
- stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
595
+ if len(mem.mapped_devs) > 0:
596
+ gpus = (ctypes.c_int32 * len(mem.mapped_devs))(*[x.iface.gpu_id for x in mem.mapped_devs])
597
+ stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(gpus), n_devices=len(gpus))
384
598
  assert stm.n_success == len(gpus)
385
- if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
599
+ if mem.va_addr: FileIOInterface.munmap(mem.va_addr, mem.size)
386
600
  kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
387
601
 
602
+ def as_dmaref(self, mem:HCQBuffer) -> DMAFdRef:
603
+ base = mem._base if mem._base is not None else mem
604
+ dmaref = DMAFdRef(kfd.AMDKFD_IOC_EXPORT_DMABUF(KFDIface.kfd, handle=base.meta.handle, flags=0).dmabuf_fd, mem.va_addr-base.va_addr, mem.size)
605
+ weakref.finalize(dmaref, os.close, dmaref.fd)
606
+ return dmaref
607
+
388
608
  def map(self, mem):
- if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
- mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
- c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
- stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
- n_devices=len(mem.meta.mapped_gpu_ids))
- assert stm.n_success == len(mem.meta.mapped_gpu_ids)
-
- def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
- cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
+ if mem.owner is not None and mem.owner._is_cpu(): return self.alloc(mem.size, host=True, cpu_addr=mem.va_addr)
+
+ c_gpus = (ctypes.c_int32 * 1)(self.gpu_id)
+ stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=1)
+ assert stm.n_success == 1
+
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
  queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
- queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+ queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
  eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
- ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
- write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
+ ctx_save_restore_address=cwsr_buffer.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
+ write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8 * (xcc_id + 1))

  if not hasattr(self, 'doorbells'):
  self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
- self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
+ self.doorbells = cast(FileIOInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)

- return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
- read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
- doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
+ return AMDQueueDesc(ring=MMIOInterface(ring.va_addr, ring.size, fmt='I'), read_ptrs=[MMIOInterface(queue.read_pointer_address, 8, fmt='Q')],
+ write_ptrs=[MMIOInterface(queue.write_pointer_address, 8, fmt='Q')],
+ doorbells=[MMIOInterface(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8, fmt='Q')])

  def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
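The queue descriptor now carries the ring plus lists of per-XCC read pointers, write pointers, and doorbells as `MMIOInterface` views. The host-side submission model behind those fields is the usual ring-buffer-plus-doorbell handshake; below is a toy, purely illustrative sketch of that handshake using plain Python integers and lists (no tinygrad or hardware types):

```python
class ToyQueue:
    """Host side of a ring-buffer/doorbell handshake, with plain ints instead of MMIO views."""
    def __init__(self, ring_dwords:int):
        self.ring = [0] * ring_dwords  # command ring, indexed in 32-bit dwords
        self.rptr = 0                  # consumer (GPU) position
        self.wptr = 0                  # producer (host) position
        self.doorbell = 0              # last write-pointer value the host "rang"

    def submit(self, packet:list):
        for dw in packet:
            self.ring[self.wptr % len(self.ring)] = dw  # write at wptr modulo ring size
            self.wptr += 1
        self.doorbell = self.wptr                       # publish the new wptr so the consumer starts fetching

q = ToyQueue(ring_dwords=16)
q.submit([0xC0012800, 0x0, 0x0])  # some 3-dword packet
assert q.doorbell == q.wptr == 3
```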
 
@@ -424,212 +642,211 @@ class KFDIface:

  raise RuntimeError("\n".join(report))

- @dataclass
- class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
-
- class PCIIface:
- supported_devs:list[int] = [0x744c, 0x7480]
- vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
- vfio_fd:HWInterface
- gpus:list[Any] = []
+ class PCIIface(PCIIfaceBase):
+ gpus:ClassVar[list[str]] = []

  def __init__(self, dev, dev_id):
- self.dev = dev
-
- if first_dev:=len(PCIIface.gpus) == 0:
- for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
- vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
- device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
- if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
-
- # TODO: visible_devices should be handled layer above this?
- visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
- PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
-
- self.pcibus = PCIIface.gpus[dev_id]
-
- # Unbind the device from the kernel driver
- if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
-
- supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
-
- # Try to init vfio. Use it if success.
- if PCIIface.vfio:
- try:
- if first_dev:
- HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
- PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
- vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+ super().__init__(dev, dev_id, vendor=0x1002, devices=[0x744c, 0x7480, 0x7550], bars=[0, 2, 5], vram_bar=0,
+ va_start=AMMemoryManager.va_allocator.base, va_size=AMMemoryManager.va_allocator.size)
+ self._setup_adev(self.pci_dev.pcibus, self.pci_dev.map_bar(0), self.pci_dev.map_bar(2, fmt='Q'), self.pci_dev.map_bar(5, fmt='I'))
+ self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)

- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
- HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
+ def _setup_adev(self, name, vram:MMIOInterface, doorbell:MMIOInterface, mmio:MMIOInterface, dma_regions:list[tuple[int, MMIOInterface]]|None=None):
+ self.dev_impl:AMDev = AMDev(name, vram, doorbell, mmio, dma_regions)
+ self.ip_offsets, self.ip_versions = self.dev_impl.regs_offset, self.dev_impl.ip_ver

- iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
- except OSError:
- if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
- PCIIface.vfio = False
+ gfxver = int(f"{self.dev_impl.ip_ver[am.GC_HWIP][0]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][1]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][2]:02d}")
+ array_count = self.dev_impl.gc_info.gc_num_sa_per_se * self.dev_impl.gc_info.gc_num_se
+ simd_count = 2 * array_count * (self.dev_impl.gc_info.gc_num_wgp0_per_sa + self.dev_impl.gc_info.gc_num_wgp1_per_sa)
+ self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': gfxver,
+ 'max_slots_scratch_cu': self.dev_impl.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.dev_impl.gc_info.gc_max_waves_per_simd,
+ 'simd_arrays_per_engine': self.dev_impl.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.dev_impl.gc_info.gc_lds_size}
 
- # Init vfio for the device
- if PCIIface.vfio:
- self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
- vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
+ assert cwsr_buffer is None, "no cwsr buffer for am"

- if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
- self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
-
- self.irq_fd = HWInterface.eventfd(0, 0)
- self.irq_poller = select.poll()
- self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
-
- irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
- argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
- vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
- else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
-
- self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
- self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
- self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
-
- bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
- self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
-
- self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
- self.doorbell_cpu_addr = mv_address(dbell)
-
- pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
- self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
-
- array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
- simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
- self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': self.adev.ip_versions[am.GC_HWIP],
- 'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
- 'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
+ self.dev_impl.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+ doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
+ else:
+ self.dev_impl.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+ eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

- def _map_pci_range(self, bar, off=0, addr=0, size=None):
- fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
- libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
- return to_mv(loc, sz)
+ return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.dev_impl.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
+ read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
 
- def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
- if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
- vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
- va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
+ def sleep(self, timeout):
+ if self.pci_dev.irq_poller is not None and (events_cnt:=len(self.pci_dev.irq_poller.poll(timeout))):
+ self.pci_dev.irq_fd.read(8 * events_cnt)
+ self.dev_impl.ih.interrupt_handler()

- # Read pagemap to get the physical address of each page. The pages are locked.
- self.pagemap.seek(va // mmap.PAGESIZE * 8)
- paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
- am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
- return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+ def on_device_hang(self):
+ devs:list[AMDDevice] = [d for pg in HCQCompiled.peer_groups.values() for d in pg if isinstance(d, AMDDevice) and d.is_am()]
+ for d in devs: d.iface.dev_impl.gmc.on_interrupt()
+ raise RuntimeError("Device hang detected")

- am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
- if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
- return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+ def device_fini(self): self.dev_impl.fini()

- def free(self, mem):
- for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
- if not mem.meta.mapping.system: self.adev.mm.vfree(mem.meta.mapping)
+ class USBIface(PCIIface):
+ def __init__(self, dev, dev_id):
+ self.dev = dev
+ self.usb = ASM24Controller()
+ self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30))

- def map(self, mem):
- # Check if the memory is already mapped on this device
- if self.dev in mem.meta.mapped_devs: return
- mem.meta.mapped_devs.append(self.dev)
+ self._setup_adev(f"usb:{dev_id}", USBMMIOInterface(self.usb, *self.bars[0], fmt='B'), USBMMIOInterface(self.usb, *self.bars[2], fmt='Q'),
+ USBMMIOInterface(self.usb, *self.bars[5], fmt='I'), dma_regions=[(0x200000, self._dma_view(0xf000, 0x80000))])
+ self.usb._pci_cacheable += [self.bars[2]] # doorbell region is cacheable

- paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
- self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
+ # special regions
+ self.copy_bufs = [self._dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x80000)]
+ self.sys_buf, self.sys_next_off = self._dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0x800

- def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
- if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
- self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
- doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
- else:
- self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
- eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
+ def _dma_view(self, ctrl_addr, size): return USBMMIOInterface(self.usb, ctrl_addr, size, fmt='B', pcimem=False)
+ def _dma_region(self, ctrl_addr, sys_addr, size):
+ region = self.dev_impl.mm.map_range(vaddr:=self.dev_impl.mm.alloc_vaddr(size=size), size, [(sys_addr, size)], system=True, uncached=True)
+ return HCQBuffer(vaddr, size, meta=PCIAllocationMeta(region, has_cpu_mapping=False), view=self._dma_view(ctrl_addr, size), owner=self.dev)

- return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
- read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
+ if (host or (uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size:
+ self.sys_next_off += size
+ return self.sys_buf.offset(self.sys_next_off - size, size)

- def sleep(self, timeout):
- if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
- self.irq_fd.read(8 * events_cnt)
- self.adev.ih.interrupt_handler()
+ am_mapping = self.dev_impl.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contiguous=cpu_access)
+ return HCQBuffer(am_mapping.va_addr, size, meta=PCIAllocationMeta(am_mapping, has_cpu_mapping=False),
+ view=USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None, owner=self.dev)
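For small host-visible or uncached requests, the USB interface's `alloc` bump-allocates out of the preallocated `sys_buf` system region instead of taking the VRAM path. A toy version of that bump allocator, for illustration only (the 0x1000 size and 0x800 starting offset mirror the diff; the class and its names are hypothetical, not tinygrad code):

```python
class BumpRegion:
    """Carve sequential offsets out of one fixed-size buffer; returns None when the region is exhausted."""
    def __init__(self, size:int, start_off:int=0):
        self.size, self.next_off = size, start_off
    def alloc(self, size:int):
        if self.next_off + size >= self.size: return None  # caller would fall back to a regular VRAM allocation
        self.next_off += size
        return self.next_off - size                         # offset of the newly reserved range

region = BumpRegion(size=0x1000, start_off=0x800)
assert region.alloc(0x100) == 0x800 and region.alloc(0x100) == 0x900
assert region.alloc(0x1000) is None  # too big for what is left
```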
 
- def on_device_hang(self):
- for d in self.dev.devices: d.dev_iface.adev.gmc.on_interrupt()
- raise RuntimeError("Device hang detected")
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE: self.usb._pci_cacheable += [(ring.cpu_view().addr, ring.size)]
+ return super().create_queue(queue_type, ring, gart, eop_buffer, cwsr_buffer, ctl_stack_size, ctx_save_restore_size, xcc_id)

- def device_fini(self): self.adev.fini()
+ def sleep(self, timeout): pass

  class AMDDevice(HCQCompiled):
- driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
- signals_page:Any = None
- signals_pool:list[int] = []
+ def is_am(self) -> bool: return isinstance(self.iface, (PCIIface, USBIface))
+ def is_usb(self) -> bool: return isinstance(self.iface, USBIface)

  def __init__(self, device:str=""):
  self.device_id = int(device.split(":")[1]) if ":" in device else 0
- self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
- self.target = int(self.dev_iface.props['gfx_target_version'])
- self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
- if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
-
- if AMDDevice.signals_page is None:
- AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
- AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
- else: self.dev_iface.map(AMDDevice.signals_page)
-
- self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
- self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
- self.has_scratch_base_registers = self.target >= 110000
+ self.iface = self._select_iface(KFDIface, PCIIface, USBIface)
+ self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
+ self.arch = "gfx%d%x%x" % self.target
+ if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")
+ if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")
+
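`gfx_target_version` is now decoded into a `(major, minor, step)` tuple before the `gfx%d%x%x` arch string is formatted from it (minor and step are rendered as single hex digits). A quick standalone check of that arithmetic, added here for illustration:

```python
def to_target(trgt:int) -> tuple:
    # same split as the diff: packed integer -> (major, minor, step)
    return (trgt // 10000, (trgt // 100) % 100, trgt % 100)

assert to_target(110000) == (11, 0, 0) and "gfx%d%x%x" % to_target(110000) == "gfx1100"
assert to_target(90402)  == (9, 4, 2)  and "gfx%d%x%x" % to_target(90402)  == "gfx942"
assert to_target(100301) == (10, 3, 1) and "gfx%d%x%x" % to_target(100301) == "gfx1031"
```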
+ self.max_cu_id = self.iface.props['simd_count'] // self.iface.props['simd_per_cu'] // self.iface.props.get('num_xcc', 1) - 1
+ self.max_wave_id = (self.iface.props['max_waves_per_simd'] * self.iface.props['simd_per_cu'] - 1) if self.target >= (10,1,0) else \
+ (min((self.max_cu_id+1)*40, self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine'] * 512) - 1)
+ self.xccs = self.iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1
+ # this is what llvm refers to as "architected flat scratch"
+ self.has_scratch_base_registers = self.target >= (11,0,0) or self.target in {(9,4,2), (9,5,0)}

  # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
  sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
- vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
+ if self.target[:2] == (9,5): lds_size_per_cu = self.iface.props["lds_size_in_kb"] << 10
+ vgpr_size_per_cu = 0x60000 if self.target in {(11,0,0), (11,0,1), (12,0,0), (12,0,1)} else \
+ 0x80000 if (self.target[:2]) in {(9,4), (9,5)} or self.target in {(9,0,8), (9,0,10)} else 0x40000
  wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
- ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
- debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
+ ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE) if self.target >= (10,1,0) else \
+ round_up((self.max_wave_id + 1) * 8 + 8 + 40, mmap.PAGESIZE)
+ debug_memory_size = round_up((self.max_cu_id + 1 if self.target >= (10,1,0) else 1) * (self.max_wave_id + 1) * 32, 64)
+ if self.target[0] == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
+
+ self.soc = import_soc(self.target)
+ self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if self.target[0] >= 10 else 'soc15'}")
+ self.sdma = import_module('sdma', min(self.iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
+ self.gc = AMDIP('gc', self.iface.ip_versions[am.GC_HWIP], self.iface.ip_offsets[am.GC_HWIP])
+
+ # Define the regCOMPUTE_CURRENT_LOGIC_XCC_ID register, which is missing from the asic_regs files.
+ if self.target[:2] in {(9,4),(9,5)}: self.regCOMPUTE_CURRENT_LOGIC_XCC_ID = AMDReg("regCOMPUTE_CURRENT_LOGIC_XCC_ID", 0xe25, 0, {}, self.gc.bases)
+
+ nbio_name = 'nbio' if self.target[0] < 12 else 'nbif'
+ nbio_pad = (0,) if self.target[0] == 9 else ()
+ self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], {i:nbio_pad+x for i,x in self.iface.ip_offsets[am.NBIF_HWIP].items()})

- self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
- eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
+ self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
+ ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

- self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
+ max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
+ self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))

- super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
- AMDSignal, AMDComputeQueue, AMDCopyQueue)
+ super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if AMD_LLVM else AMDRenderer(self.arch),
+ AMDLLVMCompiler(self.arch) if AMD_LLVM else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
+ AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
+ kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)

  # Scratch setup
  self.max_private_segment_size = 0
  self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

+ # XCC setup
+ self.xcc_sync: tuple[AMDSignal, AMDSignal]|None = None
+ if self.xccs > 1:
+ self.xcc_sync_area = self.allocator.alloc(0x1000, BufferSpec(nolru=True, cpu_access=True))
+ self.xcc_sync = (AMDSignal(base_buf=self.xcc_sync_area), AMDSignal(base_buf=self.xcc_sync_area.offset(256)))
+ AMDComputeQueue(self).xcc_config().submit(self)
+
+ # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
+ self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
+ if self.sqtt_enabled:
+ if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
+ if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
+ raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
+ f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
+ "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
+ SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
+ SQTT_NUM = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine']
+ self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
+ self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
+ self.cmd_id = 0
+ AMDComputeQueue(self).sqtt_start(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
+
  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
- ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
- gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
- eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
- return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
- ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
+ ring = self.iface.alloc(ring_size, uncached=True, cpu_access=True)
+ gart = self.iface.alloc(0x100, uncached=True, cpu_access=True)
+
+ cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.iface.props.get('num_xcc', 1), mmap.PAGESIZE)
+ cwsr_buffer = self.iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None
+ eop_buffer = self.iface.alloc(eop_buffer_size) if eop_buffer_size else None
+
+ return AMDQueueDesc.multi(*(self.iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer, xcc_id=xcc_id,
+ ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
+ for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))

  def _ensure_has_local_memory(self, required):
  if self.max_private_segment_size >= required: return

  # <gfx103 requires alignment of 1024, >=gfx11 requires 256
- wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)
+ wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= (11,0,0) else 1024)

- self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
+ scratch_size = (self.max_cu_id+1)*self.iface.props['max_slots_scratch_cu']*wave_scratch_len # per xcc
+ self.scratch, ok = self._realloc(getattr(self, 'scratch', None), scratch_size*self.xccs)
  if ok:
- engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
- waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
+ engines = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine']
+ waves = wave_scratch_len // (256 if self.target >= (11,0,0) else 1024)
  # >=gfx11 wavesize is per SE
- wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
+ wavesize = scratch_size // ((wave_scratch_len * engines) if self.target >= (11,0,0) else wave_scratch_len)
  self.tmpring_size = waves << 12 | wavesize
  self.max_private_segment_size = required
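`tmpring_size` packs the two scratch parameters into a single value: `waves` above bit 12 and `wavesize` in the low 12 bits, as the `waves << 12 | wavesize` expression implies. A tiny round-trip check of that packing, added for illustration and assuming (as the OR implies) that `wavesize` fits in the low 12 bits:

```python
def pack_tmpring(waves:int, wavesize:int) -> int: return waves << 12 | wavesize
def unpack_tmpring(v:int) -> tuple: return v >> 12, v & 0xfff

val = pack_tmpring(waves=32, wavesize=0x100)
assert val == 0x20100 and unpack_tmpring(val) == (32, 0x100)
```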
 
  def invalidate_caches(self):
- AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
- self.timeline_value += 1
+ AMDComputeQueue(self).memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
  self.synchronize()

- def on_device_hang(self): self.dev_iface.on_device_hang()
-
- def finalize(self):
- self.synchronize()
- if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()
+ def on_device_hang(self): self.iface.on_device_hang()
+
+ def _at_profile_finalize(self):
+ if self.sqtt_enabled:
+ wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
+ wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
+ AMDComputeQueue(self).sqtt_stop(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
+ self.synchronize()
+ if DEBUG>=2: print('Saving SQTT in profile...')
+ for i,buf0 in enumerate(self.sqtt_buffers):
+ wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
+ if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
+ assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
+ # When sqtt buffer overflows, wptr stops at the last dword
+ if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
+ self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
+ Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
+ super()._at_profile_finalize()
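The blob-size expression above converts the 29-bit hardware write pointer (counted in 32-byte units) into a byte count relative to the start of the per-SE SQTT buffer. A standalone restatement of that arithmetic with made-up, purely illustrative addresses:

```python
def sqtt_blob_size(wptr_reg:int, buf_va:int) -> int:
    # low 29 bits of the register count 32-byte units; subtract the buffer base expressed in the same units
    return ((wptr_reg & 0x1FFFFFFF) - ((buf_va // 32) & 0x1FFFFFFF)) * 32

buf_va = 0x7f0000400000                          # hypothetical 32-byte-aligned buffer base
wptr   = ((buf_va // 32) + 0x1000) & 0x1FFFFFFF  # hardware advanced 0x1000 units past the base
assert sqtt_blob_size(wptr, buf_va) == 0x1000 * 32
```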