tinygrad-0.10.2-py3-none-any.whl → tinygrad-0.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +1 -1
- tinygrad/apps/llm.py +206 -0
- tinygrad/codegen/__init__.py +116 -0
- tinygrad/codegen/devectorizer.py +315 -172
- tinygrad/codegen/expander.py +8 -16
- tinygrad/codegen/gpudims.py +89 -0
- tinygrad/codegen/linearize.py +205 -203
- tinygrad/codegen/lowerer.py +92 -139
- tinygrad/codegen/opt/__init__.py +38 -0
- tinygrad/codegen/opt/heuristic.py +125 -0
- tinygrad/codegen/opt/kernel.py +510 -0
- tinygrad/{engine → codegen/opt}/search.py +51 -35
- tinygrad/codegen/opt/swizzler.py +134 -0
- tinygrad/codegen/opt/tc.py +127 -0
- tinygrad/codegen/quantize.py +67 -0
- tinygrad/device.py +122 -132
- tinygrad/dtype.py +152 -35
- tinygrad/engine/jit.py +81 -54
- tinygrad/engine/memory.py +46 -27
- tinygrad/engine/realize.py +82 -41
- tinygrad/engine/schedule.py +70 -445
- tinygrad/frontend/__init__.py +0 -0
- tinygrad/frontend/onnx.py +1253 -0
- tinygrad/frontend/torch.py +5 -0
- tinygrad/gradient.py +19 -27
- tinygrad/helpers.py +95 -47
- tinygrad/nn/__init__.py +7 -8
- tinygrad/nn/optim.py +72 -41
- tinygrad/nn/state.py +37 -23
- tinygrad/renderer/__init__.py +40 -60
- tinygrad/renderer/cstyle.py +143 -128
- tinygrad/renderer/llvmir.py +113 -62
- tinygrad/renderer/ptx.py +50 -32
- tinygrad/renderer/wgsl.py +27 -23
- tinygrad/runtime/autogen/am/am.py +5861 -0
- tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
- tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
- tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
- tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
- tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
- tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
- tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
- tinygrad/runtime/autogen/comgr.py +35 -9
- tinygrad/runtime/autogen/comgr_3.py +906 -0
- tinygrad/runtime/autogen/cuda.py +2419 -494
- tinygrad/runtime/autogen/hsa.py +57 -16
- tinygrad/runtime/autogen/ib.py +7171 -0
- tinygrad/runtime/autogen/io_uring.py +917 -118
- tinygrad/runtime/autogen/kfd.py +748 -26
- tinygrad/runtime/autogen/libc.py +613 -218
- tinygrad/runtime/autogen/libusb.py +1643 -0
- tinygrad/runtime/autogen/nv/nv.py +8602 -0
- tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
- tinygrad/runtime/autogen/opencl.py +2 -4
- tinygrad/runtime/autogen/sqtt.py +1789 -0
- tinygrad/runtime/autogen/vfio.py +3 -3
- tinygrad/runtime/autogen/webgpu.py +273 -264
- tinygrad/runtime/graph/cuda.py +3 -3
- tinygrad/runtime/graph/hcq.py +68 -29
- tinygrad/runtime/graph/metal.py +29 -13
- tinygrad/runtime/graph/remote.py +114 -0
- tinygrad/runtime/ops_amd.py +537 -320
- tinygrad/runtime/ops_cpu.py +108 -7
- tinygrad/runtime/ops_cuda.py +12 -14
- tinygrad/runtime/ops_disk.py +13 -10
- tinygrad/runtime/ops_dsp.py +47 -40
- tinygrad/runtime/ops_gpu.py +13 -11
- tinygrad/runtime/ops_hip.py +6 -9
- tinygrad/runtime/ops_llvm.py +35 -15
- tinygrad/runtime/ops_metal.py +29 -19
- tinygrad/runtime/ops_npy.py +5 -3
- tinygrad/runtime/ops_null.py +28 -0
- tinygrad/runtime/ops_nv.py +306 -234
- tinygrad/runtime/ops_python.py +62 -52
- tinygrad/runtime/ops_qcom.py +28 -39
- tinygrad/runtime/ops_remote.py +482 -0
- tinygrad/runtime/ops_webgpu.py +28 -28
- tinygrad/runtime/support/am/amdev.py +114 -249
- tinygrad/runtime/support/am/ip.py +211 -172
- tinygrad/runtime/support/amd.py +138 -0
- tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
- tinygrad/runtime/support/compiler_cuda.py +8 -11
- tinygrad/runtime/support/elf.py +2 -1
- tinygrad/runtime/support/hcq.py +184 -97
- tinygrad/runtime/support/ib.py +172 -0
- tinygrad/runtime/support/llvm.py +3 -4
- tinygrad/runtime/support/memory.py +251 -0
- tinygrad/runtime/support/nv/__init__.py +0 -0
- tinygrad/runtime/support/nv/ip.py +581 -0
- tinygrad/runtime/support/nv/nvdev.py +183 -0
- tinygrad/runtime/support/system.py +170 -0
- tinygrad/runtime/support/usb.py +268 -0
- tinygrad/runtime/support/webgpu.py +18 -0
- tinygrad/schedule/__init__.py +0 -0
- tinygrad/schedule/grouper.py +119 -0
- tinygrad/schedule/kernelize.py +368 -0
- tinygrad/schedule/multi.py +231 -0
- tinygrad/shape/shapetracker.py +40 -46
- tinygrad/shape/view.py +88 -52
- tinygrad/tensor.py +968 -542
- tinygrad/uop/__init__.py +117 -0
- tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
- tinygrad/uop/mathtraits.py +169 -0
- tinygrad/uop/ops.py +1021 -0
- tinygrad/uop/spec.py +228 -0
- tinygrad/{codegen → uop}/symbolic.py +239 -216
- tinygrad/uop/upat.py +163 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
- tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
- tinygrad/viz/index.html +203 -403
- tinygrad/viz/js/index.js +718 -0
- tinygrad/viz/js/worker.js +29 -0
- tinygrad/viz/serve.py +224 -102
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
- tinygrad-0.11.0.dist-info/RECORD +141 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/kernel.py +0 -693
- tinygrad/engine/multi.py +0 -161
- tinygrad/ops.py +0 -1003
- tinygrad/runtime/ops_cloud.py +0 -220
- tinygrad/runtime/support/allocator.py +0 -94
- tinygrad/spec.py +0 -155
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
- tinygrad/viz/perfetto.html +0 -178
- tinygrad-0.10.2.dist-info/RECORD +0 -99
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,161 +1,337 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import
|
3
|
-
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys,
|
2
|
+
from typing import cast, ClassVar
|
3
|
+
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, weakref
|
4
4
|
assert sys.platform != 'win32'
|
5
5
|
from dataclasses import dataclass
|
6
|
-
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram,
|
7
|
-
from tinygrad.
|
8
|
-
from tinygrad.
|
9
|
-
from tinygrad.
|
6
|
+
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
|
7
|
+
from tinygrad.runtime.support.hcq import MMIOInterface
|
8
|
+
from tinygrad.uop.ops import sint
|
9
|
+
from tinygrad.device import Compiled, DMAFdRef, BufferSpec
|
10
|
+
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, all_same, flatten, DEBUG, AMD_LLVM, PROFILE, ProfileEvent, suppress_finalizing
|
10
11
|
from tinygrad.renderer.cstyle import AMDRenderer
|
11
|
-
from tinygrad.
|
12
|
+
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
13
|
+
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
|
12
14
|
from tinygrad.runtime.autogen.am import am
|
13
|
-
from tinygrad.runtime.support.
|
15
|
+
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
14
16
|
from tinygrad.runtime.support.elf import elf_loader
|
15
|
-
from tinygrad.runtime.support.am.amdev import AMDev,
|
17
|
+
from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager
|
18
|
+
from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, setup_pci_bars
|
19
|
+
from tinygrad.runtime.support.system import System, PCIIfaceBase, PCIAllocationMeta, MAP_FIXED, MAP_NORESERVE
|
20
|
+
from tinygrad.runtime.support.usb import ASM24Controller, USBMMIOInterface
|
16
21
|
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
17
22
|
|
18
|
-
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
19
|
-
|
20
23
|
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
|
24
|
+
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
|
25
|
+
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
|
21
26
|
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
22
27
|
|
23
|
-
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
24
|
-
|
25
|
-
def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
|
26
|
-
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
|
27
|
-
|
28
28
|
class AMDSignal(HCQSignal):
|
29
|
-
def __init__(self,
|
30
|
-
super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
|
31
|
-
|
32
|
-
def __del__(self):
|
33
|
-
if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
|
29
|
+
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
|
34
30
|
|
35
31
|
def _sleep(self, time_spent_waiting_ms:int):
|
36
32
|
# Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
|
37
|
-
if time_spent_waiting_ms > 2000 and self.
|
33
|
+
if time_spent_waiting_ms > 2000 and self.is_timeline and self.owner is not None: self.owner.iface.sleep(200)
|
38
34
|
|
39
35
|
class AMDComputeQueue(HWQueue):
|
36
|
+
def __init__(self, dev:AMDDevice):
|
37
|
+
self.dev, self.soc, self.pm4, self.gc, self.nbio = dev, dev.soc, dev.pm4, dev.gc, dev.nbio
|
38
|
+
super().__init__()
|
39
|
+
|
40
40
|
def __del__(self):
|
41
41
|
if self.binded_device is not None:
|
42
42
|
self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
43
43
|
|
44
|
-
def pkt3(self, cmd, *vals): self.q(
|
44
|
+
def pkt3(self, cmd, *vals): self.q(self.pm4.PACKET3(cmd, len(vals) - 1), *vals)
|
45
|
+
|
46
|
+
def wreg(self, reg:AMDReg, *args:sint, **kwargs:int):
|
47
|
+
if bool(args) == bool(kwargs): raise RuntimeError('One (and only one) of *args or **kwargs must be specified')
|
48
|
+
if self.pm4.PACKET3_SET_SH_REG_START <= reg.addr[0] < self.pm4.PACKET3_SET_SH_REG_END:
|
49
|
+
set_packet, set_packet_start = self.pm4.PACKET3_SET_SH_REG, self.pm4.PACKET3_SET_SH_REG_START
|
50
|
+
elif self.pm4.PACKET3_SET_UCONFIG_REG_START <= reg.addr[0] < self.pm4.PACKET3_SET_UCONFIG_REG_START + 2**16-1:
|
51
|
+
set_packet, set_packet_start = self.pm4.PACKET3_SET_UCONFIG_REG, self.pm4.PACKET3_SET_UCONFIG_REG_START
|
52
|
+
else: raise RuntimeError(f'Cannot set {reg.name} ({reg.addr[0]}) via pm4 packet')
|
53
|
+
self.pkt3(set_packet, reg.addr[0] - set_packet_start, *(args or (reg.encode(**kwargs),)))
|
54
|
+
|
55
|
+
@contextlib.contextmanager
|
56
|
+
def pred_exec(self, xcc_mask:int):
|
57
|
+
if self.dev.xccs > 1:
|
58
|
+
self.pkt3(self.pm4.PACKET3_PRED_EXEC, xcc_mask << 24)
|
59
|
+
prev_len = len(self._q)
|
60
|
+
yield
|
61
|
+
if self.dev.xccs > 1:
|
62
|
+
self._q[prev_len-1] |= (len(self._q) - prev_len)
|
45
63
|
|
46
64
|
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
|
47
|
-
wrm_info_dw =
|
48
|
-
|
|
65
|
+
wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None)) \
|
66
|
+
| self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | self.pm4.WAIT_REG_MEM_ENGINE(0)
|
49
67
|
|
50
|
-
self.pkt3(
|
68
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
|
51
69
|
|
52
70
|
def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
71
|
+
if self.dev.target >= (10,0,0):
|
72
|
+
cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
|
73
|
+
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
|
74
|
+
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
|
75
|
+
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
|
76
|
+
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
|
77
|
+
|
78
|
+
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
|
79
|
+
else:
|
80
|
+
cp_coher_cntl = self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(gli) | \
|
81
|
+
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(glk) | \
|
82
|
+
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(gl2) | \
|
83
|
+
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(gl1) | \
|
84
|
+
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(gl2)
|
85
|
+
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, cp_coher_cntl, *data64_le(sz), *data64_le(addr), 0x0000000A)
|
86
|
+
|
87
|
+
def release_mem(self, address=0x0, value=0, data_sel=0, int_sel=2, ctxid=0, cache_flush=False):
|
88
|
+
if self.dev.target >= (10,0,0):
|
89
|
+
cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
|
90
|
+
| self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
|
91
|
+
| self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
|
92
|
+
|
93
|
+
event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
|
94
|
+
| self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
|
95
|
+
|
96
|
+
memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
|
97
|
+
| self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
|
98
|
+
else:
|
99
|
+
cache_flags_dw = 0 if not cache_flush else (self.pm4.EOP_TC_WB_ACTION_EN | self.pm4.EOP_TC_NC_ACTION_EN)
|
58
100
|
|
59
|
-
|
101
|
+
event_dw = self.pm4.EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) | self.pm4.EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
|
60
102
|
|
61
|
-
|
62
|
-
cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
|
63
|
-
| amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
|
64
|
-
| amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
|
103
|
+
memsel_dw = self.pm4.DATA_SEL(data_sel) | self.pm4.INT_SEL(int_sel)
|
65
104
|
|
66
|
-
|
67
|
-
| amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
|
105
|
+
ctxid = 0
|
68
106
|
|
69
|
-
|
107
|
+
self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
|
70
108
|
|
71
|
-
|
109
|
+
def xcc_barrier(self):
|
110
|
+
if self.dev.xcc_sync is None: return self
|
111
|
+
assert self.dev.xccs == 8, 'only 8 XCCs supported'
|
112
|
+
a, b = self.dev.xcc_sync
|
113
|
+
mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
|
114
|
+
self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 0x10) # a += 1
|
115
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 0x80) # a == 0 (mod 8) via bitmask
|
116
|
+
self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 0x10) # b += 1
|
117
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 0x80) # b == 0 (mod 8) via bitmask
|
118
|
+
return self
|
72
119
|
|
73
120
|
def memory_barrier(self):
|
74
|
-
self.
|
121
|
+
pf = '' if self.nbio.version[0] == 2 else '0' if self.nbio.version[:2] != (7, 11) else '1'
|
122
|
+
self.wait_reg_mem(reg_req=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_REQ').addr[0],
|
123
|
+
reg_done=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_DONE').addr[0], value=0xffffffff)
|
75
124
|
self.acquire_mem()
|
76
125
|
return self
|
77
126
|
|
127
|
+
def xcc_config(self):
|
128
|
+
self.wreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE, 1)
|
129
|
+
for xcc_id in range(self.dev.xccs):
|
130
|
+
with self.pred_exec(xcc_mask=1 << xcc_id):
|
131
|
+
self.wreg(self.dev.regCOMPUTE_CURRENT_LOGIC_XCC_ID, xcc_id)
|
132
|
+
return self
|
133
|
+
|
134
|
+
def spi_config(self, tracing:bool):
|
135
|
+
self.wreg(self.gc.regSPI_CONFIG_CNTL, ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
|
136
|
+
enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
|
137
|
+
|
138
|
+
### SQTT ###
|
139
|
+
|
140
|
+
def sqtt_userdata(self, data, *extra_dwords):
|
141
|
+
data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
|
142
|
+
for i in range(0, len(data_ints), 2):
|
143
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2])
|
144
|
+
|
145
|
+
def sqtt_config(self, tracing:bool):
|
146
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
|
147
|
+
rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
|
148
|
+
|
149
|
+
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
|
150
|
+
def sqtt_start(self, buf0s:list[HCQBuffer], se_mask:int):
|
151
|
+
self.memory_barrier()
|
152
|
+
self.spi_config(tracing=True)
|
153
|
+
# One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
|
154
|
+
for se in range(len(buf0s)):
|
155
|
+
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
|
156
|
+
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
|
157
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size>>12)
|
158
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
|
159
|
+
# NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
|
160
|
+
# For RGP to display instruction trace it has to see it on first SE. Howerver ACE/MEC/whatever does the dispatching starting with second se,
|
161
|
+
# and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
|
162
|
+
# sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
|
163
|
+
# be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
|
164
|
+
# CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
|
165
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)
|
166
|
+
REG_INCLUDE = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
|
167
|
+
self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
|
168
|
+
TOKEN_EXCLUDE = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
|
169
|
+
if not (se_mask >> se) & 0b1:
|
170
|
+
TOKEN_EXCLUDE |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
|
171
|
+
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
|
172
|
+
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
|
173
|
+
self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1)
|
174
|
+
# Enable SQTT
|
175
|
+
self.sqtt_config(tracing=True)
|
176
|
+
# Restore global broadcasting
|
177
|
+
self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
|
178
|
+
self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 1)
|
179
|
+
self.memory_barrier()
|
180
|
+
return self
|
181
|
+
|
182
|
+
# Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait
|
183
|
+
def sqtt_stop(self, ses: int, wptrs: HCQBuffer):
|
184
|
+
self.memory_barrier()
|
185
|
+
# Start shutting everything down
|
186
|
+
self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 0)
|
187
|
+
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_FINISH) | self.pm4.EVENT_INDEX(0))
|
188
|
+
# For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
|
189
|
+
for se in range(ses):
|
190
|
+
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
|
191
|
+
# Wait for FINISH_PENDING==0
|
192
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
|
193
|
+
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), 4)
|
194
|
+
# Wait for FINISH_DONE!=0
|
195
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
|
196
|
+
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_done'), 4)
|
197
|
+
# Disable SQTT
|
198
|
+
self.sqtt_config(tracing=False)
|
199
|
+
# Wait for BUSY==0
|
200
|
+
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
|
201
|
+
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('busy'), 4)
|
202
|
+
# Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
|
203
|
+
self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr[0], 0, *data64_le(wptrs.va_addr+(se*4)))
|
204
|
+
# Restore global broadcasting
|
205
|
+
self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
|
206
|
+
self.spi_config(tracing=False)
|
207
|
+
self.memory_barrier()
|
208
|
+
return self
|
209
|
+
|
210
|
+
def sqtt_prg_marker(self, prg:AMDProgram, global_size:tuple[sint, ...]):
|
211
|
+
BIND_POINT_COMPUTE = 1
|
212
|
+
|
213
|
+
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
|
214
|
+
_0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
|
215
|
+
identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE, bind_point=BIND_POINT_COMPUTE)),
|
216
|
+
_1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0]))))
|
217
|
+
|
218
|
+
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
|
219
|
+
_0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
|
220
|
+
_2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id)), *global_size)
|
221
|
+
|
222
|
+
prg.dev.cmd_id += 1
|
223
|
+
|
78
224
|
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
79
225
|
self.bind_args_state(args_state)
|
80
226
|
|
81
227
|
self.acquire_mem(gli=0, gl2=0)
|
82
228
|
|
229
|
+
user_regs = []
|
83
230
|
if prg.enable_private_segment_sgpr:
|
231
|
+
assert self.dev.xccs == 1, "Only architected flat scratch is suppored on multi-xcc"
|
84
232
|
scratch_hilo = data64_le(prg.dev.scratch.va_addr)
|
85
233
|
# sgpr word1 bit31 enables swizzle
|
86
234
|
# sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
|
87
|
-
user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
|
88
|
-
|
235
|
+
user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
|
236
|
+
|
89
237
|
if prg.enable_dispatch_ptr:
|
90
|
-
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(
|
238
|
+
dp = (dp_t:=hsa.hsa_kernel_dispatch_packet_t).from_address(cast(int, (disp_buf:=args_state.buf.offset(prg.kernargs_segment_size)).va_addr))
|
239
|
+
|
240
|
+
self.bind_sints(*local_size, mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='workgroup_size_x', fmt='H')
|
241
|
+
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='grid_size_x', fmt='I')
|
242
|
+
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.buf.va_addr
|
243
|
+
user_regs += [*data64_le(disp_buf.va_addr)]
|
244
|
+
|
245
|
+
user_regs += [*data64_le(args_state.buf.va_addr)]
|
91
246
|
|
92
|
-
|
93
|
-
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
|
94
|
-
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
|
95
|
-
user_regs += [*data64_le(dp_addr)]
|
247
|
+
if prg.dev.sqtt_enabled: self.sqtt_prg_marker(prg, global_size)
|
96
248
|
|
97
|
-
|
249
|
+
self.wreg(self.gc.regCOMPUTE_PGM_LO, *data64_le(prg.prog_addr >> 8))
|
250
|
+
self.wreg(self.gc.regCOMPUTE_PGM_RSRC1, prg.rsrc1, prg.rsrc2)
|
251
|
+
self.wreg(self.gc.regCOMPUTE_PGM_RSRC3, prg.rsrc3)
|
252
|
+
self.wreg(self.gc.regCOMPUTE_TMPRING_SIZE, prg.dev.tmpring_size)
|
98
253
|
|
99
|
-
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
|
100
|
-
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
|
101
|
-
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
|
102
|
-
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
|
103
254
|
if prg.dev.has_scratch_base_registers:
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
self.
|
113
|
-
self.
|
114
|
-
|
115
|
-
|
116
|
-
self.
|
255
|
+
for xcc_id in range(self.dev.xccs):
|
256
|
+
with self.pred_exec(xcc_mask=1<<xcc_id):
|
257
|
+
scratch_base = prg.dev.scratch.va_addr + (prg.dev.scratch.size // self.dev.xccs * xcc_id)
|
258
|
+
self.wreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO, *data64_le(scratch_base >> 8))
|
259
|
+
|
260
|
+
if (10,0,0) <= prg.dev.target < (11,0,0): self.wreg(self.gc.mmCP_COHER_START_DELAY, 0x20)
|
261
|
+
|
262
|
+
self.wreg(self.gc.regCOMPUTE_RESTART_X, 0, 0, 0)
|
263
|
+
self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF, 0xFFFFFFFF)
|
264
|
+
self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF, 0xFFFFFFFF)
|
265
|
+
if prg.dev.target >= (11,0,0): self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
|
266
|
+
|
267
|
+
self.wreg(self.gc.regCOMPUTE_USER_DATA_0, *user_regs)
|
268
|
+
self.wreg(self.gc.regCOMPUTE_RESOURCE_LIMITS, 0)
|
269
|
+
|
270
|
+
self.wreg(self.gc.regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0)
|
271
|
+
|
272
|
+
gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= (10,0,0) else {}
|
273
|
+
DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1)
|
274
|
+
self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
|
275
|
+
|
276
|
+
if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
|
277
|
+
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
|
278
|
+
|
279
|
+
if self.dev.xccs > 1:
|
280
|
+
self.release_mem(cache_flush=True)
|
281
|
+
self.acquire_mem(gli=0)
|
282
|
+
self.xcc_barrier()
|
117
283
|
return self
|
118
284
|
|
119
285
|
def wait(self, signal:AMDSignal, value:sint=0):
|
120
286
|
self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
|
287
|
+
if self.dev.xccs > 1: self.xcc_barrier()
|
121
288
|
return self
|
122
289
|
|
123
290
|
def timestamp(self, signal:AMDSignal):
|
124
|
-
self.
|
291
|
+
with self.pred_exec(xcc_mask=0b1):
|
292
|
+
self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
|
125
293
|
return self
|
126
294
|
|
127
295
|
def signal(self, signal:AMDSignal, value:sint=0):
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
296
|
+
with self.pred_exec(xcc_mask=0b1):
|
297
|
+
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
|
298
|
+
self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
|
299
|
+
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
|
300
|
+
|
301
|
+
if (dev:=signal.owner) is not None and signal.is_timeline and not dev.is_am():
|
302
|
+
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
|
303
|
+
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
|
135
304
|
return self
|
136
305
|
|
137
306
|
def bind(self, dev:AMDDevice):
|
138
307
|
self.binded_device = dev
|
139
308
|
self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
140
|
-
hw_view =
|
309
|
+
hw_view = self.hw_page.cpu_view().view(fmt='I')
|
141
310
|
for i, value in enumerate(self._q): hw_view[i] = value
|
142
311
|
|
143
|
-
self.indirect_cmd = [
|
144
|
-
len(self._q) |
|
312
|
+
self.indirect_cmd = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
|
313
|
+
len(self._q) | self.pm4.INDIRECT_BUFFER_VALID]
|
145
314
|
self._q = hw_view
|
146
315
|
return self
|
147
316
|
|
148
317
|
def _submit(self, dev:AMDDevice):
|
149
318
|
cmds = self.indirect_cmd if dev == self.binded_device else self._q
|
319
|
+
# WORKAROUND: PACKET3_PRED_EXEC doesn't work in rings, only in IBs, create a fake IB inside a ring to work around that
|
320
|
+
if self.dev.xccs > 1 and dev != self.binded_device:
|
321
|
+
ib_end = ((dev.compute_queue.put_value + 5) % len(dev.compute_queue.ring)) + len(cmds)
|
322
|
+
ib_pad = len(dev.compute_queue.ring) - (ib_end - len(cmds)) if ib_end > len(dev.compute_queue.ring) else 0
|
323
|
+
ib_ptr = dev.compute_queue.ring.addr + ((dev.compute_queue.put_value + 5 + ib_pad) % len(dev.compute_queue.ring)) * 4
|
324
|
+
cmds = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(ib_ptr), len(cmds) | self.pm4.INDIRECT_BUFFER_VALID,
|
325
|
+
self.pm4.PACKET3(self.pm4.PACKET3_NOP, ib_pad + len(cmds) - 1), *((0,) * ib_pad), *cmds]
|
150
326
|
|
151
327
|
for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
|
152
328
|
|
153
329
|
dev.compute_queue.put_value += len(cmds)
|
154
|
-
dev.compute_queue.signal_doorbell()
|
330
|
+
dev.compute_queue.signal_doorbell(dev)
|
155
331
|
|
156
332
|
class AMDCopyQueue(HWQueue):
|
157
|
-
def __init__(self, max_copy_size=0x40000000):
|
158
|
-
self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
|
333
|
+
def __init__(self, dev, max_copy_size=0x40000000):
|
334
|
+
self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size
|
159
335
|
super().__init__()
|
160
336
|
|
161
337
|
def q(self, *arr):
|
@@ -168,47 +344,47 @@ class AMDCopyQueue(HWQueue):
|
|
168
344
|
for _ in range(copy_commands):
|
169
345
|
step_copy_size = min(copy_size - copied, self.max_copy_size)
|
170
346
|
|
171
|
-
self.q(
|
172
|
-
|
347
|
+
self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
|
348
|
+
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
|
173
349
|
|
174
350
|
copied += step_copy_size
|
175
351
|
return self
|
176
352
|
|
177
353
|
def signal(self, signal:AMDSignal, value:sint=0):
|
178
|
-
self.
|
354
|
+
fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.target >= (10,0,0) else 0
|
355
|
+
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)
|
179
356
|
|
180
|
-
if
|
181
|
-
self.q(
|
182
|
-
self.q(
|
183
|
-
elif
|
357
|
+
if (dev:=signal.owner) is not None and signal.is_timeline and not dev.is_am():
|
358
|
+
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
|
359
|
+
self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
|
360
|
+
elif dev is not None and dev.is_am(): self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
|
184
361
|
|
185
362
|
return self
|
186
363
|
|
187
364
|
def wait(self, signal:AMDSignal, value:sint=0):
|
188
|
-
self.q(
|
189
|
-
|
190
|
-
|
365
|
+
self.q(self.sdma.SDMA_OP_POLL_REGMEM | self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
366
|
+
self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
|
367
|
+
self.sdma.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | self.sdma.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
|
191
368
|
return self
|
192
369
|
|
193
370
|
def timestamp(self, signal:AMDSignal):
|
194
|
-
self.q(
|
371
|
+
self.q(self.sdma.SDMA_OP_TIMESTAMP | self.sdma.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
|
195
372
|
*data64_le(signal.timestamp_addr))
|
196
373
|
return self
|
197
374
|
|
198
375
|
def bind(self, dev:AMDDevice):
|
199
|
-
if not getenv("AMD_SDMA_BIND", 0) or not dev.
|
376
|
+
if not getenv("AMD_SDMA_BIND", 0) or not dev.is_am(): return
|
200
377
|
|
201
378
|
self.binded_device = dev
|
202
379
|
self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
203
|
-
hw_view =
|
380
|
+
hw_view = self.hw_page.cpu_view().view(fmt='I')
|
204
381
|
for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0
|
205
382
|
|
206
|
-
self.indirect_cmd = [
|
383
|
+
self.indirect_cmd = [self.sdma.SDMA_OP_INDIRECT | self.sdma.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz,
|
384
|
+
*data64_le(0)]
|
207
385
|
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
|
208
386
|
|
209
387
|
def _submit(self, dev:AMDDevice):
|
210
|
-
if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
|
211
|
-
|
212
388
|
if self.binded_device == dev:
|
213
389
|
# An IB packet must end on a 8 DW boundary.
|
214
390
|
add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
|
@@ -223,90 +399,122 @@ class AMDCopyQueue(HWQueue):
|
|
223
399
|
if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
|
224
400
|
tail_blit_dword += cmdsz
|
225
401
|
|
402
|
+
# Force align of submits to hit our usb layer write cache.
|
403
|
+
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0
|
404
|
+
|
405
|
+
# USB devices run in single-step mode, so they can't overrun the queue.
|
406
|
+
total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4
|
407
|
+
assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun"
|
408
|
+
while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass
|
409
|
+
|
226
410
|
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
|
227
411
|
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
|
228
412
|
dev.sdma_queue.put_value += tail_blit_dword * 4
|
229
413
|
|
230
414
|
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
|
231
415
|
zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
|
232
|
-
|
416
|
+
dev.sdma_queue.ring.view(dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes, zero_fill, fmt='B')[:] = bytes(zero_fill)
|
233
417
|
dev.sdma_queue.put_value += zero_fill
|
234
418
|
|
235
419
|
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
|
236
420
|
dev.sdma_queue.put_value += rem_packet_cnt * 4
|
237
421
|
|
238
|
-
dev.sdma_queue.signal_doorbell()
|
422
|
+
dev.sdma_queue.signal_doorbell(dev)
|
239
423
|
|
240
424
|
class AMDProgram(HCQProgram):
|
241
425
|
def __init__(self, dev:AMDDevice, name:str, lib:bytes):
|
242
426
|
# TODO; this API needs the type signature of the function and global_size/local_size
|
243
|
-
self.dev
|
244
|
-
self.name, self.lib = name, lib
|
245
|
-
image, sections, _ = elf_loader(self.lib)
|
246
|
-
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
|
247
|
-
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
|
427
|
+
self.dev, self.name, self.lib = dev, name, lib
|
248
428
|
|
249
|
-
|
250
|
-
self.
|
251
|
-
self.
|
252
|
-
self.
|
429
|
+
image, sections, _ = elf_loader(self.lib)
|
430
|
+
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
431
|
+
self.dev.allocator._copyin(self.lib_gpu, image)
|
432
|
+
self.dev.synchronize()
|
253
433
|
|
434
|
+
rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
|
435
|
+
text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
|
436
|
+
assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
|
437
|
+
self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
|
438
|
+
self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
|
439
|
+
self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
|
254
440
|
lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
|
255
|
-
if lds_size > (self.dev.
|
441
|
+
if lds_size > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
|
256
442
|
|
257
443
|
# Ensure scratch size
|
258
444
|
self.dev._ensure_has_local_memory(self.private_segment_size)
|
259
445
|
|
260
|
-
|
261
|
-
|
446
|
+
# NOTE: this is wrong, it's not this object. pad it, since it might be smaller than the struct
|
447
|
+
code = hsa.amd_kernel_code_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+256]) + b'\x00'*256)
|
448
|
+
self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400
|
262
449
|
|
263
450
|
# Set rsrc1.priv=1 on gfx11 to workaround cwsr.
|
264
|
-
self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if
|
451
|
+
self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if (11,0,0) <= self.dev.target < (12,0,0) else 0)
|
265
452
|
self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
|
266
|
-
self.
|
267
|
-
|
453
|
+
self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct
|
454
|
+
self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
|
455
|
+
if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
|
268
456
|
# Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
|
269
457
|
# The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
|
270
458
|
self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
|
271
459
|
self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
|
272
460
|
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
|
273
461
|
|
274
|
-
|
462
|
+
if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2
|
275
463
|
|
276
|
-
|
277
|
-
|
464
|
+
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
|
465
|
+
base=self.lib_gpu.va_addr)
|
466
|
+
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
|
278
467
|
|
279
468
|
class AMDAllocator(HCQAllocator['AMDDevice']):
|
469
|
+
def __init__(self, dev:AMDDevice):
|
470
|
+
super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None)
|
471
|
+
if hasattr(dev.iface, "as_dmaref"): self._as_dmaref = dev.iface.as_dmaref
|
472
|
+
self.supports_copy_from_disk = not dev.is_usb()
|
473
|
+
|
280
474
|
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
|
281
|
-
return self.dev.
|
475
|
+
return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
|
282
476
|
|
477
|
+
@suppress_finalizing
|
283
478
|
def _free(self, opaque, options:BufferSpec):
|
284
479
|
self.dev.synchronize()
|
285
|
-
self.dev.
|
480
|
+
self.dev.iface.free(opaque)
|
286
481
|
|
287
|
-
def
|
482
|
+
def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)
|
288
483
|
|
289
|
-
|
484
|
+
@dataclass(frozen=True)
|
485
|
+
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
|
290
486
|
|
291
487
|
@dataclass
|
292
488
|
class AMDQueueDesc:
|
293
|
-
ring:
|
294
|
-
|
295
|
-
|
296
|
-
|
489
|
+
ring: MMIOInterface
|
490
|
+
read_ptrs: list[MMIOInterface]
|
491
|
+
write_ptrs: list[MMIOInterface]
|
492
|
+
doorbells: list[MMIOInterface]
|
297
493
|
put_value: int = 0
|
298
494
|
|
299
|
-
|
300
|
-
|
495
|
+
@property
|
496
|
+
def read_ptr(self): return min(p[0] for p in self.read_ptrs)
|
497
|
+
|
498
|
+
@classmethod
|
499
|
+
def multi(cls, *queues: AMDQueueDesc):
|
500
|
+
assert all_same([(q.ring.addr, q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
|
501
|
+
return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
|
502
|
+
read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))
|
503
|
+
|
504
|
+
def signal_doorbell(self, dev):
|
505
|
+
for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value
|
301
506
|
|
302
507
|
# Ensure all prior writes are visible to the GPU.
|
303
|
-
|
304
|
-
|
508
|
+
System.memory_barrier()
|
509
|
+
|
510
|
+
# Flush hdp if queue is in dev mem.
|
511
|
+
if dev.is_am() and not dev.is_usb(): dev.iface.dev_impl.gmc.flush_hdp()
|
512
|
+
for doorbell in self.doorbells: doorbell[0] = self.put_value
|
305
513
|
|
306
514
|
class KFDIface:
|
307
|
-
kfd:
|
515
|
+
kfd:FileIOInterface|None = None
|
308
516
|
event_page:HCQBuffer|None = None
|
309
|
-
gpus:list[
|
517
|
+
gpus:list[FileIOInterface] = []
|
310
518
|
|
311
519
|
def _is_usable_gpu(self, gpu_id):
|
312
520
|
with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
|
@@ -319,17 +527,23 @@ class KFDIface:
|
|
319
527
|
|
320
528
|
# Initialize KFD interface during first run
|
321
529
|
if KFDIface.kfd is None:
|
322
|
-
KFDIface.kfd =
|
323
|
-
gpus = [g for g in
|
530
|
+
KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR)
|
531
|
+
gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
|
324
532
|
gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
|
325
533
|
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
326
534
|
KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
|
327
535
|
|
328
536
|
if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
|
329
537
|
|
330
|
-
self.gpu_id = int(
|
331
|
-
self.props = {l.split()[0]: int(
|
332
|
-
|
538
|
+
self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
|
539
|
+
self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
|
540
|
+
ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
|
541
|
+
id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
|
542
|
+
ip_hw = [(id2ip[int(hwid)], int(hwid)) for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip]
|
543
|
+
self.ip_versions = {ip:tuple(int(FileIOInterface(f'{ip_base}/{hw}/0/{part}').read()) for part in ['major','minor','revision']) for ip,hw in ip_hw}
|
544
|
+
self.ip_offsets = {ip:{int(i):tuple(int(x, 16) for x in FileIOInterface(f'{ip_base}/{hw}/{i}/base_addr').read().splitlines())
|
545
|
+
for i in FileIOInterface(f'{ip_base}/{hw}').listdir()} for ip,hw in ip_hw }
|
546
|
+
self.drm_fd = FileIOInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
|
333
547
|
|
334
548
|
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
|
335
549
|
|
@@ -349,7 +563,7 @@ class KFDIface:
|
|
349
563
|
self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
|
350
564
|
self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
|
351
565
|
|
352
|
-
def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
|
566
|
+
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, cpu_addr=None) -> HCQBuffer:
|
353
567
|
flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
354
568
|
|
355
569
|
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
|
@@ -358,56 +572,60 @@ class KFDIface:
|
|
358
572
|
if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
|
359
573
|
|
360
574
|
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
|
361
|
-
buf = addr =
|
362
|
-
else: buf, addr = 0,
|
363
|
-
assert addr != 0xffffffffffffffff
|
575
|
+
buf = addr = cpu_addr or FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
|
576
|
+
else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
|
364
577
|
|
365
578
|
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
|
366
579
|
flags=flags, mmap_offset=buf)
|
367
580
|
except OSError as e:
|
368
581
|
if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
|
369
582
|
raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
|
370
|
-
if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate
|
583
|
+
if e.errno == errno.ENOMEM: raise MemoryError(f"Cannot allocate {size} bytes: no memory is available.") from e
|
371
584
|
raise
|
372
585
|
|
373
586
|
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
|
374
587
|
buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
|
375
588
|
assert addr == buf == mem.va_addr
|
376
589
|
|
377
|
-
|
590
|
+
view = MMIOInterface(mem.va_addr, mem.size, fmt='B') if cpu_access or host else None
|
591
|
+
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem, view=view, owner=self.dev))
|
378
592
|
return hcqbuf
|
379
593
|
|
380
594
|
def free(self, mem):
|
381
|
-
if len(
|
382
|
-
|
383
|
-
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(
|
595
|
+
if len(mem.mapped_devs) > 0:
|
596
|
+
gpus = (ctypes.c_int32 * len(mem.mapped_devs))(*[x.iface.gpu_id for x in mem.mapped_devs])
|
597
|
+
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(gpus), n_devices=len(gpus))
|
384
598
|
assert stm.n_success == len(gpus)
|
385
|
-
if mem.va_addr:
|
599
|
+
if mem.va_addr: FileIOInterface.munmap(mem.va_addr, mem.size)
|
386
600
|
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
|
387
601
|
|
602
|
+
def as_dmaref(self, mem:HCQBuffer) -> DMAFdRef:
|
603
|
+
base = mem._base if mem._base is not None else mem
|
604
|
+
dmaref = DMAFdRef(kfd.AMDKFD_IOC_EXPORT_DMABUF(KFDIface.kfd, handle=base.meta.handle, flags=0).dmabuf_fd, mem.va_addr-base.va_addr, mem.size)
|
605
|
+
weakref.finalize(dmaref, os.close, dmaref.fd)
|
606
|
+
return dmaref
|
607
|
+
|
388
608
|
def map(self, mem):
|
389
|
-
if
|
390
|
-
|
391
|
-
c_gpus = (ctypes.c_int32 *
|
392
|
-
stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
|
397
|
-
cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
|
609
|
+
if mem.owner is not None and mem.owner._is_cpu(): return self.alloc(mem.size, host=True, cpu_addr=mem.va_addr)
|
610
|
+
|
611
|
+
c_gpus = (ctypes.c_int32 * 1)(self.gpu_id)
|
612
|
+
stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=1)
|
613
|
+
assert stm.n_success == 1
|
614
|
+
|
615
|
+
def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
|
398
616
|
queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
|
399
|
-
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
617
|
+
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
400
618
|
eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
|
401
|
-
ctx_save_restore_address=
|
402
|
-
write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
|
619
|
+
ctx_save_restore_address=cwsr_buffer.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
|
620
|
+
write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8 * (xcc_id + 1))
|
403
621
|
|
404
622
|
if not hasattr(self, 'doorbells'):
|
405
623
|
self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
406
|
-
self.doorbells = cast(
|
624
|
+
self.doorbells = cast(FileIOInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
|
407
625
|
|
408
|
-
return AMDQueueDesc(ring=
|
409
|
-
|
410
|
-
|
626
|
+
return AMDQueueDesc(ring=MMIOInterface(ring.va_addr, ring.size, fmt='I'), read_ptrs=[MMIOInterface(queue.read_pointer_address, 8, fmt='Q')],
|
627
|
+
write_ptrs=[MMIOInterface(queue.write_pointer_address, 8, fmt='Q')],
|
628
|
+
doorbells=[MMIOInterface(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8, fmt='Q')])
|
411
629
|
|
412
630
|
def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
|
413
631
|
|
@@ -424,212 +642,211 @@ class KFDIface:
|
|
424
642
|
|
425
643
|
raise RuntimeError("\n".join(report))
|
426
644
|
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
class PCIIface:
|
431
|
-
supported_devs:list[int] = [0x744c, 0x7480]
|
432
|
-
vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
|
433
|
-
vfio_fd:HWInterface
|
434
|
-
gpus:list[Any] = []
|
645
|
+
class PCIIface(PCIIfaceBase):
|
646
|
+
gpus:ClassVar[list[str]] = []
|
435
647
|
|
436
648
|
def __init__(self, dev, dev_id):
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
|
442
|
-
device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
|
443
|
-
if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
|
444
|
-
|
445
|
-
- # TODO: visible_devices should be handled layer above this?
- visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
- PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
-
- self.pcibus = PCIIface.gpus[dev_id]
-
- # Unbind the device from the kernel driver
- if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
-
- supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
-
- # Try to init vfio. Use it if success.
- if PCIIface.vfio:
- try:
- if first_dev:
- HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
- PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
- vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+ super().__init__(dev, dev_id, vendor=0x1002, devices=[0x744c, 0x7480, 0x7550], bars=[0, 2, 5], vram_bar=0,
+ va_start=AMMemoryManager.va_allocator.base, va_size=AMMemoryManager.va_allocator.size)
+ self._setup_adev(self.pci_dev.pcibus, self.pci_dev.map_bar(0), self.pci_dev.map_bar(2, fmt='Q'), self.pci_dev.map_bar(5, fmt='I'))
+ self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)

-
-
+ def _setup_adev(self, name, vram:MMIOInterface, doorbell:MMIOInterface, mmio:MMIOInterface, dma_regions:list[tuple[int, MMIOInterface]]|None=None):
+ self.dev_impl:AMDev = AMDev(name, vram, doorbell, mmio, dma_regions)
+ self.ip_offsets, self.ip_versions = self.dev_impl.regs_offset, self.dev_impl.ip_ver

-
-
-
-
+ gfxver = int(f"{self.dev_impl.ip_ver[am.GC_HWIP][0]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][1]:02d}{self.dev_impl.ip_ver[am.GC_HWIP][2]:02d}")
+ array_count = self.dev_impl.gc_info.gc_num_sa_per_se * self.dev_impl.gc_info.gc_num_se
+ simd_count = 2 * array_count * (self.dev_impl.gc_info.gc_num_wgp0_per_sa + self.dev_impl.gc_info.gc_num_wgp1_per_sa)
+ self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': gfxver,
+ 'max_slots_scratch_cu': self.dev_impl.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.dev_impl.gc_info.gc_max_waves_per_simd,
+ 'simd_arrays_per_engine': self.dev_impl.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.dev_impl.gc_info.gc_lds_size}

-
-
- self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
- vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
+ assert cwsr_buffer is None, "no cwsr buffer for am"

-
- self.
-
-
- self.
-
-
- irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
- argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
- vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
- else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
-
- self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
- self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
- self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
-
- bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
- self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
-
- self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
- self.doorbell_cpu_addr = mv_address(dbell)
-
- pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
- self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
-
- array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
- simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
- self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': self.adev.ip_versions[am.GC_HWIP],
- 'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
- 'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
+ self.dev_impl.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+ doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
+ else:
+ self.dev_impl.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+ eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

-
-
- libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
- return to_mv(loc, sz)
+ return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.dev_impl.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
+ read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
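Note: the AMDQueueDesc built above only bundles memory views (a uint32 ring, uint64 doorbell, and read/write pointer views). As a rough, hedged sketch of how such a descriptor is typically driven (the helper name and plain-list stand-ins are illustrative assumptions, not code from this diff):

def submit_packets(ring, write_ptrs, doorbells, words, wptr):
    # copy the packet dwords into the ring, wrapping at the ring size
    for i, w in enumerate(words): ring[(wptr + i) % len(ring)] = w
    wptr += len(words)
    for p in write_ptrs: p[0] = wptr   # publish the new write pointer for the GPU to read back
    for d in doorbells: d[0] = wptr    # ring the doorbell so the engine starts fetching
    return wptr

# toy usage with plain lists standing in for the MMIO views
ring, wptr_slot, bell = [0]*64, [0], [0]
assert submit_packets(ring, [wptr_slot], [bell], [1, 2, 3], 0) == 3 and bell[0] == 3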

- def
- if
-
-
+ def sleep(self, timeout):
+ if self.pci_dev.irq_poller is not None and (events_cnt:=len(self.pci_dev.irq_poller.poll(timeout))):
+ self.pci_dev.irq_fd.read(8 * events_cnt)
+ self.dev_impl.ih.interrupt_handler()

-
-
-
-
- return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+ def on_device_hang(self):
+ devs:list[AMDDevice] = [d for pg in HCQCompiled.peer_groups.values() for d in pg if isinstance(d, AMDDevice) and d.is_am()]
+ for d in devs: d.iface.dev_impl.gmc.on_interrupt()
+ raise RuntimeError("Device hang detected")

-
- if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
- return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+ def device_fini(self): self.dev_impl.fini()

-
-
-
+ class USBIface(PCIIface):
+ def __init__(self, dev, dev_id):
+ self.dev = dev
+ self.usb = ASM24Controller()
+ self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30))

-
-
-
- mem.meta.mapped_devs.append(self.dev)
+ self._setup_adev(f"usb:{dev_id}", USBMMIOInterface(self.usb, *self.bars[0], fmt='B'), USBMMIOInterface(self.usb, *self.bars[2], fmt='Q'),
+ USBMMIOInterface(self.usb, *self.bars[5], fmt='I'), dma_regions=[(0x200000, self._dma_view(0xf000, 0x80000))])
+ self.usb._pci_cacheable += [self.bars[2]] # doorbell region is cacheable

-
- self.
+ # special regions
+ self.copy_bufs = [self._dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x80000)]
+ self.sys_buf, self.sys_next_off = self._dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0x800

- def
-
-
-
- else:
- self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
- eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
+ def _dma_view(self, ctrl_addr, size): return USBMMIOInterface(self.usb, ctrl_addr, size, fmt='B', pcimem=False)
+ def _dma_region(self, ctrl_addr, sys_addr, size):
+ region = self.dev_impl.mm.map_range(vaddr:=self.dev_impl.mm.alloc_vaddr(size=size), size, [(sys_addr, size)], system=True, uncached=True)
+ return HCQBuffer(vaddr, size, meta=PCIAllocationMeta(region, has_cpu_mapping=False), view=self._dma_view(ctrl_addr, size), owner=self.dev)

-
-
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
+ if (host or (uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size:
+ self.sys_next_off += size
+ return self.sys_buf.offset(self.sys_next_off - size, size)

-
-
- self.
- self.adev.ih.interrupt_handler()
+ am_mapping = self.dev_impl.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contiguous=cpu_access)
+ return HCQBuffer(am_mapping.va_addr, size, meta=PCIAllocationMeta(am_mapping, has_cpu_mapping=False),
+ view=USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None, owner=self.dev)
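Note: USBIface.alloc above is essentially a small bump allocator over the fixed sys_buf window for host-visible requests, falling back to dev_impl.mm.valloc otherwise. A standalone sketch of just the bump step, with the window constants copied from __init__ above (the function name is hypothetical):

SYS_BUF_SIZE, sys_next_off = 0x1000, 0x800                # window size and starting offset
def bump_alloc(size):
    global sys_next_off
    if sys_next_off + size >= SYS_BUF_SIZE: return None   # no room left: caller falls back to valloc
    sys_next_off += size
    return sys_next_off - size                            # offset of the new sub-buffer inside sys_buf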

- def
-
-
+ def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
+ if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE: self.usb._pci_cacheable += [(ring.cpu_view().addr, ring.size)]
+ return super().create_queue(queue_type, ring, gart, eop_buffer, cwsr_buffer, ctl_stack_size, ctx_save_restore_size, xcc_id)

- def
+ def sleep(self, timeout): pass

class AMDDevice(HCQCompiled):
-
-
- signals_pool:list[int] = []
+ def is_am(self) -> bool: return isinstance(self.iface, (PCIIface, USBIface))
+ def is_usb(self) -> bool: return isinstance(self.iface, USBIface)

def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
- self.
- self.target =
- self.arch = "gfx%d%x%x" %
- if self.target <
-
-
-
-
-
-
-
- self.
- self.has_scratch_base_registers = self.target >= 110000
+ self.iface = self._select_iface(KFDIface, PCIIface, USBIface)
+ self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
+ self.arch = "gfx%d%x%x" % self.target
+ if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")
+ if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")
+
+ self.max_cu_id = self.iface.props['simd_count'] // self.iface.props['simd_per_cu'] // self.iface.props.get('num_xcc', 1) - 1
+ self.max_wave_id = (self.iface.props['max_waves_per_simd'] * self.iface.props['simd_per_cu'] - 1) if self.target >= (10,1,0) else \
+ (min((self.max_cu_id+1)*40, self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine'] * 512) - 1)
+ self.xccs = self.iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1
+ # this is what llvm refers to as "architected flat scratch"
+ self.has_scratch_base_registers = self.target >= (11,0,0) or self.target in {(9,4,2), (9,5,0)}
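Note: the 'gfx_target_version' value reported by the iface props and the target/arch values above round-trip through simple decimal packing, with the minor and patch components rendered in hex for the arch string. A small worked example, using a hypothetical GC IP version:

ip_ver = (11, 0, 0)                                              # hypothetical GC_HWIP version from the iface
gfxver = int(f"{ip_ver[0]:02d}{ip_ver[1]:02d}{ip_ver[2]:02d}")   # packs to 110000, as in the AM props above
target = (gfxver // 10000, (gfxver // 100) % 100, gfxver % 100)  # unpacks back to (11, 0, 0)
assert "gfx%d%x%x" % target == "gfx1100"                         # hex matters: (9, 0, 10) would render as "gfx90a"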

# https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
-
+ if self.target[:2] == (9,5): lds_size_per_cu = self.iface.props["lds_size_in_kb"] << 10
+ vgpr_size_per_cu = 0x60000 if self.target in {(11,0,0), (11,0,1), (12,0,0), (12,0,1)} else \
+ 0x80000 if (self.target[:2]) in {(9,4), (9,5)} or self.target in {(9,0,8), (9,0,10)} else 0x40000
wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
- ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
-
+ ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE) if self.target >= (10,1,0) else \
+ round_up((self.max_wave_id + 1) * 8 + 8 + 40, mmap.PAGESIZE)
+ debug_memory_size = round_up((self.max_cu_id + 1 if self.target >= (10,1,0) else 1) * (self.max_wave_id + 1) * 32, 64)
+ if self.target[0] == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
+
+ self.soc = import_soc(self.target)
+ self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if self.target[0] >= 10 else 'soc15'}")
+ self.sdma = import_module('sdma', min(self.iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
+ self.gc = AMDIP('gc', self.iface.ip_versions[am.GC_HWIP], self.iface.ip_offsets[am.GC_HWIP])
+
+ # Define the regCOMPUTE_CURRENT_LOGIC_XCC_ID register, which is missing from the asic_regs files.
+ if self.target[:2] in {(9,4),(9,5)}: self.regCOMPUTE_CURRENT_LOGIC_XCC_ID = AMDReg("regCOMPUTE_CURRENT_LOGIC_XCC_ID", 0xe25, 0, {}, self.gc.bases)
+
+ nbio_name = 'nbio' if self.target[0] < 12 else 'nbif'
+ nbio_pad = (0,) if self.target[0] == 9 else ()
+ self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], {i:nbio_pad+x for i,x in self.iface.ip_offsets[am.NBIF_HWIP].items()})

- self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE,
-
+ self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
+ ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

-
+ max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
+ self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))

- super().__init__(device, AMDAllocator(self),
-
+ super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if AMD_LLVM else AMDRenderer(self.arch),
+ AMDLLVMCompiler(self.arch) if AMD_LLVM else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
+ AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
+ kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)

# Scratch setup
self.max_private_segment_size = 0
self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

+ # XCC setup
+ self.xcc_sync: tuple[AMDSignal, AMDSignal]|None = None
+ if self.xccs > 1:
+ self.xcc_sync_area = self.allocator.alloc(0x1000, BufferSpec(nolru=True, cpu_access=True))
+ self.xcc_sync = (AMDSignal(base_buf=self.xcc_sync_area), AMDSignal(base_buf=self.xcc_sync_area.offset(256)))
+ AMDComputeQueue(self).xcc_config().submit(self)
+
+ # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
+ self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
+ if self.sqtt_enabled:
+ if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
+ if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
+ raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
+ f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
+ "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
+ SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
+ SQTT_NUM = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine']
+ self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
+ self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
+ self.cmd_id = 0
+ AMDComputeQueue(self).sqtt_start(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
+

def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
- ring = self.
- gart = self.
-
-
-
+ ring = self.iface.alloc(ring_size, uncached=True, cpu_access=True)
+ gart = self.iface.alloc(0x100, uncached=True, cpu_access=True)
+
+ cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.iface.props.get('num_xcc', 1), mmap.PAGESIZE)
+ cwsr_buffer = self.iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None
+ eop_buffer = self.iface.alloc(eop_buffer_size) if eop_buffer_size else None
+
+ return AMDQueueDesc.multi(*(self.iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer, xcc_id=xcc_id,
+ ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
+ for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))

def _ensure_has_local_memory(self, required):
if self.max_private_segment_size >= required: return

# <gfx103 requires alignment of 1024, >=gfx11 requires 256
- wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >=
+ wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= (11,0,0) else 1024)

-
+ scratch_size = (self.max_cu_id+1)*self.iface.props['max_slots_scratch_cu']*wave_scratch_len # per xcc
+ self.scratch, ok = self._realloc(getattr(self, 'scratch', None), scratch_size*self.xccs)
if ok:
- engines = self.
- waves = wave_scratch_len // (256 if self.target >=
+ engines = self.iface.props['array_count'] // self.iface.props['simd_arrays_per_engine']
+ waves = wave_scratch_len // (256 if self.target >= (11,0,0) else 1024)
# >=gfx11 wavesize is per SE
- wavesize =
+ wavesize = scratch_size // ((wave_scratch_len * engines) if self.target >= (11,0,0) else wave_scratch_len)
self.tmpring_size = waves << 12 | wavesize
self.max_private_segment_size = required
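Note: tmpring_size above is a packed bit-field: the expression places waves shifted left by 12 alongside wavesize in the low bits. A tiny numeric check with made-up values:

waves, wavesize = 32, 0x100             # hypothetical results of the computation above
tmpring_size = waves << 12 | wavesize   # -> 0x20100
assert tmpring_size >> 12 == waves and (tmpring_size & 0xFFF) == wavesize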

def invalidate_caches(self):
- AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.
- self.timeline_value += 1
+ AMDComputeQueue(self).memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
self.synchronize()

- def on_device_hang(self): self.
-
- def
- self.
-
+ def on_device_hang(self): self.iface.on_device_hang()
+
+ def _at_profile_finalize(self):
+ if self.sqtt_enabled:
+ wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
+ wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
+ AMDComputeQueue(self).sqtt_stop(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
+ self.synchronize()
+ if DEBUG>=2: print('Saving SQTT in profile...')
+ for i,buf0 in enumerate(self.sqtt_buffers):
+ wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
+ if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
+ assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
+ # When sqtt buffer overflows, wptr stops at the last dword
+ if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
+ self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
+ Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
+ super()._at_profile_finalize()
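Note: the wptr arithmetic in _at_profile_finalize above turns the 29-bit SQTT write pointer, which is kept in 32-byte units, into a byte count relative to the buffer base. A worked example with hypothetical addresses:

va_addr, wptr_reg = 0x7000_0000, 0x0380_1000                                # hypothetical buffer base and recorded wptr value
blob_size = ((wptr_reg & 0x1FFFFFFF) - ((va_addr // 32) & 0x1FFFFFFF)) * 32
assert blob_size == 0x20000                                                 # 128 KiB of trace data for this shader engine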