tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +141 -201
- tinygrad/codegen/linearize.py +223 -84
- tinygrad/codegen/lowerer.py +60 -42
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +22 -13
- tinygrad/device.py +187 -47
- tinygrad/dtype.py +39 -28
- tinygrad/engine/jit.py +83 -65
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +161 -0
- tinygrad/engine/realize.py +62 -108
- tinygrad/engine/schedule.py +396 -357
- tinygrad/engine/search.py +55 -66
- tinygrad/gradient.py +73 -0
- tinygrad/helpers.py +81 -59
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +91 -66
- tinygrad/ops.py +492 -641
- tinygrad/renderer/__init__.py +95 -36
- tinygrad/renderer/cstyle.py +99 -92
- tinygrad/renderer/llvmir.py +83 -34
- tinygrad/renderer/ptx.py +83 -99
- tinygrad/renderer/wgsl.py +95 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +40 -43
- tinygrad/runtime/ops_amd.py +498 -334
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cpu.py +24 -0
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +159 -42
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +48 -41
- tinygrad/runtime/ops_metal.py +149 -113
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +238 -273
- tinygrad/runtime/ops_python.py +55 -50
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +28 -4
- tinygrad/runtime/support/hcq.py +256 -324
- tinygrad/runtime/support/llvm.py +26 -0
- tinygrad/shape/shapetracker.py +85 -53
- tinygrad/shape/view.py +104 -140
- tinygrad/spec.py +155 -0
- tinygrad/tensor.py +835 -527
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/uopgraph.py +0 -506
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
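
For readers who want to reproduce a file listing like the one above locally, here is a minimal stdlib-only sketch that compares the member lists of two wheel files. It is illustrative and not part of tinygrad; the wheel paths are assumptions, so download the real files from PyPI first.

# Minimal sketch: diff the member lists of two wheels (stdlib only).
# The wheel filenames below are hypothetical placeholders.
import zipfile

def wheel_members(path: str) -> set[str]:
  with zipfile.ZipFile(path) as zf:
    return set(zf.namelist())

old = wheel_members("tinygrad-0.10.0-py3-none-any.whl")
new = wheel_members("tinygrad-0.10.2-py3-none-any.whl")
print("added:", sorted(new - old))
print("removed:", sorted(old - new))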
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,28 +1,23 @@
 from __future__ import annotations
-from typing import
-import os, ctypes, ctypes.util, functools,
+from typing import Any, cast
+import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
 assert sys.platform != 'win32'
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer,
-from tinygrad.
-from tinygrad.
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
+from tinygrad.ops import sint
+from tinygrad.device import BufferSpec, CPUProgram
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
 from tinygrad.renderer.cstyle import AMDRenderer
-from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
+from tinygrad.runtime.autogen.am import am
 from tinygrad.runtime.support.compiler_hip import AMDCompiler
 from tinygrad.runtime.support.elf import elf_loader
+from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
 if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import
-if getenv("MOCKGPU"): import extra.mockgpu.mockgpu  # noqa: F401 # pylint: disable=unused-import
-
-def is_usable_gpu(gpu_id):
-  with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
-  return False
 
 regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
 
-#
-CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
-
-WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
 
 COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
@@ -31,246 +26,224 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
 def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
 
 class AMDSignal(HCQSignal):
-  def __init__(self,
-    self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
-    if is_timeline:
-      self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
-      self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
-      self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
-    else: self._event_mailbox_ptr = 0
-    super().__init__(value)
-  def __del__(self): AMDDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
-  def _set_value(self, new_value:int): self._signal[0] = new_value
-  def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
-    start_time = time.time() * 1000
-    while (time_spent:=time.time() * 1000 - start_time) < timeout:
-      if self._signal[0] >= value: return
-      # Wait active for 5s, then going to sleep.
-      if time_spent > 5000 and self._event_mailbox_ptr != 0:
-        kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
-    raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
-
-class AMDComputeQueue(HWComputeQueue):
-  def __init__(self):
-    self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
-    super().__init__()
+  def __init__(self, base_addr:int|None=None, **kwargs):
+    super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
+
+  def __del__(self):
+    if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
+
+  def _sleep(self, time_spent_waiting_ms:int):
+    # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
+    if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
 
+class AMDComputeQueue(HWQueue):
   def __del__(self):
     if self.binded_device is not None:
-      self.binded_device.allocator.free(self.hw_page, self.hw_page.size,
- …
+      self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+
+  def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
+
+  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
+    wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
+                | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
+    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
+
+  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+    cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
+      | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
+      | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
+      | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
+      | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
+    self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
+
+  def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
+    cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
+      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
+      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
+    event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
+      | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
+    memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
+    self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
+
+  def memory_barrier(self):
+    self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
+    self.acquire_mem()
+    return self
+
+  def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+    self.bind_args_state(args_state)
+    self.acquire_mem(gli=0, gl2=0)
+    if prg.enable_private_segment_sgpr:
+      scratch_hilo = data64_le(prg.dev.scratch.va_addr)
+      # sgpr word1 bit31 enables swizzle
+      # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
+      user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000] if prg.enable_private_segment_sgpr else []
+    else: user_regs = []
     if prg.enable_dispatch_ptr:
       dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
- …
+      self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
+      self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
       dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
       user_regs += [*data64_le(dp_addr)]
     user_regs += [*data64_le(args_state.ptr)]
 
-    dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
- …
-  def _wait(self, signal, value=0):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
-      amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-      amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
-
-  def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
-
-  def _signal(self, signal, value=0):
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
+    if prg.dev.has_scratch_base_registers:
+      self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
+    if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
+    self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
+    self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
+    return self
+
+  def wait(self, signal:AMDSignal, value:sint=0):
+    self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
+    return self
+
+  def timestamp(self, signal:AMDSignal):
+    self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
+    return self
+
+  def signal(self, signal:AMDSignal, value:sint=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
- …
-    if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
-    # Check if the signal command has mailptr part
-    if signal is not None and self.cmds_len[cmd_idx] > 8:
-      self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
-
-  def bind(self, device):
-    self.binded_device = device
-    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
+    self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                     amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
+
+    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+      self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                       amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
+    return self
+
+  def bind(self, dev:AMDDevice):
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
-    for i, value in enumerate(self.
+    for i, value in enumerate(self._q): hw_view[i] = value
 
     self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
-      len(self.
-    self.
+                         len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
+    self._q = hw_view
+    return self
 
-  def _submit(self,
-    cmds = self.indirect_cmd if
+  def _submit(self, dev:AMDDevice):
+    cmds = self.indirect_cmd if dev == self.binded_device else self._q
 
-    for i, value in enumerate(cmds):
- …
-      device.compute_queue.doorbell[0] = device.compute_queue.put_value
+    for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
 
-    self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
+    dev.compute_queue.put_value += len(cmds)
+    dev.compute_queue.signal_doorbell()
+
+class AMDCopyQueue(HWQueue):
+  def __init__(self, max_copy_size=0x40000000):
+    self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
     super().__init__()
 
-  def
+  def q(self, *arr):
+    super().q(*arr)
     self.internal_cmd_sizes.append(len(arr))
 
-  def
-    copied, copy_commands = 0, (copy_size +
+  def copy(self, dest:sint, src:sint, copy_size:int):
+    copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
 
     for _ in range(copy_commands):
-      step_copy_size = min(copy_size - copied,
+      step_copy_size = min(copy_size - copied, self.max_copy_size)
 
-      self.
-        amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)
+      self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+             amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
 
       copied += step_copy_size
+    return self
 
-  def
-    if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
-    if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
+  def signal(self, signal:AMDSignal, value:sint=0):
+    self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
 
-    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
-    self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
+    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
+      self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
+      self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
+    elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
+    return self
 
-  def
-    self.
+  def wait(self, signal:AMDSignal, value:sint=0):
+    self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+           amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
+           amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
+    return self
 
-  def
-      *data64_le(signal._timestamp_addr)])
+  def timestamp(self, signal:AMDSignal):
+    self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+           *data64_le(signal.timestamp_addr))
+    return self
+
+  def bind(self, dev:AMDDevice):
+    if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return
+
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+    for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0
 
-      if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
-      tail_blit_dword += cmdsz
+    self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
+    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
 
-    device.sdma_queue.put_value += tail_blit_dword * 4
+  def _submit(self, dev:AMDDevice):
+    if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
 
- …
+    if self.binded_device == dev:
+      # An IB packet must end on a 8 DW boundary.
+      add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
+      cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]
 
+      if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
+        cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
+    else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes
+
+    tail_blit_dword = 0
+    for cmdsz in cmd_sizes:
+      if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
+      tail_blit_dword += cmdsz
+
+    start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
+    dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
+    dev.sdma_queue.put_value += tail_blit_dword * 4
 
-  def update_var(self, index:int, val:int): self.vals[index] = val
+    if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
+      zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
+      ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
+      dev.sdma_queue.put_value += zero_fill
+
+      dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
+      dev.sdma_queue.put_value += rem_packet_cnt * 4
+
+    dev.sdma_queue.signal_doorbell()
 
 class AMDProgram(HCQProgram):
-  def __init__(self,
+  def __init__(self, dev:AMDDevice, name:str, lib:bytes):
     # TODO; this API needs the type signature of the function and global_size/local_size
-    self.
+    self.dev: AMDDevice = dev
+    self.name, self.lib = name, lib
     image, sections, _ = elf_loader(self.lib)
-    self.lib_gpu = self.
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
 
     entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
@@ -279,43 +252,41 @@ class AMDProgram(HCQProgram):
     self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
 
     lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
-    if lds_size > (self.
+    if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
+
+    # Ensure scratch size
+    self.dev._ensure_has_local_memory(self.private_segment_size)
 
     code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
     assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
 
     # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
-    self.rsrc1 = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.
-    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-    self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+    self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
+    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
+    self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
 
     # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
     # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
-    self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
-    self.enable_private_segment_sgpr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
+    self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+    self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
     additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
 
-    super().__init__(
+    super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
 
   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.
+    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
 
-class AMDAllocator(HCQAllocator):
-  def
+class AMDAllocator(HCQAllocator['AMDDevice']):
+  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
 
-  def
- …
-    return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+  def _free(self, opaque, options:BufferSpec):
+    self.dev.synchronize()
+    self.dev.dev_iface.free(opaque)
 
-  def
-    self.device.synchronize()
-    self.device._gpu_free(opaque)
+  def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)
 
-MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000
 
 @dataclass
 class AMDQueueDesc:
@@ -325,147 +296,340 @@ class AMDQueueDesc:
   doorbell: memoryview
   put_value: int = 0
 
- …
+  def signal_doorbell(self):
+    self.write_ptr[0] = self.put_value
+
+    # Ensure all prior writes are visible to the GPU.
+    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
+    self.doorbell[0] = self.put_value
+
+class KFDIface:
+  kfd:HWInterface|None = None
+  event_page:HCQBuffer|None = None
+  gpus:list[HWInterface] = []
+
+  def _is_usable_gpu(self, gpu_id):
+    with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
+    return False
+
+  def __init__(self, dev, device_id):
+    self.dev = dev
+
+    kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"
+
+    # Initialize KFD interface during first run
+    if KFDIface.kfd is None:
+      KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
+      gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
+      gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
+
+    if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
+
+    self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
+    self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
+    self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
+
+    kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
+
+    # Set these for our device.
+    if KFDIface.event_page is None:
+      KFDIface.event_page = self.alloc(0x8000, uncached=True)
+      kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_page_offset=KFDIface.event_page.meta.handle)
+    else: self.map(KFDIface.event_page)
+
+    # Event to wait for queues completion
+    self.dev.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
+    self.dev.queue_event_mailbox_ptr = KFDIface.event_page.va_addr + self.dev.queue_event.event_slot_index * 8
+    self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.dev.queue_event.event_id)
+    self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)
+
+    # OS events to collect memory and hardware faults
+    self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
+    self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
+
+  def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
+    flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
+
+    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
+    else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+
+    if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
+
     if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
-      buf = addr =
-    else:
-      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
+      buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+    else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
     assert addr != 0xffffffffffffffff
 
     try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
                                                   flags=flags, mmap_offset=buf)
     except OSError as e:
-      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and
+      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
        raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
      raise
 
    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
-      buf =
+      buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
      assert addr == buf == mem.va_addr
-    if map_to_gpu: self._gpu_map(mem)
-    return mem
 
+    self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
+    return hcqbuf
+
+  def free(self, mem):
+    if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
-      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
      assert stm.n_success == len(gpus)
-    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
+    if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
+    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
+
+  def map(self, mem):
+    if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
+    mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
+    c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
+    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
+                                           n_devices=len(mem.meta.mapped_gpu_ids))
+    assert stm.n_success == len(mem.meta.mapped_gpu_ids)
+
+  def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
+    cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
+    queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
+      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+      eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
+      ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
+      write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
 
- …
+    if not hasattr(self, 'doorbells'):
+      self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+      self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
+
+    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
+                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
+                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
+
+  def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
+
+  def on_device_hang(self):
+    def _collect_str(st): return ' '.join(f'{k[0]}={getattr(st, k[0])}' for k in st._fields_)
+
+    report = []
+    for evnt in [self.mem_fault_event, self.hw_fault_event]:
+      ev = (kfd.struct_kfd_event_data)(event_id=evnt.event_id)
+      kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
+      if evnt == self.mem_fault_event and ev.memory_exception_data.gpu_id:
+        report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {_collect_str(ev.memory_exception_data.failure)}"]
+      if evnt == self.hw_fault_event and ev.hw_exception_data.gpu_id: report += [f"HW fault: {_collect_str(ev.hw_exception_data)}"]
+
+    raise RuntimeError("\n".join(report))
+
+@dataclass
+class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
+
+class PCIIface:
+  supported_devs:list[int] = [0x744c, 0x7480]
+  vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
+  vfio_fd:HWInterface
+  gpus:list[Any] = []
+
+  def __init__(self, dev, dev_id):
+    self.dev = dev
+
+    if first_dev:=len(PCIIface.gpus) == 0:
+      for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
+        vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
+        device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
+        if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
+
+      # TODO: visible_devices should be handled layer above this?
      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
+
+    self.pcibus = PCIIface.gpus[dev_id]
+
+    # Unbind the device from the kernel driver
+    if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
+      HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
+
+    supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
+    HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
+
+    # Try to init vfio. Use it if success.
+    if PCIIface.vfio:
+      try:
+        if first_dev:
+          HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
+          PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
+        vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+
+        HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
+        HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
+
+        iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
+      except OSError:
+        if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
+        PCIIface.vfio = False
+
+    # Init vfio for the device
+    if PCIIface.vfio:
+      self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
+      vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
+
+      if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+      self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
+
+      self.irq_fd = HWInterface.eventfd(0, 0)
+      self.irq_poller = select.poll()
+      self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
+
+      irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
+        argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
+      vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+    else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
+
+    self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
+    self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
+    self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
+
+    bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
+    self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
+
+    self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
+    self.doorbell_cpu_addr = mv_address(dbell)
+
+    pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
+    self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
+
+    array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
+    simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
+    self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': self.adev.ip_versions[am.GC_HWIP],
+      'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
+      'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
+
+  def _map_pci_range(self, bar, off=0, addr=0, size=None):
+    fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
+    libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
+    return to_mv(loc, sz)
+
+  def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
+    if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
+      vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
+      va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
+
+      # Read pagemap to get the physical address of each page. The pages are locked.
+      self.pagemap.seek(va // mmap.PAGESIZE * 8)
+      paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
+      am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
+      return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+
+    am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
+    if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
+    return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))
+
+  def free(self, mem):
+    for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
+    if not mem.meta.mapping.system: self.adev.mm.vfree(mem.meta.mapping)
+
+  def map(self, mem):
+    # Check if the memory is already mapped on this device
+    if self.dev in mem.meta.mapped_devs: return
+    mem.meta.mapped_devs.append(self.dev)
+
+    paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
+    self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
+
+  def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
+    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
+      self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+        doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
+    else:
+      self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
+        eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
 
+    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
+                        read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
+
+  def sleep(self, timeout):
+    if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
+      self.irq_fd.read(8 * events_cnt)
+      self.adev.ih.interrupt_handler()
+
+  def on_device_hang(self):
+    for d in self.dev.devices: d.dev_iface.adev.gmc.on_interrupt()
+    raise RuntimeError("Device hang detected")
+
+  def device_fini(self): self.adev.fini()
 
- …
+class AMDDevice(HCQCompiled):
+  driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
+  signals_page:Any = None
+  signals_pool:list[int] = []
+
+  def __init__(self, device:str=""):
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
+    self.target = int(self.dev_iface.props['gfx_target_version'])
    self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
    if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
 
+    if AMDDevice.signals_page is None:
+      AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
+      AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
+    else: self.dev_iface.map(AMDDevice.signals_page)
 
-      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
-      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
-    else:
-      self._gpu_map(AMDDevice.signals_page)
-      self._gpu_map(AMDDevice.event_page)
-
-    # Scratch setup
-    max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
-    max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
-    self.max_private_segment_size = 4096
-    wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
-    self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
-    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+    self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
+    self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
    self.has_scratch_base_registers = self.target >= 110000
-    engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
-    self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
 
    # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
    sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
    vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
-    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (max_cu_id + 1), mmap.PAGESIZE)
-    ctl_stack_size = round_up(12 * (max_cu_id + 1) * (max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
+    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
+    ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
+    debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
 
-    self.compute_queue = self.
-      eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
-    self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+    self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
+                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
 
-    self.
-    self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
+    self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
 
    super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
                     AMDSignal, AMDComputeQueue, AMDCopyQueue)
 
- …
+    # Scratch setup
+    self.max_private_segment_size = 0
+    self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
+
+  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
+    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
+    gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
+    eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
+    return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
+                                       ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
+
+  def _ensure_has_local_memory(self, required):
+    if self.max_private_segment_size >= required: return
+
+    # <gfx103 requires alignment of 1024, >=gfx11 requires 256
+    wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)
+
+    self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
+    if ok:
+      engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
+      waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
+      # >=gfx11 wavesize is per SE
+      wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
+      self.tmpring_size = waves << 12 | wavesize
+    self.max_private_segment_size = required
 
  def invalidate_caches(self):
    AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
    self.synchronize()
 
-  def on_device_hang(self):
-    report = []
-
-    ev = (kfd.struct_kfd_event_data)(event_id=self.mem_fault_event.event_id)
-    kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
-    if ev.memory_exception_data.gpu_id:
-      pfstatus = ' '.join(f'{k[0]}={getattr(ev.memory_exception_data.failure, k[0])}' for k in ev.memory_exception_data.failure._fields_)
-      report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {pfstatus}"]
- …
-      report += [f"HW fault: {' '.join(f'{k[0]}={getattr(ev.hw_exception_data, k[0])}' for k in ev.hw_exception_data._fields_)}"]
-
-    raise RuntimeError("\n".join(report))
+  def on_device_hang(self): self.dev_iface.on_device_hang()
+
+  def finalize(self):
+    self.synchronize()
+    if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()