tinygrad 0.10.0-py3-none-any.whl → 0.10.1-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- tinygrad/codegen/kernel.py +114 -172
- tinygrad/codegen/linearize.py +211 -81
- tinygrad/codegen/lowerer.py +30 -35
- tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
- tinygrad/codegen/transcendental.py +12 -13
- tinygrad/device.py +170 -47
- tinygrad/dtype.py +28 -26
- tinygrad/engine/jit.py +80 -63
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +162 -0
- tinygrad/engine/realize.py +58 -107
- tinygrad/engine/schedule.py +381 -314
- tinygrad/engine/search.py +40 -44
- tinygrad/gradient.py +70 -0
- tinygrad/helpers.py +77 -58
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +89 -64
- tinygrad/ops.py +562 -446
- tinygrad/renderer/__init__.py +79 -36
- tinygrad/renderer/cstyle.py +70 -84
- tinygrad/renderer/llvmir.py +32 -20
- tinygrad/renderer/ptx.py +79 -99
- tinygrad/renderer/wgsl.py +87 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libpciaccess.py +2023 -0
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +19 -21
- tinygrad/runtime/ops_amd.py +488 -327
- tinygrad/runtime/ops_clang.py +15 -28
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +129 -38
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +45 -40
- tinygrad/runtime/ops_metal.py +93 -73
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +232 -270
- tinygrad/runtime/ops_python.py +51 -46
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +63 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +384 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +26 -4
- tinygrad/runtime/support/hcq.py +254 -324
- tinygrad/runtime/support/llvm.py +32 -0
- tinygrad/shape/shapetracker.py +84 -53
- tinygrad/shape/view.py +103 -138
- tinygrad/spec.py +154 -0
- tinygrad/tensor.py +744 -496
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
- tinygrad-0.10.1.dist-info/RECORD +86 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,28 +1,23 @@
- The typing import is now just Any and cast, and the combined import line reads: import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, atexit.
- tinygrad.runtime.support.hcq now supplies HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram and HWInterface; sint comes from tinygrad.ops, BufferSpec from tinygrad.device, and the helpers import is getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX.
- New autogen imports libpciaccess and vfio join kfd, hsa, amd_gpu and libc, alongside tinygrad.runtime.autogen.am and AMDev/AMMapping from tinygrad.runtime.support.am.amdev.
- The MOCKGPU import hook, the module-level is_usable_gpu() helper (it reappears as KFDIface._is_usable_gpu) and the local CACHE_FLUSH_AND_INV_TS_EVENT / WAIT_REG_MEM_FUNCTION_EQ constants are dropped (the former now comes from amd_gpu); EVENT_INDEX_PARTIAL_FLUSH = 4 is added.
@@ -31,246 +26,227 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
- AMDSignal is now a thin HCQSignal subclass: __init__(base_addr=None, **kwargs) pops an address from AMDDevice.signals_pool (timestamp_divider=100), __del__ returns it to the pool, and _sleep() defers to timeline_for_device.dev_iface.sleep(200) once a wait exceeds 2s; the hand-rolled wait loop and per-signal KFD event plumbing are gone.
- AMDComputeQueue is rebuilt on HWQueue with pkt3/wait_reg_mem/acquire_mem/release_mem helpers. memory_barrier, exec, wait, timestamp, signal and bind return self and use BufferSpec instead of BufferOptions; exec() programs the COMPUTE_* registers with PACKET3_SET_SH_REG and dispatches via PACKET3_DISPATCH_DIRECT; signal() also rings the device's queue_event mailbox when not driverless; _submit(dev) writes packets straight into dev.compute_queue's ring, write_ptr and doorbell.
- AMDCopyQueue is likewise HWQueue-based (q/copy/signal/wait/timestamp); bind() only takes effect with AMD_SDMA_BIND=1 in driverless mode, and _submit(dev) handles the 8-DW indirect-buffer alignment, ring wraparound and the SDMA doorbell itself.
- AMDProgram.__init__ now takes (dev, name, lib) and allocates lib_gpu with BufferSpec(cpu_access=True, nolru=True).
@@ -279,43 +255,43 @@ class AMDProgram(HCQProgram):
- AMDProgram reads lds_size_in_kb from dev.dev_iface.props, sizes scratch via dev._ensure_has_local_memory(private_segment_size), annotates rsrc1/rsrc2/prog_addr and the enable_* flags as int, and passes CLikeArgsState to HCQProgram; __del__ frees lib_gpu through dev.allocator with BufferSpec.
- AMDAllocator becomes HCQAllocator['AMDDevice'] with batch_size=SDMA_MAX_COPY_SIZE; _alloc, _free and map delegate to dev.dev_iface instead of issuing KFD ioctls directly.
- MAP_LOCKED (0 on OSX, 0x2000 elsewhere) joins MAP_FIXED and MAP_NORESERVE.
@@ -325,147 +301,332 @@ class AMDQueueDesc:
- New KFDIface class wraps the kernel-driver path: /dev/kfd plus /sys/devices/virtual/kfd topology discovery (with VISIBLE_DEVICES/HIP_VISIBLE_DEVICES filtering), alloc/free/map through the AMDKFD ioctls returning HCQBuffer, create_queue() for compute and SDMA queues (cwsr context, EOP buffer, doorbell mapping), sleep() via AMDKFD_IOC_WAIT_EVENTS, and on_device_hang() that reports MMU and HW faults.
- New AMAllocationMeta dataclass and PCIIface class add a driverless path: libpciaccess enumeration of 0x1002:0x744c devices, optional VFIO no-IOMMU interrupt delivery, BAR 0/2/5 mapping, an AMDev instance driving GFX/SDMA/GMC/IH setup, alloc/free/map through adev.mm (host memory pinned via /proc/self/pagemap), create_queue() that programs the rings directly, sleep() via an eventfd poller when VFIO is active, and device_fini().
- AMDDevice selects PCIIface when driverless (no /sys/module/amdgpu, or AMD_DRIVERLESS=1) and KFDIface otherwise, allocates the shared signals page through dev_iface, creates 0x800000-byte compute and SDMA queues via create_queue(), grows scratch on demand through the new _ensure_has_local_memory() (default 128 bytes per thread, recomputing tmpring_size), delegates on_device_hang() to dev_iface, and registers device_fini() with atexit.
- invalidate_caches() is unchanged: it issues AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self), bumps timeline_value and synchronizes.