tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +11 -6
- tinygrad/codegen/kernel.py +308 -175
- tinygrad/codegen/linearize.py +95 -0
- tinygrad/codegen/lowerer.py +143 -0
- tinygrad/codegen/transcendental.py +257 -0
- tinygrad/codegen/uopgraph.py +506 -0
- tinygrad/device.py +72 -171
- tinygrad/dtype.py +122 -47
- tinygrad/engine/jit.py +184 -87
- tinygrad/{lazy.py → engine/lazy.py} +74 -66
- tinygrad/engine/memory.py +51 -0
- tinygrad/engine/realize.py +86 -61
- tinygrad/engine/schedule.py +366 -317
- tinygrad/engine/search.py +58 -47
- tinygrad/function.py +59 -58
- tinygrad/helpers.py +120 -102
- tinygrad/multi.py +82 -78
- tinygrad/nn/__init__.py +116 -67
- tinygrad/nn/datasets.py +12 -5
- tinygrad/nn/optim.py +1 -1
- tinygrad/nn/state.py +91 -6
- tinygrad/ops.py +1126 -143
- tinygrad/renderer/__init__.py +47 -23
- tinygrad/renderer/cstyle.py +338 -265
- tinygrad/renderer/llvmir.py +125 -143
- tinygrad/renderer/ptx.py +225 -0
- tinygrad/runtime/autogen/adreno.py +17904 -0
- tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/io_uring.py +97 -63
- tinygrad/runtime/autogen/kfd.py +60 -47
- tinygrad/runtime/autogen/kgsl.py +1386 -0
- tinygrad/runtime/autogen/libc.py +5462 -0
- tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/autogen/opencl.py +11 -11
- tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
- tinygrad/runtime/graph/clang.py +3 -3
- tinygrad/runtime/graph/cuda.py +11 -15
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +71 -43
- tinygrad/runtime/ops_amd.py +244 -323
- tinygrad/runtime/ops_clang.py +12 -5
- tinygrad/runtime/ops_cloud.py +220 -0
- tinygrad/runtime/ops_cuda.py +42 -99
- tinygrad/runtime/ops_disk.py +25 -26
- tinygrad/runtime/ops_dsp.py +181 -0
- tinygrad/runtime/ops_gpu.py +29 -16
- tinygrad/runtime/ops_hip.py +68 -0
- tinygrad/runtime/ops_llvm.py +15 -10
- tinygrad/runtime/ops_metal.py +147 -64
- tinygrad/runtime/ops_nv.py +356 -397
- tinygrad/runtime/ops_python.py +78 -79
- tinygrad/runtime/ops_qcom.py +405 -0
- tinygrad/runtime/support/__init__.py +0 -0
- tinygrad/runtime/support/compiler_cuda.py +77 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/runtime/support/hcq.py +539 -0
- tinygrad/shape/shapetracker.py +40 -50
- tinygrad/shape/view.py +102 -63
- tinygrad/tensor.py +1109 -365
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
- tinygrad-0.10.0.dist-info/RECORD +77 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad/codegen/uops.py +0 -451
- tinygrad/engine/graph.py +0 -100
- tinygrad/renderer/assembly.py +0 -269
- tinygrad/shape/symbolic.py +0 -327
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,60 +1,23 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from typing import Tuple, List, Any
|
3
|
-
import os,
|
3
|
+
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
|
4
|
+
assert sys.platform != 'win32'
|
4
5
|
from dataclasses import dataclass
|
5
|
-
from tinygrad.
|
6
|
-
from tinygrad.
|
6
|
+
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
|
7
|
+
from tinygrad.device import BufferOptions
|
8
|
+
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
|
7
9
|
from tinygrad.renderer.cstyle import AMDRenderer
|
8
|
-
from tinygrad.runtime.
|
9
|
-
|
10
|
-
|
11
|
-
import
|
12
|
-
if getenv("
|
13
|
-
|
14
|
-
libc = ctypes.CDLL(ctypes.util.find_library("c"))
|
15
|
-
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
16
|
-
libc.mmap.restype = ctypes.c_void_p
|
17
|
-
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
|
18
|
-
libc.munmap.restype = ctypes.c_int
|
19
|
-
|
20
|
-
if getenv("MOCKGPU"):
|
21
|
-
import extra.mockgpu.mockgpu # noqa: F401
|
22
|
-
libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
|
23
|
-
libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
|
10
|
+
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
|
11
|
+
from tinygrad.runtime.support.compiler_hip import AMDCompiler
|
12
|
+
from tinygrad.runtime.support.elf import elf_loader
|
13
|
+
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
14
|
+
if getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
|
24
15
|
|
25
16
|
def is_usable_gpu(gpu_id):
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return False
|
31
|
-
|
32
|
-
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
|
33
|
-
made = made_struct or user_struct(**kwargs)
|
34
|
-
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
|
35
|
-
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
36
|
-
return made
|
37
|
-
|
38
|
-
def ioctls_from_header():
|
39
|
-
#hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
|
40
|
-
#pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
|
41
|
-
#matches = re.findall(pattern, hdr, re.MULTILINE)
|
42
|
-
# get this from python instead
|
43
|
-
hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
|
44
|
-
pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
|
45
|
-
matches = re.findall(pattern, hdrpy, re.MULTILINE)
|
46
|
-
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
|
47
|
-
fxns = {name.replace("AMDKFD_IOC_", "").lower():
|
48
|
-
functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
|
49
|
-
for name, idir, nr, sname in matches}
|
50
|
-
return type("KIO", (object, ), fxns)
|
51
|
-
kio = ioctls_from_header()
|
52
|
-
|
53
|
-
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
|
54
|
-
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
|
55
|
-
|
56
|
-
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
|
57
|
-
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
|
17
|
+
with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
|
18
|
+
return False
|
19
|
+
|
20
|
+
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
58
21
|
|
59
22
|
# VGT_EVENT_TYPE in navi10_enum.h
|
60
23
|
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
|
@@ -66,35 +29,41 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
|
66
29
|
|
67
30
|
def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
|
68
31
|
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
self.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
def
|
81
|
-
def
|
82
|
-
|
83
|
-
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
32
|
+
|
33
|
+
class AMDSignal(HCQSignal):
|
34
|
+
def __init__(self, value=0, is_timeline=False):
|
35
|
+
self._signal = AMDDevice.signals_pool.pop()
|
36
|
+
self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
|
37
|
+
if is_timeline:
|
38
|
+
self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
|
39
|
+
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
|
40
|
+
self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
|
41
|
+
else: self._event_mailbox_ptr = 0
|
42
|
+
super().__init__(value)
|
43
|
+
def __del__(self): AMDDevice.signals_pool.append(self._signal)
|
44
|
+
def _get_value(self) -> int: return self._signal[0]
|
45
|
+
def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
|
46
|
+
def _set_value(self, new_value:int): self._signal[0] = new_value
|
47
|
+
def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
|
48
|
+
start_time = time.time() * 1000
|
49
|
+
while (time_spent:=time.time() * 1000 - start_time) < timeout:
|
50
|
+
if self._signal[0] >= value: return
|
51
|
+
|
52
|
+
# Wait active for 5s, then going to sleep.
|
53
|
+
if time_spent > 5000 and self._event_mailbox_ptr != 0:
|
54
|
+
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
|
55
|
+
raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
|
56
|
+
|
57
|
+
class AMDComputeQueue(HWComputeQueue):
|
88
58
|
def __init__(self):
|
89
|
-
self.
|
59
|
+
self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
|
90
60
|
super().__init__()
|
91
61
|
|
92
62
|
def __del__(self):
|
93
63
|
if self.binded_device is not None:
|
94
|
-
self.binded_device.
|
95
|
-
self.binded_device._gpu_free(self.hw_page)
|
64
|
+
self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True))
|
96
65
|
|
97
|
-
def
|
66
|
+
def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
|
98
67
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
|
99
68
|
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
|
100
69
|
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
|
@@ -102,106 +71,101 @@ class HWPM4Queue(HWQueue):
|
|
102
71
|
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
103
72
|
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
|
104
73
|
|
105
|
-
def
|
74
|
+
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
|
75
|
+
cache_flush_flags = 0
|
76
|
+
|
77
|
+
if cache_flush:
|
78
|
+
cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
|
79
|
+
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
|
80
|
+
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
|
81
|
+
|
82
|
+
# event_index__mec_release_mem__end_of_pipe = 5
|
83
|
+
# event_index__mec_release_mem__shader_done = 6
|
84
|
+
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
85
|
+
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
|
86
|
+
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
87
|
+
*data64_le(address), *data64_le(value), cst]
|
88
|
+
|
89
|
+
def _memory_barrier(self):
|
106
90
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
|
107
91
|
amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
|
108
92
|
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
|
109
|
-
self.
|
110
|
-
return self._mark_command_end()
|
93
|
+
self._acquire_mem()
|
111
94
|
|
112
|
-
def
|
113
|
-
self.
|
95
|
+
def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
|
96
|
+
self._acquire_mem(gli=0, gl2=0)
|
114
97
|
|
115
|
-
|
116
|
-
|
117
|
-
|
98
|
+
cmd_idx = self._cur_cmd_idx()
|
99
|
+
user_regs = [*data64_le(prg.device.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
|
100
|
+
if prg.enable_dispatch_ptr:
|
101
|
+
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
|
118
102
|
dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
|
119
103
|
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
|
120
|
-
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size,
|
121
|
-
|
122
|
-
self.
|
104
|
+
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
|
105
|
+
user_regs += [*data64_le(dp_addr)]
|
106
|
+
self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
|
107
|
+
user_regs += [*data64_le(args_state.ptr)]
|
123
108
|
|
124
|
-
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG,
|
125
|
-
*data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
|
109
|
+
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
|
126
110
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
|
111
|
+
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
|
127
112
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
|
113
|
+
if prg.device.has_scratch_base_registers:
|
114
|
+
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
|
115
|
+
gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.device.scratch.va_addr >> 8)]
|
116
|
+
if prg.device.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
|
128
117
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
|
129
118
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
|
130
119
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
|
131
120
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
|
132
|
-
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(
|
121
|
+
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
|
122
|
+
|
123
|
+
self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
|
133
124
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
|
134
125
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
|
126
|
+
|
127
|
+
self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
|
135
128
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
136
129
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
|
137
130
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
def update_exec(self, cmd_idx, global_size, local_size):
|
142
|
-
# Patch the exec cmd with new launch dims
|
143
|
-
assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
|
144
|
-
self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
|
145
|
-
self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
|
131
|
+
def _update_exec(self, cmd_idx, global_size, local_size):
|
132
|
+
if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
|
133
|
+
if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)
|
146
134
|
|
147
|
-
if (dp:=self.
|
148
|
-
dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
|
149
|
-
|
135
|
+
if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
|
136
|
+
if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
|
137
|
+
if global_size is not None:
|
138
|
+
dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
|
150
139
|
|
151
|
-
def
|
152
|
-
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
|
140
|
+
def _wait(self, signal, value=0):
|
153
141
|
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
|
154
142
|
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
155
|
-
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(
|
156
|
-
return self._mark_command_end()
|
143
|
+
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
|
157
144
|
|
158
|
-
def
|
159
|
-
cache_flush_flags = 0
|
145
|
+
def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
|
160
146
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
147
|
+
def _signal(self, signal, value=0):
|
148
|
+
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
|
149
|
+
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
|
150
|
+
if signal._event_mailbox_ptr != 0:
|
151
|
+
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
|
152
|
+
value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)
|
165
153
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
|
170
|
-
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
171
|
-
*data64_le(address), *data64_le(value), cst]
|
154
|
+
def _update_wait(self, cmd_idx, signal=None, value=None):
|
155
|
+
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
|
156
|
+
if value is not None: self._patch(cmd_idx, offset=4, data=[value])
|
172
157
|
|
173
|
-
def
|
174
|
-
self.
|
175
|
-
|
176
|
-
return self._mark_command_end()
|
158
|
+
def _update_signal(self, cmd_idx, signal=None, value=None):
|
159
|
+
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
|
160
|
+
if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
|
177
161
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
|
184
|
-
value=signal.event_id, cst=signal.event_id, cache_flush=True)
|
185
|
-
return self._mark_command_end()
|
186
|
-
|
187
|
-
def update_wait(self, cmd_idx, signal=None, value=None):
|
188
|
-
assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
|
189
|
-
if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
|
190
|
-
if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
|
191
|
-
return self
|
192
|
-
|
193
|
-
def update_signal(self, cmd_idx, signal=None, value=None):
|
194
|
-
assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
|
195
|
-
if signal is not None:
|
196
|
-
self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
|
197
|
-
if signal.event_mailbox_ptr != 0:
|
198
|
-
self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
|
199
|
-
if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
|
200
|
-
return self
|
201
|
-
|
202
|
-
def bind(self, device: AMDDevice):
|
162
|
+
# Check if the signal command has mailptr part
|
163
|
+
if signal is not None and self.cmds_len[cmd_idx] > 8:
|
164
|
+
self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
|
165
|
+
|
166
|
+
def bind(self, device):
|
203
167
|
self.binded_device = device
|
204
|
-
self.hw_page = device.
|
168
|
+
self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
|
205
169
|
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
|
206
170
|
for i, value in enumerate(self.q): hw_view[i] = value
|
207
171
|
|
@@ -209,7 +173,7 @@ class HWPM4Queue(HWQueue):
|
|
209
173
|
len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
|
210
174
|
self.q = hw_view # type: ignore
|
211
175
|
|
212
|
-
def
|
176
|
+
def _submit(self, device):
|
213
177
|
cmds = self.indirect_cmd if device == self.binded_device else self.q
|
214
178
|
|
215
179
|
for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
|
@@ -217,25 +181,20 @@ class HWPM4Queue(HWQueue):
|
|
217
181
|
device.compute_queue.put_value += len(cmds)
|
218
182
|
device.compute_queue.write_ptr[0] = device.compute_queue.put_value
|
219
183
|
device.compute_queue.doorbell[0] = device.compute_queue.put_value
|
220
|
-
return self
|
221
184
|
|
222
185
|
SDMA_MAX_COPY_SIZE = 0x400000
|
223
|
-
class HWCopyQueue
|
186
|
+
class AMDCopyQueue(HWCopyQueue):
|
224
187
|
def __init__(self):
|
225
|
-
self.internal_cmd_sizes = []
|
188
|
+
self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
|
226
189
|
super().__init__()
|
227
190
|
|
228
191
|
def _q(self, arr):
|
229
192
|
self.q += arr
|
230
193
|
self.internal_cmd_sizes.append(len(arr))
|
231
194
|
|
232
|
-
def
|
233
|
-
|
234
|
-
self.
|
235
|
-
amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
|
236
|
-
|
237
|
-
copied = 0
|
238
|
-
copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
|
195
|
+
def _copy(self, dest, src, copy_size):
|
196
|
+
copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
|
197
|
+
self.copy_cmds_per_copy[len(self) - 1] = copy_commands
|
239
198
|
for _ in range(copy_commands):
|
240
199
|
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
|
241
200
|
|
@@ -244,39 +203,33 @@ class HWCopyQueue(HWQueue):
|
|
244
203
|
|
245
204
|
copied += step_copy_size
|
246
205
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
def signal(self, signal: hsa.amd_signal_t, value=0):
|
253
|
-
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
|
206
|
+
def _update_copy(self, cmd_idx, dest=None, src=None):
|
207
|
+
for i in range(self.copy_cmds_per_copy[cmd_idx]):
|
208
|
+
if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
|
209
|
+
if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
|
254
210
|
|
255
|
-
|
256
|
-
|
257
|
-
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
|
211
|
+
def _signal(self, signal, value=0):
|
212
|
+
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
|
258
213
|
|
259
|
-
|
214
|
+
if signal._event_mailbox_ptr != 0:
|
215
|
+
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
|
216
|
+
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
|
260
217
|
|
261
|
-
def
|
218
|
+
def _wait(self, signal, value=0):
|
262
219
|
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
263
|
-
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(
|
220
|
+
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
|
264
221
|
amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
|
265
222
|
|
266
|
-
|
223
|
+
def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
|
224
|
+
def _update_wait(self, cmd_idx, signal=None, value=None):
|
225
|
+
if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
|
226
|
+
if value is not None: self._patch(cmd_idx, offset=3, data=[value])
|
267
227
|
|
268
|
-
def
|
269
|
-
assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
|
270
|
-
if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
|
271
|
-
if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
|
272
|
-
return self
|
273
|
-
|
274
|
-
def timestamp(self, sig: hsa.amd_signal_t):
|
228
|
+
def _timestamp(self, signal):
|
275
229
|
self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
|
276
|
-
*data64_le(
|
277
|
-
return self._mark_command_end()
|
230
|
+
*data64_le(signal._timestamp_addr)])
|
278
231
|
|
279
|
-
def
|
232
|
+
def _submit(self, device):
|
280
233
|
if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
|
281
234
|
|
282
235
|
tail_blit_dword = 0
|
@@ -298,103 +251,69 @@ class HWCopyQueue(HWQueue):
|
|
298
251
|
|
299
252
|
device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
|
300
253
|
device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
|
301
|
-
return self
|
302
254
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
# TODO; this API needs the type signature of the function and global_size/local_size
|
307
|
-
self.device, self.name, self.lib = device, name, lib
|
255
|
+
class AMDArgsState(HCQArgsState):
|
256
|
+
def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
257
|
+
super().__init__(ptr, prg, bufs, vals=vals)
|
308
258
|
|
309
|
-
|
310
|
-
|
311
|
-
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
|
259
|
+
self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
|
260
|
+
self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
|
312
261
|
|
313
|
-
|
314
|
-
|
262
|
+
self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
|
263
|
+
self.vals[:] = array.array('I', vals)
|
315
264
|
|
316
|
-
|
317
|
-
|
318
|
-
lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
|
265
|
+
def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
|
266
|
+
def update_var(self, index:int, val:int): self.vals[index] = val
|
319
267
|
|
320
|
-
|
321
|
-
|
268
|
+
class AMDProgram(HCQProgram):
|
269
|
+
def __init__(self, device:AMDDevice, name:str, lib:bytes):
|
270
|
+
# TODO; this API needs the type signature of the function and global_size/local_size
|
271
|
+
self.device, self.name, self.lib = device, name, lib
|
272
|
+
image, sections, _ = elf_loader(self.lib)
|
273
|
+
self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000), BufferOptions(cpu_access=True, nolru=True))
|
274
|
+
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
|
322
275
|
|
323
|
-
entry_point = min(sh
|
324
|
-
self.group_segment_size =
|
325
|
-
self.private_segment_size =
|
326
|
-
self.kernargs_segment_size =
|
327
|
-
self.kernargs_alloc_size = self.kernargs_segment_size
|
328
|
-
self.kernargs_offset = 0
|
276
|
+
entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
|
277
|
+
self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
|
278
|
+
self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
|
279
|
+
self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
|
329
280
|
|
330
281
|
lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
|
331
282
|
if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
|
332
283
|
if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
|
333
284
|
|
334
285
|
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
|
335
|
-
self.rsrc1 = code.compute_pgm_rsrc1
|
336
|
-
self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
|
337
|
-
|
338
|
-
if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
|
339
|
-
# Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
|
340
|
-
self.dispatch_packet_offset = self.kernargs_alloc_size
|
341
|
-
self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
|
342
|
-
|
343
286
|
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
|
344
|
-
assert code.workitem_private_segment_byte_size == 0
|
345
|
-
assert code.max_scratch_backing_memory_byte_size == 0
|
346
|
-
assert code.kernel_code_prefetch_byte_size == 0
|
347
287
|
|
288
|
+
# Set rsrc1.priv=1 on gfx11 to workaround cwsr.
|
289
|
+
self.rsrc1 = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.device.target < 120000 else 0)
|
290
|
+
self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
|
348
291
|
self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
|
349
292
|
|
350
|
-
|
293
|
+
# Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
|
294
|
+
# The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
|
295
|
+
self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
|
296
|
+
self.enable_private_segment_sgpr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
|
297
|
+
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
|
298
|
+
|
299
|
+
super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
351
300
|
|
352
|
-
# NOTE: no programs are ever freed
|
353
301
|
def __del__(self):
|
354
|
-
if hasattr(self, 'lib_gpu'): self.device.
|
355
|
-
|
356
|
-
|
357
|
-
if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
|
358
|
-
self.device.kernargs_ptr = self.device.kernargs.va_addr
|
359
|
-
|
360
|
-
if not hasattr(self, "args_struct_t"):
|
361
|
-
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
|
362
|
-
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
|
363
|
-
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
|
364
|
-
raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
|
365
|
-
|
366
|
-
args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
|
367
|
-
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
|
368
|
-
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
369
|
-
|
370
|
-
sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
|
371
|
-
|
372
|
-
q = HWPM4Queue()
|
373
|
-
q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
|
374
|
-
if wait or PROFILE: q.timestamp(sig_st)
|
375
|
-
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
|
376
|
-
if wait or PROFILE: q.timestamp(sig_en)
|
377
|
-
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
|
378
|
-
self.device.timeline_value += 1
|
379
|
-
self.device.kernargs_ptr += self.kernargs_alloc_size
|
380
|
-
|
381
|
-
if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
|
382
|
-
if wait:
|
383
|
-
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
|
384
|
-
return (sig_en.start_ts - sig_st.start_ts) / 1e8
|
385
|
-
|
386
|
-
class AMDAllocator(HCQCompatAllocator):
|
302
|
+
if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True, nolru=True))
|
303
|
+
|
304
|
+
class AMDAllocator(HCQAllocator):
|
387
305
|
def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
|
388
306
|
|
389
|
-
def _alloc(self, size:int, options:BufferOptions):
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
except OSError as e:
|
394
|
-
if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
|
395
|
-
raise
|
307
|
+
def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
|
308
|
+
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
|
309
|
+
if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
310
|
+
return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
|
396
311
|
|
397
|
-
def _free(self, opaque, options:BufferOptions):
|
312
|
+
def _free(self, opaque, options:BufferOptions):
|
313
|
+
self.device.synchronize()
|
314
|
+
self.device._gpu_free(opaque)
|
315
|
+
|
316
|
+
def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
|
398
317
|
|
399
318
|
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
400
319
|
|
@@ -406,18 +325,19 @@ class AMDQueueDesc:
|
|
406
325
|
doorbell: memoryview
|
407
326
|
put_value: int = 0
|
408
327
|
|
409
|
-
class AMDDevice(
|
328
|
+
class AMDDevice(HCQCompiled):
|
410
329
|
kfd:int = -1
|
411
330
|
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
412
331
|
signals_page:Any = None
|
413
|
-
signals_pool:List[
|
332
|
+
signals_pool:List[memoryview] = []
|
414
333
|
gpus:List[pathlib.Path] = []
|
415
334
|
|
416
335
|
def _gpu_map(self, mem):
|
417
336
|
if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
|
418
337
|
mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
|
419
338
|
c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
|
420
|
-
stm =
|
339
|
+
stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
|
340
|
+
n_devices=len(mem.mapped_gpu_ids))
|
421
341
|
assert stm.n_success == len(mem.mapped_gpu_ids)
|
422
342
|
|
423
343
|
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
|
@@ -429,7 +349,15 @@ class AMDDevice(HCQCompatCompiled):
|
|
429
349
|
else:
|
430
350
|
buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
|
431
351
|
assert addr != 0xffffffffffffffff
|
432
|
-
|
352
|
+
|
353
|
+
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
|
354
|
+
flags=flags, mmap_offset=buf)
|
355
|
+
except OSError as e:
|
356
|
+
if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
|
357
|
+
raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
|
358
|
+
if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
|
359
|
+
raise
|
360
|
+
|
433
361
|
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
|
434
362
|
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
|
435
363
|
assert addr == buf == mem.va_addr
|
@@ -439,68 +367,39 @@ class AMDDevice(HCQCompatCompiled):
|
|
439
367
|
def _gpu_free(self, mem):
|
440
368
|
if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
|
441
369
|
c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
|
442
|
-
stm =
|
370
|
+
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
|
443
371
|
assert stm.n_success == len(gpus)
|
444
372
|
libc.munmap(mem.va_addr, mem.size)
|
445
|
-
|
446
|
-
|
447
|
-
@classmethod
|
448
|
-
def _read_signal(self, sig): return sig.value
|
449
|
-
|
450
|
-
@classmethod
|
451
|
-
def _read_timestamp(self, sig): return sig.start_ts
|
452
|
-
|
453
|
-
@classmethod
|
454
|
-
def _set_signal(self, sig, value): sig.value = value
|
455
|
-
|
456
|
-
@classmethod
|
457
|
-
def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
|
458
|
-
self._set_signal(ret := self.signals_pool.pop(), value)
|
459
|
-
if (sync_event:=kwargs.get('sync_event')) is not None:
|
460
|
-
ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
461
|
-
ret.event_id = sync_event.event_id
|
462
|
-
else: ret.event_mailbox_ptr = ret.event_id = 0
|
463
|
-
return ret
|
464
|
-
|
465
|
-
@classmethod
|
466
|
-
def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
|
467
|
-
assert signal.event_id != 0, "can't wait on this signal"
|
468
|
-
evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
|
469
|
-
|
470
|
-
# Wait active for 5s, then going to sleep.
|
471
|
-
start_time = time.time() * 1000
|
472
|
-
while (time_spent:=time.time() * 1000 - start_time) < timeout:
|
473
|
-
if signal.value >= value: return
|
474
|
-
if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
|
475
|
-
raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
|
373
|
+
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
|
476
374
|
|
477
375
|
def __init__(self, device:str=""):
|
478
376
|
if AMDDevice.kfd == -1:
|
479
377
|
AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
480
|
-
|
378
|
+
gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
379
|
+
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
|
380
|
+
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
381
|
+
AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
|
382
|
+
|
481
383
|
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
384
|
+
if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
|
385
|
+
|
482
386
|
with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
483
387
|
with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
484
388
|
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
485
|
-
target = int(self.properties['gfx_target_version'])
|
486
|
-
self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
|
487
|
-
|
389
|
+
self.target = int(self.properties['gfx_target_version'])
|
390
|
+
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
|
391
|
+
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
|
392
|
+
|
393
|
+
kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
488
394
|
|
489
395
|
if AMDDevice.event_page is None:
|
490
|
-
AMDDevice.signals_page = self._gpu_alloc(
|
396
|
+
AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
491
397
|
AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
492
|
-
for off in range(0, AMDDevice.signals_page.size,
|
493
|
-
|
494
|
-
sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
|
398
|
+
AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
|
399
|
+
kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
|
495
400
|
else:
|
496
401
|
self._gpu_map(AMDDevice.signals_page)
|
497
402
|
self._gpu_map(AMDDevice.event_page)
|
498
|
-
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
|
499
|
-
|
500
|
-
self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()
|
501
|
-
|
502
|
-
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
503
|
-
self.kernargs_ptr = self.kernargs.va_addr
|
504
403
|
|
505
404
|
# Scratch setup
|
506
405
|
max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
|
@@ -509,28 +408,37 @@ class AMDDevice(HCQCompatCompiled):
|
|
509
408
|
wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
|
510
409
|
self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
|
511
410
|
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
411
|
+
self.has_scratch_base_registers = self.target >= 110000
|
512
412
|
engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
|
513
413
|
self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
|
514
414
|
|
515
|
-
|
415
|
+
# https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
|
416
|
+
sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
|
417
|
+
vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
|
418
|
+
wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (max_cu_id + 1), mmap.PAGESIZE)
|
419
|
+
ctl_stack_size = round_up(12 * (max_cu_id + 1) * (max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
|
420
|
+
self.debug_memory_size = round_up((max_cu_id + 1) * (max_wave_id + 1) * 32, 64)
|
421
|
+
|
422
|
+
self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
|
423
|
+
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
|
516
424
|
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
|
517
425
|
|
518
|
-
|
519
|
-
|
426
|
+
self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
|
427
|
+
self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
|
520
428
|
|
521
|
-
|
522
|
-
|
523
|
-
return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
|
429
|
+
super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
|
430
|
+
AMDSignal, AMDComputeQueue, AMDCopyQueue)
|
524
431
|
|
525
|
-
def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
|
432
|
+
def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None, ctl_stack_size=0) -> AMDQueueDesc:
|
526
433
|
gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
527
434
|
ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
528
|
-
cwsr_ctx = self._gpu_alloc(ctx_save_restore_size,
|
435
|
+
cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE),
|
436
|
+
kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
|
529
437
|
eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
|
530
|
-
queue =
|
438
|
+
queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
|
531
439
|
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
532
|
-
eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
|
533
|
-
ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=
|
440
|
+
eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
|
441
|
+
ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size if cwsr_ctx else 0,
|
534
442
|
write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
|
535
443
|
|
536
444
|
if not hasattr(self, 'doorbells'):
|
@@ -541,10 +449,23 @@ class AMDDevice(HCQCompatCompiled):
|
|
541
449
|
read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
|
542
450
|
doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
|
543
451
|
|
544
|
-
def
|
545
|
-
|
452
|
+
def invalidate_caches(self):
|
453
|
+
AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
|
454
|
+
self.timeline_value += 1
|
455
|
+
self.synchronize()
|
456
|
+
|
457
|
+
def on_device_hang(self):
|
458
|
+
report = []
|
459
|
+
|
460
|
+
ev = (kfd.struct_kfd_event_data)(event_id=self.mem_fault_event.event_id)
|
461
|
+
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
|
462
|
+
if ev.memory_exception_data.gpu_id:
|
463
|
+
pfstatus = ' '.join(f'{k[0]}={getattr(ev.memory_exception_data.failure, k[0])}' for k in ev.memory_exception_data.failure._fields_)
|
464
|
+
report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {pfstatus}"]
|
465
|
+
|
466
|
+
ev = (kfd.struct_kfd_event_data)(event_id=self.hw_fault_event.event_id)
|
467
|
+
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
|
468
|
+
if ev.hw_exception_data.gpu_id:
|
469
|
+
report += [f"HW fault: {' '.join(f'{k[0]}={getattr(ev.hw_exception_data, k[0])}' for k in ev.hw_exception_data._fields_)}"]
|
546
470
|
|
547
|
-
|
548
|
-
self.kernargs_ptr = self.kernargs.va_addr
|
549
|
-
if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
|
550
|
-
if PROFILE: self._prof_process_events()
|
471
|
+
raise RuntimeError("\n".join(report))
|