tinygrad-0.9.1-py3-none-any.whl → tinygrad-0.9.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- tinygrad/codegen/kernel.py +248 -115
- tinygrad/codegen/lowerer.py +215 -0
- tinygrad/codegen/transcendental.py +310 -0
- tinygrad/codegen/uopgraph.py +622 -0
- tinygrad/codegen/uops.py +235 -393
- tinygrad/device.py +428 -69
- tinygrad/dtype.py +18 -4
- tinygrad/engine/graph.py +19 -32
- tinygrad/engine/jit.py +148 -70
- tinygrad/engine/realize.py +127 -51
- tinygrad/engine/schedule.py +259 -216
- tinygrad/engine/search.py +29 -22
- tinygrad/function.py +9 -0
- tinygrad/helpers.py +87 -49
- tinygrad/lazy.py +34 -35
- tinygrad/multi.py +41 -36
- tinygrad/nn/__init__.py +39 -22
- tinygrad/nn/state.py +3 -3
- tinygrad/ops.py +63 -62
- tinygrad/renderer/__init__.py +43 -21
- tinygrad/renderer/assembly.py +104 -106
- tinygrad/renderer/cstyle.py +87 -60
- tinygrad/renderer/llvmir.py +21 -30
- tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/graph/clang.py +2 -2
- tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +18 -15
- tinygrad/runtime/ops_amd.py +197 -305
- tinygrad/runtime/ops_clang.py +2 -2
- tinygrad/runtime/ops_cuda.py +36 -94
- tinygrad/runtime/ops_disk.py +3 -7
- tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad/runtime/ops_hip.py +70 -0
- tinygrad/runtime/ops_metal.py +38 -27
- tinygrad/runtime/ops_nv.py +283 -363
- tinygrad/runtime/ops_python.py +26 -30
- tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/shape/shapetracker.py +5 -14
- tinygrad/shape/symbolic.py +4 -8
- tinygrad/shape/view.py +34 -22
- tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
- tinygrad-0.9.2.dist-info/RECORD +70 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/runtime/{driver → support}/__init__.py +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,60 +1,30 @@
 from __future__ import annotations
-from typing import Tuple, List, Any
-import os, fcntl, ctypes, ctypes.util, functools,
+from typing import Tuple, List, Any, cast
+import os, fcntl, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
 from dataclasses import dataclass
-from tinygrad.device import
-
+from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
+                            HCQSignal, HCQProgram, BufferOptions
+from tinygrad.helpers import getenv, to_mv, round_up, data64_le, DEBUG, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
-from tinygrad.runtime.
-
-
-import
-if getenv("
-
-libc = ctypes.CDLL(ctypes.util.find_library("c"))
-libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
-libc.mmap.restype = ctypes.c_void_p
-libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
-libc.munmap.restype = ctypes.c_int
-
-if getenv("MOCKGPU"):
-  import extra.mockgpu.mockgpu  # noqa: F401
-  libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
-  libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc
+from tinygrad.runtime.support.compiler_hip import AMDCompiler, disasm
+from tinygrad.runtime.support.elf import elf_loader
+if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import
+if getenv("MOCKGPU"): import extra.mockgpu.mockgpu  # noqa: F401 # pylint: disable=unused-import
 
 def is_usable_gpu(gpu_id):
-
-
-
-
-
-
-def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
-  made = made_struct or user_struct(**kwargs)
-  ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
+  with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
+  return False
+
+def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
+  ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made := user_struct(**kwargs))<<16) | (ord('K')<<8) | nr, made)
   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
   return made
 
-
-
-
-
-  # get this from python instead
-  hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
-  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
-  matches = re.findall(pattern, hdrpy, re.MULTILINE)
-  idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
-  fxns = {name.replace("AMDKFD_IOC_", "").lower():
-          functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
-          for name, idir, nr, sname in matches}
-  return type("KIO", (object, ), fxns)
-kio = ioctls_from_header()
-
-SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
-SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
-
-regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
-regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+kio:Any = type("KIO", (object,), {name[11:].lower(): functools.partial(kfd_ioctl, {"IOW": 1, "IOR": 2, "IOWR": 3}[p[0]], p[1], p[2])
+                                  for name,p in kfd.__dict__.items() if name.startswith("AMDKFD_IOC_")})
+
+regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
 
 # VGT_EVENT_TYPE in navi10_enum.h
 CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
@@ -66,27 +36,35 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
 
 def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
 def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
-
-
-
-
-self.
-
-
-
-
-
-
-
-def
-
-
-def
-def
-
-
+
+class AMDSignal(HCQSignal):
+  def __init__(self, value=0, alloc_event=False):
+    self._signal = AMDDevice.signals_pool.pop()
+    self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
+    if alloc_event:
+      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+      self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
+      self._event_id = sync_event.event_id
+      self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event_id)
+    else: self._event_mailbox_ptr = self._event_id = 0
+    super().__init__(value)
+  def __del__(self): AMDDevice.signals_pool.append(self._signal)
+  def _get_value(self) -> int: return self._signal[0]
+  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
+  def _set_value(self, new_value:int): self._signal[0] = new_value
+  def wait(self, value:int, timeout:int=10000):
+    start_time = time.time() * 1000
+    while (time_spent:=time.time() * 1000 - start_time) < timeout:
+      if self._signal[0] >= value: return
+
+      # Wait active for 5s, then going to sleep.
+      if time_spent > 5000 and self._event_id != 0:
+        kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
+    raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
+
+class AMDComputeQueue(HWComputeQueue):
   def __init__(self):
-    self.
+    self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
     super().__init__()
 
   def __del__(self):
@@ -94,7 +72,7 @@ class HWPM4Queue(HWQueue):
       self.binded_device.synchronize()
       self.binded_device._gpu_free(self.hw_page)
 
-  def
+  def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
@@ -102,24 +80,39 @@ class HWPM4Queue(HWQueue):
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
 
-  def
+  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+    cache_flush_flags = 0
+
+    if cache_flush:
+      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+    # event_index__mec_release_mem__end_of_pipe = 5
+    # event_index__mec_release_mem__shader_done = 6
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+               *data64_le(address), *data64_le(value), cst]
+
+  def _memory_barrier(self):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
                amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
                nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
-    self.
-    return self._mark_command_end()
+    self._acquire_mem()
 
-  def
-    self.
+  def _exec(self, prg, args_state, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
+    self._acquire_mem(gli=0, gl2=0)
 
-
-    if
-      dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=
+    user_regs, cmd_idx = [], len(self) - 1
+    if prg.enable_dispatch_ptr:
+      dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
       dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
-      dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size,
-
-    self.
+      dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
+      user_regs += [*data64_le(dp_addr)]
+      self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
+    user_regs += [*data64_le(args_state.ptr)]
 
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
                *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
@@ -129,79 +122,54 @@ class HWPM4Queue(HWQueue):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
+
+    self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
+
+    self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
 
-
-
+  def _update_exec(self, cmd_idx, global_size, local_size):
+    if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
+    if global_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_global_offset[cmd_idx], data=global_size)
 
-
-
-
-
-      self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+    if (dp:=self.cmd_idx_to_dispatch_packet.get(cmd_idx)) is not None:
+      if local_size is not None: dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+      if global_size is not None:
+        dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
 
-
-      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
-      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
-
-  def wait(self, signal:hsa.amd_signal_t, value=0):
-    addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+  def _wait(self, signal, value=0):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
                amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(
-    return self._mark_command_end()
+               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
 
-  def
-    cache_flush_flags = 0
+  def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
 
-
-
-
-
+  def _signal(self, signal, value=0):
+    # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
+    if signal._event_mailbox_ptr != 0:
+      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
+                        value=signal._event_id, cst=signal._event_id, cache_flush=False)
 
-
-
-
-               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
-               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
-               *data64_le(address), *data64_le(value), cst]
+  def _update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
+    if value is not None: self._patch(cmd_idx, offset=4, data=[value])
 
-  def
-    self.
-
-    return self._mark_command_end()
+  def _update_signal(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
+    if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
 
-
-
-
-
-
-      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
-                        value=signal.event_id, cst=signal.event_id, cache_flush=True)
-    return self._mark_command_end()
-
-  def update_wait(self, cmd_idx, signal=None, value=None):
-    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
-    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
-    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
-    return self
-
-  def update_signal(self, cmd_idx, signal=None, value=None):
-    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
-    if signal is not None:
-      self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
-      if signal.event_mailbox_ptr != 0:
-        self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
-    if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
-    return self
-
-  def bind(self, device: AMDDevice):
+    # Check if the signal command has mailptr part
+    if signal is not None and self.cmds_len[cmd_idx] > 8:
+      self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event_id), signal._event_id])
+
+  def bind(self, device):
     self.binded_device = device
-    self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.hw_page = cast(AMDDevice, device)._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
     for i, value in enumerate(self.q): hw_view[i] = value
 
@@ -209,7 +177,7 @@ class HWPM4Queue(HWQueue):
                          len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
     self.q = hw_view # type: ignore
 
-  def
+  def _submit(self, device):
     cmds = self.indirect_cmd if device == self.binded_device else self.q
 
     for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
@@ -217,25 +185,20 @@ class HWPM4Queue(HWQueue):
     device.compute_queue.put_value += len(cmds)
     device.compute_queue.write_ptr[0] = device.compute_queue.put_value
     device.compute_queue.doorbell[0] = device.compute_queue.put_value
-    return self
 
 SDMA_MAX_COPY_SIZE = 0x400000
-class HWCopyQueue
+class AMDCopyQueue(HWCopyQueue):
   def __init__(self):
-    self.internal_cmd_sizes = []
+    self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
     super().__init__()
 
   def _q(self, arr):
     self.q += arr
     self.internal_cmd_sizes.append(len(arr))
 
-  def
-
-    self.
-      amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
-
-    copied = 0
-    copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+  def _copy(self, dest, src, copy_size):
+    copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+    self.copy_cmds_per_copy[len(self) - 1] = copy_commands
     for _ in range(copy_commands):
       step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
 
@@ -244,39 +207,33 @@ class HWCopyQueue(HWQueue):
 
       copied += step_copy_size
 
-
-
-
-
+  def _update_copy(self, cmd_idx, dest=None, src=None):
+    for i in range(self.copy_cmds_per_copy[cmd_idx]):
+      if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
+      if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
 
-  def
-    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(
+  def _signal(self, signal, value=0):
+    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
 
-    if signal.
-      self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.
-      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.
+    if signal._event_mailbox_ptr != 0:
+      self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event_id])
+      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event_id)])
 
-
-
-  def wait(self, signal: hsa.amd_signal_t, value=0):
+  def _wait(self, signal, value=0):
     self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
-      amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(
+      amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
       amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
 
-
-
-
-
-    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
-    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
-    return self
+  def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
+  def _update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
+    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
 
-  def
+  def _timestamp(self, signal):
     self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
-      *data64_le(
-    return self._mark_command_end()
+      *data64_le(signal._timestamp_addr)])
 
-  def
+  def _submit(self, device):
     if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
 
     tail_blit_dword = 0
@@ -298,104 +255,68 @@ class HWCopyQueue(HWQueue):
 
     device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
     device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
-    return self
 
-
-
+class AMDArgsState(HCQArgsState):
+  def __init__(self, ptr:int, prg:AMDProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+    super().__init__(ptr, prg, bufs, vals=vals)
+
+    self.bufs = to_mv(self.ptr, len(bufs) * 8).cast('Q')
+    self.vals = to_mv(self.ptr + len(bufs) * 8, len(vals) * 4).cast('I')
+
+    self.bufs[:] = array.array('Q', [b.va_addr for b in bufs])
+    self.vals[:] = array.array('I', vals)
+
+  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+  def update_var(self, index:int, val:int): self.vals[index] = val
+
+class AMDProgram(HCQProgram):
   def __init__(self, device:AMDDevice, name:str, lib:bytes):
     # TODO; this API needs the type signature of the function and global_size/local_size
     self.device, self.name, self.lib = device, name, lib
 
-    if DEBUG >= 6:
-      asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
-      print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
-
-    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
-    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
-
-    lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
-    self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
-    lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+    if DEBUG >= 6: print(disasm(lib))
 
-
-
+    image, sections, _ = elf_loader(self.lib)
+    self.lib_gpu = self.device._gpu_alloc(round_up(image.nbytes, 0x1000), kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
+    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
 
-    entry_point = min(sh
-    self.group_segment_size =
-    self.private_segment_size =
-    self.kernargs_segment_size =
-    self.kernargs_alloc_size = self.kernargs_segment_size
-    self.kernargs_offset = 0
+    entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
+    self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
+    self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
+    self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
 
     lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
     if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
     if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
 
     code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
-    self.rsrc1 = code.compute_pgm_rsrc1
-    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-
-    if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
-      # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
-      self.dispatch_packet_offset = self.kernargs_alloc_size
-      self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
-
     assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
-    assert code.workitem_private_segment_byte_size == 0
-    assert code.max_scratch_backing_memory_byte_size == 0
-    assert code.kernel_code_prefetch_byte_size == 0
 
+    self.rsrc1 = code.compute_pgm_rsrc1
+    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
     self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
 
-
+    # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
+    # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
+    self.enable_dispatch_ptr = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
+    additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
+
+    super().__init__(AMDArgsState, self.device, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
 
-  # NOTE: no programs are ever freed
   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
-
-
-    if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
-      self.device.kernargs_ptr = self.device.kernargs.va_addr
-
-    if not hasattr(self, "args_struct_t"):
-      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
-                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
-      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
-        raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
-
-    args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
-    for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
-    for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
-
-    sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
-    q = HWPM4Queue()
-    q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
-    if wait or PROFILE: q.timestamp(sig_st)
-    q.exec(self, self.device.kernargs_ptr, global_size, local_size)
-    if wait or PROFILE: q.timestamp(sig_en)
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-    self.device.kernargs_ptr += self.kernargs_alloc_size
-
-    if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
-    if wait:
-      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-      return (sig_en.start_ts - sig_st.start_ts) / 1e8
-
-class AMDAllocator(HCQCompatAllocator):
+    if hasattr(self, 'lib_gpu'): cast(AMDDevice, self.device)._gpu_free(self.lib_gpu)
+
+class AMDAllocator(HCQAllocator):
   def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
 
-  def _alloc(self, size:int, options:BufferOptions):
-
-
-      return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
-    except OSError as e:
-      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
-      raise
+  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+    if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+    return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
 
   def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)
 
+  def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 
 @dataclass
@@ -406,11 +327,11 @@ class AMDQueueDesc:
   doorbell: memoryview
   put_value: int = 0
 
-class AMDDevice(
+class AMDDevice(HCQCompiled):
   kfd:int = -1
   event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
   signals_page:Any = None
-  signals_pool:List[
+  signals_pool:List[memoryview] = []
   gpus:List[pathlib.Path] = []
 
   def _gpu_map(self, mem):
@@ -429,7 +350,14 @@ class AMDDevice(HCQCompatCompiled):
     else:
      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
     assert addr != 0xffffffffffffffff
-
+
+    try: mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+    except OSError as e:
+      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
+        raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
+      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
+      raise
+
     if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
       assert addr == buf == mem.va_addr
@@ -444,63 +372,34 @@ class AMDDevice(HCQCompatCompiled):
      libc.munmap(mem.va_addr, mem.size)
     kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
 
-  @classmethod
-  def _read_signal(self, sig): return sig.value
-
-  @classmethod
-  def _read_timestamp(self, sig): return sig.start_ts
-
-  @classmethod
-  def _set_signal(self, sig, value): sig.value = value
-
-  @classmethod
-  def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
-    self._set_signal(ret := self.signals_pool.pop(), value)
-    if (sync_event:=kwargs.get('sync_event')) is not None:
-      ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
-      ret.event_id = sync_event.event_id
-    else: ret.event_mailbox_ptr = ret.event_id = 0
-    return ret
-
-  @classmethod
-  def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
-    assert signal.event_id != 0, "can't wait on this signal"
-    evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
-
-    # Wait active for 5s, then going to sleep.
-    start_time = time.time() * 1000
-    while (time_spent:=time.time() * 1000 - start_time) < timeout:
-      if signal.value >= value: return
-      if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
-    raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
-
   def __init__(self, device:str=""):
     if AMDDevice.kfd == -1:
       AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
-
+      gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+      gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
+
     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
     with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
     self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
     target = int(self.properties['gfx_target_version'])
     self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
+    if target < 110000 or target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+
     kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
 
     if AMDDevice.event_page is None:
-      AMDDevice.signals_page = self._gpu_alloc(
+      AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      for off in range(0, AMDDevice.signals_page.size,
-
-      sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+      AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
+      kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
     else:
      self._gpu_map(AMDDevice.signals_page)
      self._gpu_map(AMDDevice.event_page)
-      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
-
-    self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()
-
-    self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
-    self.kernargs_ptr = self.kernargs.va_addr
 
     # Scratch setup
     max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
@@ -515,12 +414,8 @@ class AMDDevice(HCQCompatCompiled):
     self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
     self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
 
-    super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
-
-
-  def _gpu2cpu_time(self, gpu_time, is_copy):
-    if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
-    return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+    super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
+                     AMDSignal, AMDComputeQueue, AMDCopyQueue, (AMDSignal(alloc_event=True), AMDSignal(alloc_event=True)))
 
   def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
     gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
@@ -541,10 +436,7 @@ class AMDDevice(HCQCompatCompiled):
                         read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
 
-  def
-
-
-
-    self.kernargs_ptr = self.kernargs.va_addr
-    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
+  def invalidate_caches(self):
+    AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
+    self.timeline_value += 1
+    self.synchronize()