tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
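The same comparison can be reproduced locally. Below is a minimal sketch (not part of tinygrad or of any registry tooling) that assumes both wheels have already been downloaded, e.g. with `pip download tinygrad==0.9.0` and `pip download tinygrad==0.9.1`; the local filenames used here are assumptions. A wheel is a zip archive, so the text files inside the two archives can be diffed directly:

import difflib, zipfile

OLD_WHEEL = "tinygrad-0.9.0-py3-none-any.whl"  # assumed local filenames
NEW_WHEEL = "tinygrad-0.9.1-py3-none-any.whl"

def wheel_texts(path):
  # a .whl file is a zip archive; map member name -> decoded text
  with zipfile.ZipFile(path) as zf:
    return {n: zf.read(n).decode("utf-8", errors="replace") for n in zf.namelist()}

old, new = wheel_texts(OLD_WHEEL), wheel_texts(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
  a = old.get(name, "").splitlines(keepends=True)
  b = new.get(name, "").splitlines(keepends=True)
  diff = difflib.unified_diff(a, b, fromfile=f"0.9.0/{name}", tofile=f"0.9.1/{name}")
  print("".join(diff), end="")

Binary members would need to be skipped or compared byte-wise; this sketch only handles text files. The list of changed files and the per-file diff for tinygrad/runtime/ops_amd.py follow.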
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +78 -90
- tinygrad/codegen/linearizer.py +237 -169
- tinygrad/codegen/uops.py +278 -242
- tinygrad/device.py +147 -10
- tinygrad/dtype.py +7 -7
- tinygrad/engine/graph.py +16 -16
- tinygrad/engine/jit.py +39 -36
- tinygrad/engine/realize.py +6 -5
- tinygrad/engine/schedule.py +15 -7
- tinygrad/engine/search.py +6 -3
- tinygrad/function.py +17 -23
- tinygrad/helpers.py +77 -8
- tinygrad/lazy.py +26 -26
- tinygrad/multi.py +13 -9
- tinygrad/nn/__init__.py +1 -1
- tinygrad/nn/datasets.py +2 -1
- tinygrad/nn/state.py +3 -4
- tinygrad/ops.py +49 -16
- tinygrad/renderer/__init__.py +8 -4
- tinygrad/renderer/assembly.py +93 -100
- tinygrad/renderer/cstyle.py +47 -42
- tinygrad/renderer/llvmir.py +30 -30
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +11504 -1
- tinygrad/runtime/autogen/comgr.py +36 -10
- tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +20 -11
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +3 -2
- tinygrad/runtime/graph/cuda.py +2 -2
- tinygrad/runtime/graph/hcq.py +122 -78
- tinygrad/runtime/ops_amd.py +302 -316
- tinygrad/runtime/ops_cuda.py +3 -3
- tinygrad/runtime/ops_disk.py +70 -5
- tinygrad/runtime/ops_gpu.py +2 -2
- tinygrad/runtime/ops_metal.py +5 -6
- tinygrad/runtime/ops_npy.py +1 -1
- tinygrad/runtime/ops_nv.py +161 -166
- tinygrad/runtime/ops_python.py +20 -16
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +5 -2
- tinygrad/shape/symbolic.py +1 -3
- tinygrad/shape/view.py +34 -19
- tinygrad/tensor.py +219 -135
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0.dist-info/RECORD +0 -60
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
-from typing import Tuple, List, Any
-import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
-from
-from tinygrad.
+from typing import Tuple, List, Any
+import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
+from dataclasses import dataclass
+from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
-from tinygrad.runtime.ops_hsa import HSACompiler
 import tinygrad.runtime.autogen.kfd as kfd
 import tinygrad.runtime.autogen.hsa as hsa
 import tinygrad.runtime.autogen.amd_gpu as amd_gpu
@@ -50,237 +50,254 @@ def ioctls_from_header():
   return type("KIO", (object, ), fxns)
 kio = ioctls_from_header()

-
-# TODO: clean up this, if we want to keep it
-  structs = {}
-  for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
-    names = set()
-    fields = []
-    for pkt_fields in pkt._fields_:
-      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
-      else:
-        assert pkt_fields[1]._fields_[0][0] == '_0'
-        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
-          fname = union_fields[0]
-          if fname in names: fname = pkt_fields[0]+fname
-          names.add(fname)
-          # merge together 64-bit fields, otherwise just append them
-          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
-          else: fields.append(tuple([fname, *union_fields[1:]]))
-    new_name = name[16:-4].lower()
-    structs[new_name] = init_c_struct_t(tuple(fields))
-    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
-  return type("SDMA_PKTS", (object, ), structs)
-sdma_pkts = create_sdma_packets()
-
-class AMDCompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    super().__init__(f"compile_hip_{self.arch}")
-  def compile(self, src:str) -> bytes:
-    try: return compile_hip(src, self.arch)
-    except RuntimeError as e: raise CompileError(e)
-
-PAGE_SIZE = 0x1000
-SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
+SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
 SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset

-BASE_ADDR = 0x00001260
-SUB = amd_gpu.PACKET3_SET_SH_REG_START - BASE_ADDR
-
-regCOMPUTE_PGM_LO = 0x1bac - SUB
-regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
-regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
-regCOMPUTE_START_X = 0x1ba4 - SUB
-regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
-regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
-regCOMPUTE_RESTART_X = 0x1bbb - SUB
-regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
-regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
-regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
-
 regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
 regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107

 # VGT_EVENT_TYPE in navi10_enum.h
 CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
-CS_PARTIAL_FLUSH = 0x7

 WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

-COMPUTE_SHADER_EN = 1
-FORCE_START_AT_000 = 1 << 2
-CS_W32_EN = 1 << 15
+COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)

-
-
-
+def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
+def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
+def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)

-
-
-
-
-
-
-
-
-
-
-
-
-    gl1=1
-    gl2=1
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
-               sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
+class AMDCompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_hip_{self.arch}")
+  def compile(self, src:str) -> bytes:
+    try: return compile_hip(src, self.arch)
+    except RuntimeError as e: raise CompileError(e) from e
+
+class HWQueue:
+  def __init__(self): self.q, self.cmd_offsets = [], [0]
+  def _mark_command_end(self):
+    self.cmd_offsets.append(len(self.q))
     return self
+  def _patch(self, off, data): self.q[off:off+len(data)] = array.array('I', data)
+  def __len__(self): return len(self.cmd_offsets) - 1

-
-
-    self.
+class HWPM4Queue(HWQueue):
+  def __init__(self):
+    self.binded_device, self.ptr_to_dispatch_packet = None, {}
+    super().__init__()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    self.q += [amd_gpu.PACKET3(amd_gpu.
-
-
-    self.
-    self.
-
-
-    self.
-
+  def __del__(self):
+    if self.binded_device is not None:
+      self.binded_device.synchronize()
+      self.binded_device._gpu_free(self.hw_page)
+
+  def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
+
+  def memory_barrier(self):
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
+               amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
+               nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
+    self._invalidate_cache()
+    return self._mark_command_end()
+
+  def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
+    self._invalidate_cache()
+
+    user_data = [*data64_le(kernargs)]
+    if hasattr(prg, 'dispatch_packet_offset'):
+      dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
+      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
+      dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
+      user_data = [*data64_le(dp_addr)] + user_data
+      self.ptr_to_dispatch_packet[len(self)] = dp
+
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
+               *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

     if signal is not None: self.signal(signal, signal_value)
-    return self
+    return self._mark_command_end()

-  def update_exec(self,
+  def update_exec(self, cmd_idx, global_size, local_size):
     # Patch the exec cmd with new launch dims
-    assert self.q[
-    self.q[
-    self.q[
+    assert self.q[self.cmd_offsets[cmd_idx] + 60] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), f"Command at index {cmd_idx} is not exec"
+    self.q[self.cmd_offsets[cmd_idx] + 52 : self.cmd_offsets[cmd_idx] + 55] = array.array('I', local_size)
+    self.q[self.cmd_offsets[cmd_idx] + 61 : self.cmd_offsets[cmd_idx] + 64] = array.array('I', global_size)
+
+    if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
+      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
+      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]

   def wait(self, signal:hsa.amd_signal_t, value=0):
     addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-               amd_gpu.WAIT_REG_MEM_ENGINE(0), addr
-    return self
+               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
+    return self._mark_command_end()

-  def
+  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+    cache_flush_flags = 0
+
+    if cache_flush:
+      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+    # event_index__mec_release_mem__end_of_pipe = 5
+    # event_index__mec_release_mem__shader_done = 6
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
-
-               amd_gpu.
-
-
-
-
+               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+               *data64_le(address), *data64_le(value), cst]
+
+  def timestamp(self, sig):
+    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
+                      address=ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+    return self._mark_command_end()

   def signal(self, signal:hsa.amd_signal_t, value=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
-
-
-    # event_index__mec_release_mem__end_of_pipe = 5
-    # event_index__mec_release_mem__shader_done = 6
-    amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
-    amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
-    amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
-    amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
-    amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
-    addr&0xFFFFFFFF, addr>>32,
-    value&0xFFFFFFFF, value>>32, 0]
+    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
+                      value=value, cache_flush=True)
     if signal.event_mailbox_ptr != 0:
-      self.
-
-
-
-
-
-
-
-      signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
-      signal.event_id&0xFFFFFFFF, signal.event_id>>32,
-      signal.event_id]
+      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
+                        value=signal.event_id, cst=signal.event_id, cache_flush=True)
+    return self._mark_command_end()
+
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), f"Command at index {cmd_idx} is not wait"
+    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 2, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 4] = value
     return self

-  def
-
-
-
-
-
+  def update_signal(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal"
+    if signal is not None:
+      self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+      if signal.event_mailbox_ptr != 0:
+        self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
+    if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)])
     return self

-
-
-
-
-
-sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
-                              GCR_CONTROL_GL2_RANGE=0)
+  def bind(self, device: AMDDevice):
+    self.binded_device = device
+    self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+    for i, value in enumerate(self.q): hw_view[i] = value

-
-
-
-
-  def submit(self, device:AMDDevice):
-
-    if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
-    for cmd in self.q:
-      if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
-        ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
-        device.sdma_doorbell_value += fill
-      ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
-      device.sdma_doorbell_value += cmdsz
-    device.sdma_write_pointer[0] = device.sdma_doorbell_value
-    device.sdma_doorbell[0] = device.sdma_doorbell_value
-    return self
+    self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
+                         len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
+    self.q = hw_view # type: ignore
+
+  def submit(self, device: AMDDevice):
+    cmds = self.indirect_cmd if device == self.binded_device else self.q

-
-
+    for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
+
+    device.compute_queue.put_value += len(cmds)
+    device.compute_queue.write_ptr[0] = device.compute_queue.put_value
+    device.compute_queue.doorbell[0] = device.compute_queue.put_value
     return self

+SDMA_MAX_COPY_SIZE = 0x400000
+class HWCopyQueue(HWQueue):
+  def __init__(self):
+    self.internal_cmd_sizes = []
+    super().__init__()
+
+  def _q(self, arr):
+    self.q += arr
+    self.internal_cmd_sizes.append(len(arr))
+
   def copy(self, dest, src, copy_size):
-
-    self.
+    # Invalidate cache inv
+    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
+             amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
+
     copied = 0
-
-    for _ in range(
+    copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+    for _ in range(copy_commands):
       step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
-
-
+
+      self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
+               amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
+
       copied += step_copy_size
-    self.q.append(sdma_cache_wb)
-    return self

-
-    self.
+    # Invalidate cache wb
+    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
+
+    return self._mark_command_end()
+
+  def signal(self, signal: hsa.amd_signal_t, value=0):
+    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
+
     if signal.event_mailbox_ptr != 0:
-      self.
-      self.
+      self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
+      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
+
+    return self._mark_command_end()
+
+  def wait(self, signal: hsa.amd_signal_t, value=0):
+    self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
+             amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
+             amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
+
+    return self._mark_command_end()
+
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait"
+    if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value
     return self

-  def
-    self.
-
-
+  def timestamp(self, sig: hsa.amd_signal_t):
+    self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
+             *data64_le(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
+    return self._mark_command_end()
+
+  def submit(self, device: AMDDevice):
+    if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
+
+    tail_blit_dword = 0
+    for cmdsz in self.internal_cmd_sizes:
+      if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
+      tail_blit_dword += cmdsz
+
+    start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
+    device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
+    device.sdma_queue.put_value += tail_blit_dword * 4
+
+    if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
+      zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
+      ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
+      device.sdma_queue.put_value += zero_fill
+
+      device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
+      device.sdma_queue.put_value += rem_packet_cnt * 4
+
+    device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
+    device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
     return self

 SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
@@ -304,119 +321,92 @@ class AMDProgram:
       if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]

     entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
-    self.handle = self.lib_gpu.va_addr + entry_point
     self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
     self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
     self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
+    self.kernargs_alloc_size = self.kernargs_segment_size
     self.kernargs_offset = 0
-    assert self.private_segment_size <= self.device.max_private_segment_size, \
-      f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"

-
+    lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
+    if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requsted: group_segment_size")
+    if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requsted: private_segment_size")
+
+    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
+    self.rsrc1 = code.compute_pgm_rsrc1
+    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
+
+    if code.kernel_code_properties & 0x2 == 0x2: # ENABLE_SGPR_DISPATCH_PTR
+      # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
+      self.dispatch_packet_offset = self.kernargs_alloc_size
+      self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
+
+    assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
+    assert code.workitem_private_segment_byte_size == 0
+    assert code.max_scratch_backing_memory_byte_size == 0
+    assert code.kernel_code_prefetch_byte_size == 0
+
+    self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
+
+    HWPM4Queue().memory_barrier().submit(self.device)

   # NOTE: no programs are ever freed
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)

   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if self.device.kernargs_ptr + self.
+    if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
       self.device.kernargs_ptr = self.device.kernargs.va_addr
-
+
     if not hasattr(self, "args_struct_t"):
       self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
                                                  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
       if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
-        raise RuntimeError(f"
+        raise RuntimeError(f"AMDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+
     args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
     for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
     for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

+    sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
+
     q = HWPM4Queue()
-    q.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-    if wait: q.timestamp(
+    q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+    if wait or PROFILE: q.timestamp(sig_st)
     q.exec(self, self.device.kernargs_ptr, global_size, local_size)
-    if wait: q.timestamp(
+    if wait or PROFILE: q.timestamp(sig_en)
     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
     self.device.timeline_value += 1
-    self.device.kernargs_ptr += self.
+    self.device.kernargs_ptr += self.kernargs_alloc_size

+    if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
     if wait:
       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-      return (
-
-class AMDAllocator(
-  def __init__(self, device:AMDDevice):
-    self.device = device
-    # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
-    self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(16)]
-    self.b_timeline = [0] * len(self.b)
-    self.b_next = 0
-    super().__init__()
+      return (sig_en.start_ts - sig_st.start_ts) / 1e8
+
+class AMDAllocator(HCQCompatAllocator):
+  def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)

   def _alloc(self, size:int, options:BufferOptions):
     try:
       if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
-
+      return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
     except OSError as e:
       if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
-
-
-  def _free(self,
-  #def as_buffer(self, src:Any) -> memoryview:
-  #  self.device.synchronize()
-  #  return to_mv(src.va_addr, src.size)
-
-  #def copy_from_fd(self, dest, fd, offset, size):
-  #  fo = io.FileIO(fd, "a+b", closefd=False)
-  #  fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
-  #  copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
-  #  for i in range(0, size+minor_offset, self.b[0].size):
-  #    local_size = min(self.b[0].size, total_copy_size-i)
-  #    copy_size = min(local_size-minor_offset, size-copied_in)
-  #    if copy_size == 0: break
-
-  #    fo.readinto(to_mv(self.b[1].va_addr, local_size))
-  #    if i != 0: self.device._wait_signal(self.device.signal_sdma)
-  #    self.b = self.b[::-1]
-  #    self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
-
-  #    copied_in += copy_size
-  #    minor_offset = 0 # only on the first
-  #  self.device._wait_signal(self.device.signal_sdma)
-
-  def copyin(self, dest, src: memoryview):
-    for i in range(0, src.nbytes, self.b[0].size):
-      self.b_next = (self.b_next + 1) % len(self.b)
-      AMDDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
-      ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
-      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-        .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
-        .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-      self.b_timeline[self.b_next] = self.device.timeline_value
-      self.device.timeline_value += 1
-
-  def copyout(self, dest:memoryview, src):
-    self.device.synchronize()
-    for i in range(0, dest.nbytes, self.b[0].size):
-      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-        .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
-        .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-      AMDDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
-      self.device.timeline_value += 1
-
-      ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-
-  def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
-    src_dev._gpu_map(dest)
-    HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
-      .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
-      .copy(dest.va_addr, src.va_addr, sz) \
-      .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
-    HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
-    src_dev.timeline_value += 1
+      raise
+
+  def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)

 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-
+
+@dataclass
+class AMDQueueDesc:
+  ring: memoryview
+  read_ptr: memoryview
+  write_ptr: memoryview
+  doorbell: memoryview
+  put_value: int = 0
+
+class AMDDevice(HCQCompatCompiled):
   kfd:int = -1
   event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
   signals_page:Any = None
@@ -439,7 +429,7 @@ class AMDDevice(Compiled):
     else:
       buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
     assert addr != 0xffffffffffffffff
-    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
     if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
       assert addr == buf == mem.va_addr
@@ -454,13 +444,19 @@ class AMDDevice(Compiled):
       libc.munmap(mem.va_addr, mem.size)
     kio.free_memory_of_gpu(self.kfd, handle=mem.handle)

+  @classmethod
+  def _read_signal(self, sig): return sig.value
+
+  @classmethod
+  def _read_timestamp(self, sig): return sig.start_ts
+
   @classmethod
   def _set_signal(self, sig, value): sig.value = value

   @classmethod
-  def _get_signal(self, value=0,
+  def _get_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
     self._set_signal(ret := self.signals_pool.pop(), value)
-    if sync_event is not None:
+    if (sync_event:=kwargs.get('sync_event')) is not None:
       ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
       ret.event_id = sync_event.event_id
     else: ret.event_mailbox_ptr = ret.event_id = 0
@@ -471,10 +467,11 @@ class AMDDevice(Compiled):
     assert signal.event_id != 0, "can't wait on this signal"
     evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)

+    # Wait active for 5s, then going to sleep.
     start_time = time.time() * 1000
-    while (time.time() * 1000 - start_time) < timeout:
+    while (time_spent:=time.time() * 1000 - start_time) < timeout:
       if signal.value >= value: return
-      kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=
+      if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
     raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")

   def __init__(self, device:str=""):
@@ -500,65 +497,54 @@ class AMDDevice(Compiled):
     self._gpu_map(AMDDevice.event_page)
     sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)

-    self.
-    self.timeline_signal = AMDDevice._get_signal(sync_event=sync_event)
-    self._shadow_timeline_signal = AMDDevice._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
+    self.time_event_st, self.time_event_en = AMDDevice._get_signal(), AMDDevice._get_signal()

     self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
     self.kernargs_ptr = self.kernargs.va_addr

-    #
+    # Scratch setup
     max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
     max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
     self.max_private_segment_size = 4096
     wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
     self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
     self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
-
-
-
-    self.
-    self.sdma_queue =
-
-
-
-
-
-    self.
-
-
-
-
-    self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
-    self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
-
-    from tinygrad.runtime.graph.hcq import HCQGraph
-    super().__init__(device, AMDAllocator(self), AMDRenderer(), HSACompiler(self.arch),
-                     functools.partial(AMDProgram, self),
-                     functools.partial(HCQGraph, AMDDevice, HWPM4Queue, HWCopyQueue))
+    engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
+    self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
+
+    self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
+    self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
+
+    super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), HWPM4Queue, HWCopyQueue,
+                     timeline_signals=[self._get_signal(sync_event=sync_event), self._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))])
+
+  def _gpu2cpu_time(self, gpu_time, is_copy):
+    if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
+    return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
+
+  def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
+    gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
+    eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
+    queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
+                             queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+                             eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
+                             ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
+                             write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
+
+    if not hasattr(self, 'doorbells'):
+      self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+      self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+
+    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
+                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
+                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))

   def synchronize(self):
     AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)

     # reset kernargs
     self.kernargs_ptr = self.kernargs.va_addr
-    if self.timeline_value > (1 << 31):
-
-      self.timeline_signal.value, self.timeline_value = 0, 1
-      cast(AMDAllocator, self.allocator).b_timeline = [0] * len(cast(AMDAllocator, self.allocator).b)
+    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
+    if PROFILE: self._prof_process_events()
|