tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
- tinygrad/__init__.py +6 -0
- tinygrad/codegen/kernel.py +572 -83
- tinygrad/codegen/linearizer.py +415 -395
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +183 -0
- tinygrad/dtype.py +113 -0
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +76 -55
- tinygrad/helpers.py +196 -89
- tinygrad/lazy.py +210 -371
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +202 -22
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +112 -32
- tinygrad/nn/state.py +136 -39
- tinygrad/ops.py +119 -202
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +353 -166
- tinygrad/renderer/llvmir.py +150 -138
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +81 -0
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +75 -0
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +24 -77
- tinygrad/runtime/ops_cuda.py +175 -89
- tinygrad/runtime/ops_disk.py +56 -33
- tinygrad/runtime/ops_gpu.py +92 -95
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +39 -60
- tinygrad/runtime/ops_metal.py +92 -74
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +86 -254
- tinygrad/shape/symbolic.py +166 -141
- tinygrad/shape/view.py +296 -0
- tinygrad/tensor.py +2619 -448
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- tinygrad-0.9.0.dist-info/METADATA +227 -0
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/assembly.py +0 -190
- tinygrad/codegen/optimizer.py +0 -379
- tinygrad/codegen/search.py +0 -72
- tinygrad/graph.py +0 -83
- tinygrad/jit.py +0 -57
- tinygrad/nn/image.py +0 -100
- tinygrad/renderer/assembly_arm64.py +0 -169
- tinygrad/renderer/assembly_ptx.py +0 -98
- tinygrad/renderer/wgsl.py +0 -53
- tinygrad/runtime/lib.py +0 -113
- tinygrad/runtime/ops_cpu.py +0 -51
- tinygrad/runtime/ops_hip.py +0 -82
- tinygrad/runtime/ops_shm.py +0 -29
- tinygrad/runtime/ops_torch.py +0 -30
- tinygrad/runtime/ops_webgpu.py +0 -45
- tinygrad-0.7.0.dist-info/METADATA +0 -212
- tinygrad-0.7.0.dist-info/RECORD +0 -40
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
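
For orientation: the layout changes above move scheduling, realization, and the JIT under tinygrad/engine/, rename mlops.py to function.py, and replace the old per-backend runtimes (ops_cpu, ops_torch, ops_hip, ops_shm, ops_webgpu) with autogenerated driver bindings (runtime/autogen/) plus new user-space AMD and NV drivers (ops_amd.py, ops_nv.py). A minimal sketch of how user code touches the 0.9.0 layout, assuming the re-exports added in tinygrad/__init__.py (illustrative only, not part of the diff):

    from tinygrad import Tensor, TinyJit, dtypes   # 0.7.0 spelled these tinygrad.tensor.Tensor / tinygrad.jit.TinyJit

    @TinyJit
    def step(x: Tensor) -> Tensor:
      return (x @ x.T).relu().realize()            # TinyJit captures the launched kernels after the first call

    out = step(Tensor.rand(16, 16, dtype=dtypes.float32))
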
tinygrad/runtime/graph/metal.py (new file)
@@ -0,0 +1,75 @@
+from typing import List, Any, Dict, cast, Optional
+import Metal
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import dedup, unwrap2, GraphException
+from tinygrad.device import Buffer
+from tinygrad.engine.realize import ExecItem, CompiledRunner
+from tinygrad.engine.jit import GraphRunner
+from tinygrad.shape.symbolic import Variable
+from tinygrad.runtime.ops_metal import wait_check
+
+class MetalGraph(GraphRunner):
+  def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
+    super().__init__(jit_cache, input_rawbuffers, var_vals)
+    if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
+
+    # create metal batch exec
+    icb_descriptor = Metal.MTLIndirectCommandBufferDescriptor.new()
+    icb_descriptor.setCommandTypes_(Metal.MTLIndirectCommandType(Metal.MTLIndirectCommandTypeConcurrentDispatch))
+    icb_descriptor.setInheritBuffers_(False)
+    icb_descriptor.setInheritPipelineState_(False)
+    icb_descriptor.setMaxKernelBufferBindCount_(31)
+    self.icb = self.device.device.newIndirectCommandBufferWithDescriptor_maxCommandCount_options_(icb_descriptor, len(self.jit_cache),
+                                                                                                  Metal.MTLResourceOptions(0))
+    if self.icb is None: raise GraphException("create indirect command buffer failed, does your system support this?")
+
+    if len(self.vars): self.int_buf = self.device.allocator.alloc(len(self.vars)*dtypes.int32.itemsize)
+    all_resources = [self.int_buf] if len(self.vars) else []
+
+    for j,ji in enumerate(self.jit_cache):
+      prg: CompiledRunner = cast(CompiledRunner, ji.prg)
+      descriptor = Metal.MTLComputePipelineDescriptor.new()
+      descriptor.setComputeFunction_(prg.clprg.fxn)
+      descriptor.setSupportIndirectCommandBuffers_(True)
+      icb_command = self.icb.indirectComputeCommandAtIndex_(j)
+      icb_command.setComputePipelineState_(unwrap2(
+        self.device.device.newComputePipelineStateWithDescriptor_options_reflection_error_(descriptor, Metal.MTLPipelineOption(0), None, None)))
+      for i,b in enumerate(ji.bufs):
+        if b is not None:
+          icb_command.setKernelBuffer_offset_atIndex_(b._buf, 0, i)
+          all_resources.append(b._buf)
+      for i,v in enumerate(prg.p.vars): icb_command.setKernelBuffer_offset_atIndex_(self.int_buf, self.vars.index(v)*4, len(ji.bufs)+i)
+      if j not in self.jc_idx_with_updatable_launch_dims:
+        global_size, local_size = prg.p.launch_dims(var_vals)
+        icb_command.concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
+      icb_command.setBarrier()
+
+    self.all_resources = dedup(all_resources)
+    self.command_buffer: Any = None
+    if len(self.vars): self.int_buf_view = self.int_buf.contents().as_buffer(self.int_buf.length()).cast('i')
+
+  def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
+    if self.command_buffer is not None and self.command_buffer in self.device.mtl_buffers_in_flight: wait_check(self.command_buffer)
+    all_resources = dedup(self.all_resources + [x._buf for x in input_rawbuffers])
+
+    for (j,i),input_idx in self.input_replace.items():
+      self.icb.indirectComputeCommandAtIndex_(j).setKernelBuffer_offset_atIndex_(input_rawbuffers[input_idx]._buf, 0, i)
+    for j in self.jc_idx_with_updatable_launch_dims:
+      global_size, local_size = cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals)
+      self.icb.indirectComputeCommandAtIndex_(j).concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size),
+                                                                                                       Metal.MTLSize(*local_size))
+    for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var]
+
+    command_buffer = self.device.mtl_queue.commandBuffer()
+    encoder = command_buffer.computeCommandEncoder()
+    encoder.useResources_count_usage_(all_resources, len(all_resources), Metal.MTLResourceUsageRead | Metal.MTLResourceUsageWrite)
+    encoder.executeCommandsInBuffer_withRange_(self.icb, Metal.MTLIndirectCommandBufferExecutionRangeMake(0, len(self.jit_cache)))
+    encoder.endEncoding()
+    command_buffer.commit()
+    self.command_buffer = command_buffer
+
+    if wait:
+      wait_check(command_buffer)
+      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
+    self.device.mtl_buffers_in_flight.append(command_buffer)
+    return None
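
MetalGraph above is one of the new GraphRunner backends: at JIT-capture time it encodes every cached kernel into a single MTLIndirectCommandBuffer, and each later call only patches input buffers, launch dimensions, and symbolic-variable values before committing one command buffer. A condensed sketch of how such a runner is driven (illustrative; in practice tinygrad/engine/jit.py builds it through the device's graph factory rather than by hand, and the names below come from the hunk above):

    graph = MetalGraph(jit_cache, input_rawbuffers, var_vals)   # encode all ExecItems once into the ICB
    elapsed = graph(input_rawbuffers, var_vals, wait=True)      # replay: one commandBuffer per call, returns GPU time
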
tinygrad/runtime/ops_amd.py (new file)
@@ -0,0 +1,564 @@
+from __future__ import annotations
+from typing import Tuple, List, Any, cast
+import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
+from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
+from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
+from tinygrad.renderer.cstyle import AMDRenderer
+from tinygrad.runtime.driver.hip_comgr import compile_hip
+from tinygrad.runtime.ops_hsa import HSACompiler
+import tinygrad.runtime.autogen.kfd as kfd
+import tinygrad.runtime.autogen.hsa as hsa
+import tinygrad.runtime.autogen.amd_gpu as amd_gpu
+if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
+
+libc = ctypes.CDLL(ctypes.util.find_library("c"))
+libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
+libc.mmap.restype = ctypes.c_void_p
+libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+libc.munmap.restype = ctypes.c_int
+
+if getenv("MOCKGPU"):
+  import extra.mockgpu.mockgpu # noqa: F401
+  libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
+  libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+
+def is_usable_gpu(gpu_id):
+  try:
+    with gpu_id.open() as f:
+      return int(f.read()) != 0
+  except OSError:
+    return False
+
+def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
+  made = made_struct or user_struct(**kwargs)
+  ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
+  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
+  return made
+
+def ioctls_from_header():
+  #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
+  #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
+  #matches = re.findall(pattern, hdr, re.MULTILINE)
+  # get this from python instead
+  hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
+  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
+  matches = re.findall(pattern, hdrpy, re.MULTILINE)
+  idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
+  fxns = {name.replace("AMDKFD_IOC_", "").lower():
+          functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
+          for name, idir, nr, sname in matches}
+  return type("KIO", (object, ), fxns)
+kio = ioctls_from_header()
+
+def create_sdma_packets():
+  # TODO: clean up this, if we want to keep it
+  structs = {}
+  for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
+    names = set()
+    fields = []
+    for pkt_fields in pkt._fields_:
+      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
+      else:
+        assert pkt_fields[1]._fields_[0][0] == '_0'
+        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
+          fname = union_fields[0]
+          if fname in names: fname = pkt_fields[0]+fname
+          names.add(fname)
+          # merge together 64-bit fields, otherwise just append them
+          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
+          else: fields.append(tuple([fname, *union_fields[1:]]))
+    new_name = name[16:-4].lower()
+    structs[new_name] = init_c_struct_t(tuple(fields))
+    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
+  return type("SDMA_PKTS", (object, ), structs)
+sdma_pkts = create_sdma_packets()
+
+class AMDCompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_hip_{self.arch}")
+  def compile(self, src:str) -> bytes:
+    try: return compile_hip(src, self.arch)
+    except RuntimeError as e: raise CompileError(e)
+
+PAGE_SIZE = 0x1000
+SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
+SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
+
+BASE_ADDR = 0x00001260
+SUB = amd_gpu.PACKET3_SET_SH_REG_START - BASE_ADDR
+
+regCOMPUTE_PGM_LO = 0x1bac - SUB
+regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
+regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
+regCOMPUTE_START_X = 0x1ba4 - SUB
+regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
+regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
+regCOMPUTE_RESTART_X = 0x1bbb - SUB
+regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
+regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
+regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
+
+regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
+regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
+
+# VGT_EVENT_TYPE in navi10_enum.h
+CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
+CS_PARTIAL_FLUSH = 0x7
+
+WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
+WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
+
+COMPUTE_SHADER_EN = 1
+FORCE_START_AT_000 = 1 << 2
+CS_W32_EN = 1 << 15
+
+class HWPM4Queue:
+  def __init__(self): self.q = []
+  def ptr(self) -> int: return len(self.q)
+
+  def hdp_flush(self):
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
+               amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | \
+               amd_gpu.WAIT_REG_MEM_ENGINE(0), regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
+
+  def invalidate_cache(self):
+    # overkill?
+    addr=0x0
+    sz=(1 << 64)-1
+    gli=1
+    glv=1
+    glk=1
+    gl1=1
+    gl2=1
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
+               sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
+               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
+    return self
+
+  def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
+    self.hdp_flush()
+    self.invalidate_cache()
+
+    code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
+    assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
+    assert code.workitem_private_segment_byte_size == 0
+    assert code.max_scratch_backing_memory_byte_size == 0
+    assert code.kernel_code_prefetch_byte_size == 0
+    rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2
+
+    # this is required
+    lds_size = ((prg.group_segment_size + 511) // 512) & 0x1FF
+    assert lds_size <= 0x80 # larger numbers stall the GPU
+
+    prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
+               (prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
+
+    if signal is not None: self.signal(signal, signal_value)
+    return self
+
+  def update_exec(self, cmd_ptr, global_size, local_size):
+    # Patch the exec cmd with new launch dims
+    assert self.q[cmd_ptr + 67] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),"The pointer does not point to a packet of this type"
+    self.q[cmd_ptr + 59 : cmd_ptr + 62] = local_size
+    self.q[cmd_ptr + 68 : cmd_ptr + 71] = global_size
+
+  def wait(self, signal:hsa.amd_signal_t, value=0):
+    addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
+               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
+               amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
+    return self
+
+  def timestamp(self, addr):
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+               # event_index__mec_release_mem__end_of_pipe = 5
+               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5),
+               # * 3 - send 64bit GPU counter value
+               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(3) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(0) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+               addr&0xFFFFFFFF, addr>>32, 0, 0, 0]
+    return self
+
+  def signal(self, signal:hsa.amd_signal_t, value=0):
+    # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
+    addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+               # event_index__mec_release_mem__end_of_pipe = 5
+               # event_index__mec_release_mem__shader_done = 6
+               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
+               amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
+               amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
+               amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
+               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+               addr&0xFFFFFFFF, addr>>32,
+               value&0xFFFFFFFF, value>>32, 0]
+    if signal.event_mailbox_ptr != 0:
+      self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+                 # event_index__mec_release_mem__end_of_pipe = 5
+                 # event_index__mec_release_mem__shader_done = 6
+                 amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
+                 amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
+                 amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
+                 amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
+                 amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+                 signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
+                 signal.event_id&0xFFFFFFFF, signal.event_id>>32,
+                 signal.event_id]
+    return self
+
+  def submit(self, device:AMDDevice):
+    wptr = device.pm4_write_pointer[0]
+    pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
+    for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
+    device.pm4_write_pointer[0] = wptr + len(self.q)
+    device.pm4_doorbell[0] = wptr + len(self.q)
+    return self
+
+# prebuilt sdma packets
+sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
+sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
+                               GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
+                               GCR_CONTROL_GL2_RANGE=0)
+sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
+                              GCR_CONTROL_GL2_RANGE=0)
+
+SDMA_MAX_COPY_SIZE = 0x400000
+class HWCopyQueue:
+  def __init__(self): self.q = []
+
+  def submit(self, device:AMDDevice):
+    read_ptr = device.sdma_read_pointer[0]
+    if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
+    for cmd in self.q:
+      if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
+        ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
+        device.sdma_doorbell_value += fill
+      ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
+      device.sdma_doorbell_value += cmdsz
+    device.sdma_write_pointer[0] = device.sdma_doorbell_value
+    device.sdma_doorbell[0] = device.sdma_doorbell_value
+    return self
+
+  def timestamp(self, addr):
+    self.q.append(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr))
+    return self
+
+  def copy(self, dest, src, copy_size):
+    self.q.append(sdma_flush_hdp_pkt) # TODO: do I need this?
+    self.q.append(sdma_cache_inv)
+    copied = 0
+    copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
+    for _ in range(copies_commands):
+      step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
+      self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
+                                          count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
+      copied += step_copy_size
+    self.q.append(sdma_cache_wb)
+    return self
+
+  def signal(self, signal:hsa.amd_signal_t, value=0):
+    self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
+    if signal.event_mailbox_ptr != 0:
+      self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
+      self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
+    return self
+
+  def wait(self, signal:hsa.amd_signal_t, value=0):
+    self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=WAIT_REG_MEM_FUNCTION_GEQ,
+                                        addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
+                                        value=value, mask=0xffffffff, interval=0x04, retry_count=0xfff))
+    return self
+
+SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
+class AMDProgram:
+  def __init__(self, device:AMDDevice, name:str, lib:bytes):
+    # TODO; this API needs the type signature of the function and global_size/local_size
+    self.device, self.name, self.lib = device, name, lib
+
+    if DEBUG >= 6:
+      asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
+      print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
+
+    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
+    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+
+    lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
+    self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
+    lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
+
+    for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
+      if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
+
+    entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
+    self.handle = self.lib_gpu.va_addr + entry_point
+    self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
+    self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
+    self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
+    self.kernargs_offset = 0
+    assert self.private_segment_size <= self.device.max_private_segment_size, \
+      f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
+
+    HWPM4Queue().invalidate_cache().submit(self.device)
+
+  # NOTE: no programs are ever freed
+  def __del__(self):
+    if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
+
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if self.device.kernargs_ptr + self.kernargs_segment_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
+      self.device.kernargs_ptr = self.device.kernargs.va_addr
+    assert self.device.kernargs_ptr + self.kernargs_segment_size <= (self.device.kernargs.va_addr + self.device.kernargs.size), "kernargs overrun"
+    if not hasattr(self, "args_struct_t"):
+      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
+                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
+      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
+        raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
+    args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
+    for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
+    for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
+
+    q = HWPM4Queue()
+    q.wait(self.device.timeline_signal, self.device.timeline_value - 1)
+    if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
+    q.exec(self, self.device.kernargs_ptr, global_size, local_size)
+    if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)
+    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+    self.device.timeline_value += 1
+    self.device.kernargs_ptr += self.kernargs_segment_size
+
+    if wait:
+      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
+      return (self.device.timeline_signal.end_ts - self.device.timeline_signal.start_ts) / 1e8
+
+class AMDAllocator(LRUAllocator):
+  def __init__(self, device:AMDDevice):
+    self.device = device
+    # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
+    self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(16)]
+    self.b_timeline = [0] * len(self.b)
+    self.b_next = 0
+    super().__init__()
+
+  def _alloc(self, size:int, options:BufferOptions):
+    try:
+      if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+      else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
+    except OSError as e:
+      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
+      else: raise
+
+  def _free(self, gpumem, options:BufferOptions): self.device._gpu_free(gpumem)
+  #def as_buffer(self, src:Any) -> memoryview:
+  #  self.device.synchronize()
+  #  return to_mv(src.va_addr, src.size)
+
+  #def copy_from_fd(self, dest, fd, offset, size):
+  #  fo = io.FileIO(fd, "a+b", closefd=False)
+  #  fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
+  #  copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
+  #  for i in range(0, size+minor_offset, self.b[0].size):
+  #    local_size = min(self.b[0].size, total_copy_size-i)
+  #    copy_size = min(local_size-minor_offset, size-copied_in)
+  #    if copy_size == 0: break
+
+  #    fo.readinto(to_mv(self.b[1].va_addr, local_size))
+  #    if i != 0: self.device._wait_signal(self.device.signal_sdma)
+  #    self.b = self.b[::-1]
+  #    self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
+
+  #    copied_in += copy_size
+  #    minor_offset = 0 # only on the first
+  #  self.device._wait_signal(self.device.signal_sdma)
+
+  def copyin(self, dest, src: memoryview):
+    for i in range(0, src.nbytes, self.b[0].size):
+      self.b_next = (self.b_next + 1) % len(self.b)
+      AMDDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
+      ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
+      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                   .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
+                   .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+      self.b_timeline[self.b_next] = self.device.timeline_value
+      self.device.timeline_value += 1
+
+  def copyout(self, dest:memoryview, src):
+    self.device.synchronize()
+    for i in range(0, dest.nbytes, self.b[0].size):
+      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                   .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
+                   .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+      AMDDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
+      self.device.timeline_value += 1
+
+      ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
+
+  def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
+    src_dev._gpu_map(dest)
+    HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
+                 .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
+                 .copy(dest.va_addr, src.va_addr, sz) \
+                 .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
+    HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
+    src_dev.timeline_value += 1
+
+MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+class AMDDevice(Compiled):
+  kfd:int = -1
+  event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
+  signals_page:Any = None
+  signals_pool:List[hsa.amd_signal_t] = []
+  gpus:List[pathlib.Path] = []
+
+  def _gpu_map(self, mem):
+    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
+    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
+    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
+    stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
+    assert stm.n_success == len(mem.mapped_gpu_ids)
+
+  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
+    flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
+    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
+    if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
+    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
+      buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+    else:
+      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
+    assert addr != 0xffffffffffffffff
+    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
+    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
+      buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
+      assert addr == buf == mem.va_addr
+    if map_to_gpu: self._gpu_map(mem)
+    return mem
+
+  def _gpu_free(self, mem):
+    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
+      stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+      assert stm.n_success == len(gpus)
+    libc.munmap(mem.va_addr, mem.size)
+    kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
+
+  @classmethod
+  def _set_signal(self, sig, value): sig.value = value
+
+  @classmethod
+  def _get_signal(self, value=0, sync_event=None) -> hsa.amd_signal_t:
+    self._set_signal(ret := self.signals_pool.pop(), value)
+    if sync_event is not None:
+      ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
+      ret.event_id = sync_event.event_id
+    else: ret.event_mailbox_ptr = ret.event_id = 0
+    return ret
+
+  @classmethod
+  def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
+    assert signal.event_id != 0, "can't wait on this signal"
+    evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
+
+    start_time = time.time() * 1000
+    while (time.time() * 1000 - start_time) < timeout:
+      if signal.value >= value: return
+      kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=100)
+    raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
+
+  def __init__(self, device:str=""):
+    if AMDDevice.kfd == -1:
+      AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
+      AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
+    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
+    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
+    target = int(self.properties['gfx_target_version'])
+    self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
+    kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
+
+    if AMDDevice.event_page is None:
+      AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+      for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
+        AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
+      sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
+    else:
+      self._gpu_map(AMDDevice.signals_page)
+      self._gpu_map(AMDDevice.event_page)
+      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
+
+    self.timeline_value: int = 1
+    self.timeline_signal = AMDDevice._get_signal(sync_event=sync_event)
+    self._shadow_timeline_signal = AMDDevice._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))
+
+    self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+    self.kernargs_ptr = self.kernargs.va_addr
+
+    # scratch setup
+    max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
+    max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
+    self.max_private_segment_size = 4096
+    wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
+    self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
+    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+
+    # SDMA Queue
+    self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
+      queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+      write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
+
+    # doorbell page
+    self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
+    self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
+
+    self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
+    self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
+    self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
+    self.sdma_doorbell_value = 0
+
+    # PM4 Queue
+    self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+    self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+    self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
+      queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
+      eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
+      # TODO: are these needed? (i know eop is)
+      ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
+      ctl_stack_size = 0xa000,
+      write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)
+
+    self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
+    self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
+    self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
+
+    from tinygrad.runtime.graph.hcq import HCQGraph
+    super().__init__(device, AMDAllocator(self), AMDRenderer(), HSACompiler(self.arch),
+                     functools.partial(AMDProgram, self),
+                     functools.partial(HCQGraph, AMDDevice, HWPM4Queue, HWCopyQueue))
+
+  def synchronize(self):
+    AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+
+    # reset kernargs
+    self.kernargs_ptr = self.kernargs.va_addr
+    if self.timeline_value > (1 << 31):
+      self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
+      self.timeline_signal.value, self.timeline_value = 0, 1
+      cast(AMDAllocator, self.allocator).b_timeline = [0] * len(cast(AMDAllocator, self.allocator).b)