tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- tinygrad/codegen/kernel.py +248 -115
- tinygrad/codegen/lowerer.py +215 -0
- tinygrad/codegen/transcendental.py +310 -0
- tinygrad/codegen/uopgraph.py +622 -0
- tinygrad/codegen/uops.py +235 -393
- tinygrad/device.py +428 -69
- tinygrad/dtype.py +18 -4
- tinygrad/engine/graph.py +19 -32
- tinygrad/engine/jit.py +148 -70
- tinygrad/engine/realize.py +127 -51
- tinygrad/engine/schedule.py +259 -216
- tinygrad/engine/search.py +29 -22
- tinygrad/function.py +9 -0
- tinygrad/helpers.py +87 -49
- tinygrad/lazy.py +34 -35
- tinygrad/multi.py +41 -36
- tinygrad/nn/__init__.py +39 -22
- tinygrad/nn/state.py +3 -3
- tinygrad/ops.py +63 -62
- tinygrad/renderer/__init__.py +43 -21
- tinygrad/renderer/assembly.py +104 -106
- tinygrad/renderer/cstyle.py +87 -60
- tinygrad/renderer/llvmir.py +21 -30
- tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/graph/clang.py +2 -2
- tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +18 -15
- tinygrad/runtime/ops_amd.py +197 -305
- tinygrad/runtime/ops_clang.py +2 -2
- tinygrad/runtime/ops_cuda.py +36 -94
- tinygrad/runtime/ops_disk.py +3 -7
- tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad/runtime/ops_hip.py +70 -0
- tinygrad/runtime/ops_metal.py +38 -27
- tinygrad/runtime/ops_nv.py +283 -363
- tinygrad/runtime/ops_python.py +26 -30
- tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/shape/shapetracker.py +5 -14
- tinygrad/shape/symbolic.py +4 -8
- tinygrad/shape/view.py +34 -22
- tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
- tinygrad-0.9.2.dist-info/RECORD +70 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/runtime/{driver → support}/__init__.py +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
@@ -1,25 +1,19 @@
 from __future__ import annotations
-import os, ctypes, contextlib,
-from typing import Tuple, List, Any
+import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, time, array, decimal
+from typing import Tuple, List, Any, cast, Union, Dict, Type
 from dataclasses import dataclass
-from tinygrad.device import
-
+from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
+                            HCQArgsState, HCQProgram, HCQSignal, BufferOptions
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+from tinygrad.renderer.assembly import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
-from tinygrad.runtime.
-
-
-if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
-
-
-
-libc.mmap.restype = ctypes.c_void_p
-libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
-libc.munmap.restype = ctypes.c_int
-
-if MOCKGPU:=getenv("MOCKGPU"):
-  import extra.mockgpu.mockgpu # noqa: F401
-  libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
-  libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler, nv_disassemble
+from tinygrad.runtime.autogen import nv_gpu, libc
+from tinygrad.runtime.support.elf import elf_loader
+if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
+if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
+
+def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
 
 def nv_iowr(fd, nr, args):
   ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
@@ -27,117 +21,113 @@ def nv_iowr(fd, nr, args):
 
 def rm_alloc(fd, clss, root, parant, params):
   made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
-                                  pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.
+                                  pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
   nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-  if made.status != 0: raise RuntimeError(f"rm_alloc returned {
+  if made.status != 0: raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
   return made
 
-def rm_control(
-  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
-                                  params=ctypes.cast(ctypes.byref(params), ctypes.
+def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
+  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
+                                  params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
   nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-  if made.status != 0: raise RuntimeError(f"rm_control returned {
-  return
+  if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
+  return params
+
+def make_rmctrl_type():
+  return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
+    for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and
+    (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", getattr(nv_gpu, name+"_PARAMS", None)))})
+rmctrl = make_rmctrl_type()
 
 def uvm_ioctl(cmd, sttyp, fd, **kwargs):
   ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
-  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {
+  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
   return made
 
 def make_uvm_type():
-
-
-    for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
-  return type("NVUVM", (object, ), fxns)
+  return type("NVUVM", (object,), {name.replace("UVM_", "").lower(): functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
+    for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
 uvm = make_uvm_type()
 
 def make_qmd_struct_type():
-  fields = []
+  fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
   bits = sorted(bits, key=lambda x: x[1][1])
   for i,(name, data) in enumerate(bits):
-    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0:
+    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
     fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
+    if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
+      fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
   return init_c_struct_t(tuple(fields))
 qmd_struct_t = make_qmd_struct_type()
 assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
 
 def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-
-
-
-
-
-  self.
-
-
-
-
-
-  def
-
-
-
-
-
-
-
-class HWQueue:
-  def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
+
+
+class NVSignal(HCQSignal):
+  def __init__(self, value=0):
+    self._signal = NVDevice.signals_pool.pop()
+    self.signal_addr = mv_address(self._signal)
+    super().__init__(value)
+  def __del__(self): NVDevice.signals_pool.append(self._signal)
+  def _get_value(self) -> int: return self._signal[0]
+  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
+  def _set_value(self, new_value:int): self._signal[0] = new_value
+  def wait(self, value:int, timeout:int=10000):
+    start_time = time.time() * 1000
+    while time.time() * 1000 - start_time < timeout:
+      if self._signal[0] >= value: return
+    raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+
+class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
   def __del__(self):
     if self.binded_device is not None:
       self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
       self.binded_device._gpu_free(self.hw_page)
 
-
-
-
-
-
-
-
-
-
+  @hcq_command
+  def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
+    if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
+    if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
+    if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
+    if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
+    if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
+    if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0x40]
+
+  def _wait(self, signal, value=0):
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
               (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-    return self._mark_command_end()
-
-  def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)
 
-  def
-    self.q
-
-    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-    return self._mark_command_end()
+  def _update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
+    if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))
 
-  def
-  def update_wait(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
-    if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
-    return self
+  def _timestamp(self, signal): return self._signal(signal, 0)
 
-  def bind(self, device
+  def bind(self, device):
     self.binded_device = device
-    self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
-    hw_view = to_mv(self.hw_page.
+    self.hw_page = cast(NVDevice, device)._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self.q): hw_view[i] = value
 
    # From now on, the queue is on the device for faster submission.
    self.q = hw_view # type: ignore
 
-  def
+  def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
    if len(self.q) == 0: return
 
-    if dev == self.binded_device: cmdq_addr = self.hw_page.
+    if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
    else:
-      if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.
-        assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.
+      if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
+        assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
               gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
        dev.cmdq_wptr = 0
 
      dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
-      cmdq_addr = dev.cmdq_page.
+      cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
      dev.cmdq_wptr += len(self.q) * 4
 
    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
@@ -145,37 +135,26 @@ class HWQueue:
    dev.gpu_mmio[0x90 // 4] = gpfifo.token
    gpfifo.put_value += 1
 
-class HWComputeQueue
+class NVComputeQueue(NVCommandQueue, HWComputeQueue):
  def __init__(self):
+    self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
    super().__init__()
-    self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}
 
-  def
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
-    return self._mark_command_end()
+  def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
 
-  def
-
-
-
-    self.
+  def _exec(self, prg, args_state, global_size, local_size):
+    cmd_idx = len(self) - 1
+
+    ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+    self.cmd_idx_to_qmd[cmd_idx] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+    self.cmd_idx_to_global_dims[cmd_idx] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+    self.cmd_idx_to_local_dims[cmd_idx] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
 
    qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
    qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
-    qmd.constant_buffer_addr_lower_0 =
-
-    if
-      qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
-      qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
-      qmd.release0_payload_lower = signal_value & 0xffffffff
-      qmd.release0_payload_upper = signal_value >> 32
-      qmd.release0_enable = 1
-
-    if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
-      self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+    qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
+
+    if (prev_qmd:=self.cmd_idx_to_qmd.get(cmd_idx - 1)) is None:
      self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
      self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
    else:
@@ -183,180 +162,145 @@ class HWComputeQueue(HWQueue):
      prev_qmd.dependent_qmd0_action = 1
      prev_qmd.dependent_qmd0_prefetch = 1
      prev_qmd.dependent_qmd0_enable = 1
-    return self._mark_command_end()
 
-  def
+  def _update_exec(self, cmd_idx, global_size, local_size):
    # Patch the exec cmd with new launch dims
-    self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
-    self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+    if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+    if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+
+  def _signal(self, signal, value=0):
+    if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 2)) is not None:
+      for i in range(2):
+        if getattr(prev_qmd, f'release{i}_enable') == 0:
+          setattr(prev_qmd, f'release{i}_enable', 1)
+          setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
+          setattr(prev_qmd, f'release{i}_payload', value)
+          self.cmd_idx_to_qmd[len(self) - 1] = prev_qmd
+          self.cmd_idx_to_signal_id[len(self) - 1] = i
+          return
+
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
+               (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
+
+  def _update_signal(self, cmd_idx, signal=None, value=None):
+    if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
+    if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
+    if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)
 
-  def
+  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
 
-class HWCopyQueue
-  def
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *
+class NVCopyQueue(NVCommandQueue, HWCopyQueue):
+  def _copy(self, dest, src, copy_size):
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-    return self._mark_command_end()
 
-  def
-
+  def _update_copy(self, cmd_idx, dest=None, src=None):
+    if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
+    if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
+
+  def _signal(self, signal, value=0):
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *data64(signal.signal_addr), value, 4]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-    return self._mark_command_end()
 
-  def
-    if signal is not None: self.
-    if value is not None: self.
-
+  def _update_signal(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
+    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+
+  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)
 
-
+class NVArgsState(HCQArgsState):
+  def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+    super().__init__(ptr, prg, bufs, vals=vals)
 
-
-
+    if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
+    kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
+    to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
+    self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
+    self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
+
+  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+  def update_var(self, index:int, val:int): self.vals[index] = val
+
+class NVProgram(HCQProgram):
  def __init__(self, device:NVDevice, name:str, lib:bytes):
    self.device, self.name, self.lib = device, name, lib
-    if DEBUG >= 6:
-      try:
-        fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-        with open(fn + ".cubin", "wb") as f: f.write(lib)
-        print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
-      except Exception as e: print("failed to disasm cubin", str(e))
-
-    self.rel_info, self.global_init, self.shmem_usage = None, None, 0
-    constant_buffers_data = {}
-
-    if MOCKGPU:
-      self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
-      constant_buffers_data[0] = memoryview(bytearray(0x190))
-    else:
-      _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
-      sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
-      shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
-      for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
-        section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
-        if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
-        elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
-          self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-          self.registers_usage = sh_info >> 24
-        if match := re.match(r'\.nv\.constant(\d+)', section_name):
-          constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-        if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-        elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
-        elif section_name == ".nv.info":
-          section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-          for i in range(sh_size // 12):
-            if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
-              raise RuntimeError("too high local memory")
+    if DEBUG >= 6: nv_disassemble(lib)
 
-
-
+    if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
+    else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
 
-    # Load program and constant buffers (if any)
    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-    self.
-
-    self.
+    self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
+
+    self.program_addr, self.program_sz, self.registers_usage, self.shmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0
+    self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
+    for sh in sections:
+      if sh.name == f".nv.shared.{self.name}": self.shmem_usage = sh.header.sh_size
+      if sh.name == f".text.{self.name}":
+        self.program_addr, self.program_sz, self.registers_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, sh.header.sh_info>>24
+      elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
+      elif sh.name == ".nv.info":
+        for off in range(0, sh.header.sh_size, 12):
+          typ, _, val = struct.unpack_from("III", sh.content, off)
+          if typ & 0xffff == 0x1204: self.device._ensure_has_local_memory(val + 0x240)
+
+    # Apply relocs
+    for apply_image_offset, rel_sym_offset, typ, _ in relocs:
+      # These types are CUDA-specific, applying them here
+      if typ == 2: image[apply_image_offset:apply_image_offset+8] = struct.pack('<Q', self.lib_gpu.va_addr + rel_sym_offset) # R_CUDA_64
+      elif typ == 0x38: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) & 0xffffffff)
+      elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
+      else: raise RuntimeError(f"unknown NV reloc {typ}")
+
+    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
 
    self.constbuffer_0 = [0] * 88
-    self.constbuffer_0[6:12] = [*
+    self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]
 
    smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
    self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                            invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
-                            cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
+                            cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
                            shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
                            max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.
-
-                            program_prefetch_addr_lower_shifted=self.
-
-
-
-
-
-    self.kernargs_offset = 0x160
-
-    # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
-    if 0 in constant_buffers_data: constant_buffers_data.pop(0)
-
-    off = round_up(self.program.nbytes, 128)
-
-    if self.rel_info is not None:
-      assert self.global_init is not None
-      global_init_addr = self.lib_gpu.base + off
-      for rel_i in range(0, len(self.rel_info), 4):
-        if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
-        elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
-        else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
-
-    HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
-    for st in range(0, len(self.program), 4095):
-      HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
-
-    if self.global_init is not None:
-      HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
-      off += round_up(self.global_init.nbytes, 128)
-      if 4 in constant_buffers_data: # >= 12.4
-        # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
-        assert constant_buffers_data[4].nbytes == 8
-        constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
-
-    for i,data in constant_buffers_data.items():
-      self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
-      self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
-      self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
+                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program_sz>>8,
+                            program_address=self.program_addr, sass_version=0x89,
+                            program_prefetch_addr_lower_shifted=self.program_addr>>8, program_prefetch_addr_upper_shifted=self.program_addr>>40)
+
+    for i,(addr,sz) in self.constbufs.items():
+      self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32)
+      self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (addr) & 0xffffffff)
+      self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
      self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
 
-
-
+    # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
+    self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32
 
-
-    self.device.
-
+    # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
+    super().__init__(NVArgsState, self.device, self.name,
+                     kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)
 
  def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.
+    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
 
-  def __call__(self, *
+  def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
    if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
-      raise RuntimeError("Invalid global/local dims")
+      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
+    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
 
-
-
-
-    # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
-    if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
-    kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
-
-    sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
-    queue = HWComputeQueue()
-    queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-    if wait or PROFILE: queue.timestamp(sig_st)
-    queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
-    queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
-    if wait or PROFILE: queue.timestamp(sig_en)
-    queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-    self.device.kernargs_ptr += self.kernargs_alloc_size
-
-    if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
-    if wait:
-      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-      return (sig_en[1] - sig_st[1]) / 1e9
-
-class NVAllocator(HCQCompatAllocator):
-  def __init__(self, device:NVDevice): super().__init__(device)
-
-  def _alloc(self, size:int, options:BufferOptions):
+class NVAllocator(HCQAllocator):
+  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
    if options.host: return self.device._gpu_host_alloc(size)
    return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
 
  def _free(self, opaque, options:BufferOptions):
    self.device.synchronize()
-
-
+    self.device._gpu_free(opaque)
+
+  def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
 
 @dataclass
 class GPFifo:
@@ -367,19 +311,18 @@ class GPFifo:
  put_value: int = 0
 
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class NVDevice(
+class NVDevice(HCQCompiled):
  root = None
  fd_ctl: int = -1
  fd_uvm: int = -1
-  gpus_info =
+  gpus_info:Union[List, ctypes.Array] = []
  signals_page:Any = None
  signals_pool: List[Any] = []
  uvm_vaddr: int = 0x1000000000
  host_object_enumerator: int = 0x1000
-  devices: List[NVDevice] = []
 
  def _new_gpu_fd(self):
-    fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+    fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
    return fd_dev
 
@@ -388,8 +331,10 @@ class NVDevice(HCQCompatCompiled):
    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
-    if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
-
+    if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
+    res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+    os.close(fd_dev)
+    return res
 
  def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
    size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
@@ -404,7 +349,7 @@ class NVDevice(HCQCompatCompiled):
 
    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
    if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
-    return self._gpu_uvm_map(va_addr, size, mem_handle)
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
 
  def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
    alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
@@ -417,51 +362,56 @@ class NVDevice(HCQCompatCompiled):
    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
    if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
 
-    return self._gpu_uvm_map(va_addr, size, mem_handle)
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
 
  def _gpu_host_alloc(self, size):
-    va_base = self._alloc_gpu_vaddr(
-    libc.mmap(va_base,
-
-
-  def _gpu_free(self, mem):
-    made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
-    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
-    if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
-    uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+    va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
+    mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+    assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
 
-  def _gpu_host_free(self, mem):
-    uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
-    libc.munmap(mem.base, mem.length)
-
-  def _map_to_gpu(self, va_base, size):
    NVDevice.host_object_enumerator += 1
    flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
             (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
    made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
-      hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=
+      hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
    nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
-    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
-    return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)
 
-
+    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
+    return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True)
+
+  def _gpu_free(self, mem):
+    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
+      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
+      if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
+
+    uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
+    if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
    if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
    gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
      nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
 
-    # NOTE: va_addr is set to make rawbufs compatable with
+    # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-
+      gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
 
  def _gpu_map(self, mem):
-    if self.gpu_uuid in
-    mem.
-
+    if self.gpu_uuid in mem.mapped_gpu_ids: return
+    mem.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False)
 
  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
    NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
    return res_va
 
+  def _setup_nvclasses(self):
+    classlist = memoryview(bytearray(100 * 4)).cast('I')
+    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
+    self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
+    self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
+
  def __init__(self, device:str=""):
    if NVDevice.root is None:
      NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
@@ -471,48 +421,48 @@ class NVDevice(HCQCompatCompiled):
      uvm.initialize(self.fd_uvm)
      with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
 
-      NVDevice.
-
+      nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
 
-    # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.fd_dev = self._new_gpu_fd()
 
-
-
-
-
-    self.
+    if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
+      raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
+    self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+    self.fd_dev = self._new_gpu_fd()
 
-    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
+    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
                                                   vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
    self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
    self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
    self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
 
-
+    self._setup_nvclasses()
+
+    rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
      (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
-    rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)
 
    vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
                                                             flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
 
-
-
-    self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
+    raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
+    self.gpu_uuid = (ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)])
 
    uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
                             hClient=self.root, hVaSpace=vaspace)
 
    for dev in self.devices:
-      uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid),
+      uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid),
+                             gpuUuidB=nv_gpu.struct_nv_uuid(uuid=cast(NVDevice, dev).gpu_uuid))
 
    if NVDevice.signals_page is None:
      NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
-      NVDevice.signals_pool = [to_mv(self.signals_page.
+      NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
    else: self._gpu_map(NVDevice.signals_page)
 
    channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
@@ -526,100 +476,70 @@ class NVDevice(HCQCompatCompiled):
    self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
    self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)
 
-
-    rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
-
-    self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
+    rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
 
    self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
-    self.cmdq: memoryview = to_mv(self.cmdq_page.
+    self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
    self.cmdq_wptr: int = 0 # in bytes
 
-
-    self.
-
-    self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
-
-    super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
-                     functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])
-
-    self._cmdq_setup_compute_gpfifo()
-    self._cmdq_setup_dma_gpfifo()
-
-    NVDevice.devices.append(self)
-
-  @classmethod
-  def _read_signal(self, sig): return sig[0]
-
-  @classmethod
-  def _read_timestamp(self, sig): return sig[1]
-
-  @classmethod
-  def _set_signal(self, sig, value): sig[0] = value
+    sm_info = nv_gpu.NV2080_CTRL_GR_INFO(index=nv_gpu.NV2080_CTRL_GR_INFO_INDEX_SM_VERSION)
+    rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=1, grInfoList=ctypes.addressof(sm_info))
+    self.arch: str = f"sm_{(sm_info.data>>8)&0xff}{(val>>4) if (val:=sm_info.data&0xff) > 0xf else val}"
 
-
-
-
-    return sig
+    compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
+    super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
+                     functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue, timeline_signals=(NVSignal(), NVSignal()))
 
-
-  def _wait_signal(self, signal, value=0, timeout=10000):
-    start_time = time.time() * 1000
-    while time.time() * 1000 - start_time < timeout:
-      if signal[0] >= value: return
-    raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
-
-  def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
-
-  def synchronize(self):
-    NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
-    self.cmdq_wptr = 0
-
-    if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
+    self._setup_gpfifos()
 
  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
    notifier = self._gpu_system_alloc(48 << 20)
    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
-      gpFifoOffset=gpfifo_area.
+      gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
    gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
-    rm_alloc(self.fd_ctl, self.
+    rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None)
    rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
 
-    ws_token_params =
-    rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
+    ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
    assert ws_token_params.workSubmitToken != -1
 
    channel_base = self._alloc_gpu_vaddr(0x4000000)
    uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                         hChannel=gpfifo, base=channel_base, length=0x4000000)
 
-    return GPFifo(ring=to_mv(gpfifo_area.
-                  controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.
+    return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+                  controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))
+
+  def _setup_gpfifos(self):
+    # Set windows addresses to not collide with other allocated buffers.
+    self.shared_mem_window, self.local_mem_window, self.slm_per_thread = 0xfe000000, 0xff000000, 0
+
+    NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+                    .signal(self.timeline_signal, self.timeline_value).submit(self)
 
-
-
+    NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
+                 .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+                 .signal(self.timeline_signal, self.timeline_value + 1).submit(self)
+
+    self.timeline_value += 2
+
+  def _ensure_has_local_memory(self, required):
+    if self.slm_per_thread >= required: return
+
+    self.synchronize()
+    if hasattr(self, 'shader_local_mem'): self._gpu_free(self.shader_local_mem) # type: ignore # pylint: disable=access-member-before-definition
+
+    self.slm_per_thread = round_up(required, 32)
    bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
    bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
-    self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True)
+    self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True)
 
-
-
-
-    queue = HWComputeQueue()
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
-    queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+    NVComputeQueue().setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+                    .signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
-    self.synchronize()
 
-  def
-
-
-
-    self.timeline_value += 1
-    self.synchronize()
+  def invalidate_caches(self):
+    rmctrl.fb_flush_gpu_cache(self.fd_ctl, self.root, self.subdevice,
+      flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
+             (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4)))
|