tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/kernel.py +114 -172
- tinygrad/codegen/linearize.py +211 -81
- tinygrad/codegen/lowerer.py +30 -35
- tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
- tinygrad/codegen/transcendental.py +12 -13
- tinygrad/device.py +170 -47
- tinygrad/dtype.py +28 -26
- tinygrad/engine/jit.py +80 -63
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +162 -0
- tinygrad/engine/realize.py +58 -107
- tinygrad/engine/schedule.py +381 -314
- tinygrad/engine/search.py +40 -44
- tinygrad/gradient.py +70 -0
- tinygrad/helpers.py +77 -58
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +89 -64
- tinygrad/ops.py +562 -446
- tinygrad/renderer/__init__.py +79 -36
- tinygrad/renderer/cstyle.py +70 -84
- tinygrad/renderer/llvmir.py +32 -20
- tinygrad/renderer/ptx.py +79 -99
- tinygrad/renderer/wgsl.py +87 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libpciaccess.py +2023 -0
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +19 -21
- tinygrad/runtime/ops_amd.py +488 -327
- tinygrad/runtime/ops_clang.py +15 -28
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +129 -38
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +45 -40
- tinygrad/runtime/ops_metal.py +93 -73
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +232 -270
- tinygrad/runtime/ops_python.py +51 -46
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +63 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +384 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +26 -4
- tinygrad/runtime/support/hcq.py +254 -324
- tinygrad/runtime/support/llvm.py +32 -0
- tinygrad/shape/shapetracker.py +84 -53
- tinygrad/shape/view.py +103 -138
- tinygrad/spec.py +154 -0
- tinygrad/tensor.py +744 -496
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
- tinygrad-0.10.1.dist-info/RECORD +86 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
@@ -1,27 +1,27 @@
 from __future__ import annotations
-import os, ctypes, contextlib, re,
+import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
 assert sys.platform != 'win32'
-from typing import
+from typing import Any, cast, Union, Type
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer,
-from tinygrad.runtime.support.hcq import
-from tinygrad.
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
+from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
+from tinygrad.ops import sint
+from tinygrad.device import BufferSpec
 from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
-from tinygrad.runtime.autogen import nv_gpu
+from tinygrad.runtime.autogen import nv_gpu
 from tinygrad.runtime.support.elf import elf_loader
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
-if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

 def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"

 NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
 NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

-def nv_iowr(fd, nr, args):
-  ret =
+def nv_iowr(fd:HWInterface, nr, args):
+  ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

 def rm_alloc(fd, clss, root, parant, params):
@@ -46,8 +46,8 @@ def make_rmctrl_type():
     getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
 rmctrl = make_rmctrl_type()

-def uvm_ioctl(cmd, sttyp, fd, **kwargs):
-  ret =
+def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+  ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
   return made
@@ -58,7 +58,7 @@ def make_uvm_type():
 uvm = make_uvm_type()

 def make_qmd_struct_type():
-  fields:
+  fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
   bits = sorted(bits, key=lambda x: x[1][1])
@@ -71,167 +71,138 @@ def make_qmd_struct_type():
 qmd_struct_t = make_qmd_struct_type()
 assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

-def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-
 class NVSignal(HCQSignal):
-  def __init__(self,
-
-
-
-
-
-
-  def
-
-
+  def __init__(self, base_addr:int|None=None, **kwargs):
+    super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+
+  def __del__(self):
+    if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
+
+class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
+  def __init__(self):
+    self.active_qmd = None
+    super().__init__()
+
   def __del__(self):
-    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size,
+    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+  def nvm(self, subchannel, mthd, *args, typ=2): self.q((typ << 28) | (len(args) << 16) | (subchannel << 13) | (mthd >> 2), *args)

-  @hcq_command
   def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
-    if compute_class: self.
-    if copy_class: self.
-    if local_mem_window: self.
-    if shared_mem_window: self.
-    if local_mem: self.
-    if local_mem_tpc_bytes: self.
-
-
-
-
-
-
-
-
-
-  def
-
-
-    self.binded_device = device
-    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+    if compute_class: self.nvm(1, nv_gpu.NVC6C0_SET_OBJECT, compute_class)
+    if copy_class: self.nvm(4, nv_gpu.NVC6C0_SET_OBJECT, copy_class)
+    if local_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, *data64(local_mem_window))
+    if shared_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, *data64(shared_mem_window))
+    if local_mem: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, *data64(local_mem))
+    if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
+    return self
+
+  def wait(self, signal:NVSignal, value:sint=0):
+    self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
+    self.active_qmd = None
+    return self
+
+  def timestamp(self, signal:NVSignal): return self.signal(signal, 0)
+
+  def bind(self, dev:NVDevice):
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
-    for i, value in enumerate(self.
+    for i, value in enumerate(self._q): hw_view[i] = value

     # From now on, the queue is on the device for faster submission.
-    self.
+    self._q = hw_view

-  def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
+  def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
     if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
     else:
-
-
-
-      dev.cmdq_wptr = 0
+      cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
+      cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
+      dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)

-
-      cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
-      dev.cmdq_wptr += len(self.q) * 4
-
-    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
     dev.gpu_mmio[0x90 // 4] = gpfifo.token
     gpfifo.put_value += 1

-class NVComputeQueue(NVCommandQueue
-  def
-    self.
-
+class NVComputeQueue(NVCommandQueue):
+  def memory_barrier(self):
+    self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
+    self.active_qmd = None
+    return self

-  def
+  def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+    self.bind_args_state(args_state)

-  def _exec(self, prg, args_state, global_size, local_size):
     ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
     assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"

-
-    self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
-    self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+    qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update

-
-
+    self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
+    self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
     qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

-    if
-      self.
-      self.
+    if self.active_qmd is None:
+      self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
+      self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
     else:
-
-
-
-
-
-  def _update_exec(self, cmd_idx, global_size, local_size):
-    # Patch the exec cmd with new launch dims
-    if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
-    if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
-
-  def _signal(self, signal, value=0):
-    if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
-      for i in range(2):
-        if getattr(prev_qmd, f'release{i}_enable') == 0:
-          setattr(prev_qmd, f'release{i}_enable', 1)
-          setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
-          setattr(prev_qmd, f'release{i}_payload', value)
-          self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
-          self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
-          return
-
-    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
-               (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
-    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-
-  def _update_signal(self, cmd_idx, signal=None, value=None):
-    if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
-    if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
-    if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)
-
-  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
-
-class NVCopyQueue(NVCommandQueue, HWCopyQueue):
-  def _copy(self, dest, src, copy_size):
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-
-  def _update_copy(self, cmd_idx, dest=None, src=None):
-    if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
-    if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
-
-  def _signal(self, signal, value=0):
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-
-  def _update_signal(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
-    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+      self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+      self.active_qmd.dependent_qmd0_action = 1
+      self.active_qmd.dependent_qmd0_prefetch = 1
+      self.active_qmd.dependent_qmd0_enable = 1

-
-
-class NVArgsState(HCQArgsState):
-  def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
-    super().__init__(ptr, prg, bufs, vals=vals)
+    self.active_qmd = qmd
+    return self

+  def signal(self, signal:NVSignal, value:sint=0):
+    if self.active_qmd is not None:
+      for i in range(2):
+        if getattr(self.active_qmd, f'release{i}_enable') == 0:
+          setattr(self.active_qmd, f'release{i}_enable', 1)
+          self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
+          self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
+          return self
+
+    self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
+             (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)) # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+    self.nvm(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 0x0)
+    self.active_qmd = None
+    return self
+
+  def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.compute_gpfifo)
+
+class NVCopyQueue(NVCommandQueue):
+  def copy(self, dest:sint, src:sint, copy_size:int):
+    self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src), *data64(dest))
+    self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, copy_size)
+    self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+    return self
+
+  def signal(self, signal:NVSignal, value:sint=0):
+    self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
+    self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
+    return self
+
+  def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
+
+class NVArgsState(CLikeArgsState):
+  def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
     if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
-
-    to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
-    self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
-    self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
-
-  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
-  def update_var(self, index:int, val:int): self.vals[index] = val
+    super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

 class NVProgram(HCQProgram):
-  def __init__(self,
-    self.
+  def __init__(self, dev:NVDevice, name:str, lib:bytes):
+    self.dev, self.name, self.lib = dev, name, lib

     if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
     else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

     # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-    self.lib_gpu = self.
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))

     self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
-    self.constbufs:
+    self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
     for sh in sections:
       if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
       if sh.name == f".text.{self.name}":
@@ -243,7 +214,7 @@ class NVProgram(HCQProgram):
         if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240

     # Ensure device has enough local memory to run the program
-    self.
+    self.dev._ensure_has_local_memory(self.lcmem_usage)

     # Apply relocs
     for apply_image_offset, rel_sym_offset, typ, _ in relocs:
@@ -256,15 +227,16 @@ class NVProgram(HCQProgram):
     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

     self.constbuffer_0 = [0] * 88
-    self.constbuffer_0[6:12] = [*data64_le(self.
+    self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]

     smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
-    self.qmd
+    self.qmd: ctypes.Structure = \
+      qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                    invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
                    cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
                    shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
                    max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
-                   barrier_count=1, shader_local_memory_high_size=self.
+                   barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
                    program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)

     for i,(addr,sz) in self.constbufs.items():
@@ -273,32 +245,32 @@ class NVProgram(HCQProgram):
       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
       self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

-    # Registers allocation granularity per warp is 256, warp
+    # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
     self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
-    super().__init__(NVArgsState, self.
+    super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))

   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.
+    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))

-  def __call__(self, *bufs, global_size:
-    if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.
-      raise RuntimeError("Too many resources requested for launch")
+  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
+    if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
+      raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")
     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
       raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
     return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

-class NVAllocator(HCQAllocator):
-  def _alloc(self, size:int, options:
-    if options.host: return self.
-    return self.
+class NVAllocator(HCQAllocator['NVDevice']):
+  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    if options.host: return self.dev._gpu_alloc(size, host=True, tag="user host memory")
+    return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")

-  def _free(self, opaque, options:
-    self.
-    self.
+  def _free(self, opaque:HCQBuffer, options:BufferSpec):
+    self.dev.synchronize()
+    self.dev._gpu_free(opaque)

-  def map(self, buf:HCQBuffer): self.
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)

 @dataclass
 class GPFifo:
@@ -309,119 +281,114 @@ class GPFifo:
   put_value: int = 0

 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class NVDevice(HCQCompiled):
+class NVDevice(HCQCompiled[NVSignal]):
   root = None
-  fd_ctl:
-  fd_uvm:
-  gpus_info: Union[
+  fd_ctl: HWInterface
+  fd_uvm: HWInterface
+  gpus_info: Union[list, ctypes.Array] = []
   signals_page: Any = None
-  signals_pool:
-
-
+  signals_pool: list[int] = []
+
+  # TODO: Need a proper allocator for va addresses
+  # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+  # VA space is 48bits.
+  low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x1000000000, wrap=False)
+  uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=0x2000000000, wrap=False)
   host_object_enumerator: int = 0x1000

   def _new_gpu_fd(self):
-    fd_dev =
-    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+    fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
+    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
     return fd_dev

   def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
-    fd_dev = self._new_gpu_fd() if not system else
-    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
-      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.
+    fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
+      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
-
-
-
-
-
-    size = round_up(size,
-
-
-
-
-
-    flags=(nv_gpu.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
-    return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
-
-  def _gpu_free(self, mem):
-    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
-      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
+    return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
+
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
+    # Uncached memory is "system". Use huge pages only for gpu memory.
+    page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
+    size = round_up(size, page_size)
+    va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
+
+    if host:
+      va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+
+      flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
+        | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
+
+      NVDevice.host_object_enumerator += 1
+      made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
+        hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
+      nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+
+      if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
+      mem_handle = made.params.hObjectNew
+    else:
+      attr = ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contiguous else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27) \
+        | (nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE if page_size > 0x1000 else 0) << 23 | ((nv_gpu.NVOS32_ATTR_LOCATION_PCI if uncached else 0) << 25)
+
+      attr2 = ((nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO if uncached else nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES) << 2) \
+        | ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB if page_size > 0x1000 else 0) << 20) | nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC
+
+      fl = nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED | nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE \
+        | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | (nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM if not uncached else 0)
+
+      alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
+      alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
+        type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
+      mem_handle = rm_alloc(self.fd_ctl, alloc_func, self.root, self.nvdevice, alloc_params).hObjectNew
+
+    if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)
+
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
+
+  def _gpu_free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
+      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")

-    self._debug_mappings.pop((mem.va_addr, mem.size))
-    uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
-    if mem.has_cpu_mapping:
+    self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
+    uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
+    if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)

-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") ->
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

-    # NOTE: va_addr is set to make rawbufs
+    # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
     self._debug_mappings[(va_base, size)] = tag
-    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl
-
+    return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
+      hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
+      mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))

-  def _gpu_map(self, mem):
-    if self.gpu_uuid in mem.mapped_gpu_ids: return
-    mem.mapped_gpu_ids.append(self.gpu_uuid)
-    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+    mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")

   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
-    if force_low
-      NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
-      assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
-    else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
-    return res_va
+    return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)

   def _setup_nvclasses(self):
     classlist = memoryview(bytearray(100 * 4)).cast('I')
-    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.
+    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
     self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
     self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)

   def __init__(self, device:str=""):
     if NVDevice.root is None:
-      NVDevice.fd_ctl =
-      NVDevice.fd_uvm =
-      fd_uvm_2 =
+      NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+      NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+      self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
       NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
       uvm.initialize(self.fd_uvm)
-      with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
+      with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too

     nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
     visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
@@ -438,40 +405,40 @@ class NVDevice(HCQCompiled):

     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
                                                    vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
-    self.
-    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.
+    self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
     self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
     self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

     self._setup_nvclasses()
-    self._debug_mappings:
+    self._debug_mappings: dict[tuple[int, int], str] = dict()

     rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
       (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))

     vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
       flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
-    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.
+    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew

     raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
     self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))

     uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
-    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
+    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)

-    for dev in cast(
+    for dev in cast(list[NVDevice], self.devices):
       try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
       except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e

     if NVDevice.signals_page is None:
-      NVDevice.signals_page = self.
-      NVDevice.signals_pool = [
+      NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
+      NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
     else: self._gpu_map(NVDevice.signals_page)

     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
-    channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.
+    channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew

-    gpfifo_area = self._gpu_alloc(0x200000,
+    gpfifo_area = self._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000, tag="gpfifo")

     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
     ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
@@ -481,9 +448,9 @@ class NVDevice(HCQCompiled):

     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

-    self.cmdq_page:
-    self.
-    self.
+    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
+    self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I")

     self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
       'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
@@ -496,10 +463,10 @@ class NVDevice(HCQCompiled):
     self._setup_gpfifos()

   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
-    notifier = self.
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+    notifier = self._gpu_alloc(48 << 20, uncached=True)
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
@@ -507,13 +474,13 @@ class NVDevice(HCQCompiled):
     if enable_debug:
       self.debug_compute_obj, self.debug_channel = comp, gpfifo
       debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
-      self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.
+      self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew

     ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
     assert ws_token_params.workSubmitToken != -1

     channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
-    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
       hChannel=gpfifo, base=channel_base, length=0x4000000)

     return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
@@ -532,30 +499,25 @@ class NVDevice(HCQCompiled):
     NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
                     .signal(self.timeline_signal, self.timeline_value).submit(self)

-    NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
-
-
+    cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
+      .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+      .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

     self.timeline_value += 2

   def _ensure_has_local_memory(self, required):
     if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

-    if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)
-
     self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
     bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+    self.shader_local_mem, ok = self._realloc(self.shader_local_mem, round_up(bytes_per_tpc*self.num_tpc_per_gpc*self.num_gpcs, 0x20000))

-
-
-    # If can't allocate a new size, reallocator the old buffer.
-    self.slm_per_thread = old_slm_per_thread
-    bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
-    self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+    # Realloc failed, restore the old value.
+    if not ok: self.slm_per_thread = old_slm_per_thread

-    NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
-
-
+    cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
+      .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+      .signal(self.timeline_signal, self.timeline_value).submit(self)
     self.timeline_value += 1

   def invalidate_caches(self):