tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +141 -201
- tinygrad/codegen/linearize.py +223 -84
- tinygrad/codegen/lowerer.py +60 -42
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +22 -13
- tinygrad/device.py +187 -47
- tinygrad/dtype.py +39 -28
- tinygrad/engine/jit.py +83 -65
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +161 -0
- tinygrad/engine/realize.py +62 -108
- tinygrad/engine/schedule.py +396 -357
- tinygrad/engine/search.py +55 -66
- tinygrad/gradient.py +73 -0
- tinygrad/helpers.py +81 -59
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +91 -66
- tinygrad/ops.py +492 -641
- tinygrad/renderer/__init__.py +95 -36
- tinygrad/renderer/cstyle.py +99 -92
- tinygrad/renderer/llvmir.py +83 -34
- tinygrad/renderer/ptx.py +83 -99
- tinygrad/renderer/wgsl.py +95 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +40 -43
- tinygrad/runtime/ops_amd.py +498 -334
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cpu.py +24 -0
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +159 -42
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +48 -41
- tinygrad/runtime/ops_metal.py +149 -113
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +238 -273
- tinygrad/runtime/ops_python.py +55 -50
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +28 -4
- tinygrad/runtime/support/hcq.py +256 -324
- tinygrad/runtime/support/llvm.py +26 -0
- tinygrad/shape/shapetracker.py +85 -53
- tinygrad/shape/view.py +104 -140
- tinygrad/spec.py +155 -0
- tinygrad/tensor.py +835 -527
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/uopgraph.py +0 -506
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
@@ -1,27 +1,27 @@
 from __future__ import annotations
-import os, ctypes, contextlib, re,
+import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
 assert sys.platform != 'win32'
-from typing import
+from typing import Any, cast, Union, Type
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer,
-from tinygrad.runtime.support.hcq import
-from tinygrad.
-from tinygrad.
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
+from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
+from tinygrad.ops import sint
+from tinygrad.device import BufferSpec, CPUProgram
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
-from tinygrad.runtime.autogen import nv_gpu
+from tinygrad.runtime.autogen import nv_gpu
 from tinygrad.runtime.support.elf import elf_loader
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
-if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

 def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"

 NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
 NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

-def nv_iowr(fd, nr, args):
-  ret =
+def nv_iowr(fd:HWInterface, nr, args):
+  ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

 def rm_alloc(fd, clss, root, parant, params):
@@ -46,8 +46,8 @@ def make_rmctrl_type():
     getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
 rmctrl = make_rmctrl_type()

-def uvm_ioctl(cmd, sttyp, fd, **kwargs):
-  ret =
+def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+  ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
   return made
@@ -58,7 +58,7 @@ def make_uvm_type():
 uvm = make_uvm_type()

 def make_qmd_struct_type():
-  fields:
+  fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
   bits = sorted(bits, key=lambda x: x[1][1])
@@ -71,167 +71,141 @@ def make_qmd_struct_type():
 qmd_struct_t = make_qmd_struct_type()
 assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

-def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-
 class NVSignal(HCQSignal):
-  def __init__(self,
-
-
-
-
-
-
-  def
-
-
+  def __init__(self, base_addr:int|None=None, **kwargs):
+    super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+
+  def __del__(self):
+    if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
+
+class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
+  def __init__(self):
+    self.active_qmd = None
+    super().__init__()
+
   def __del__(self):
-    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size,
+    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+  def nvm(self, subchannel, mthd, *args, typ=2): self.q((typ << 28) | (len(args) << 16) | (subchannel << 13) | (mthd >> 2), *args)

-  @hcq_command
   def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
-    if compute_class: self.
-    if copy_class: self.
-    if local_mem_window: self.
-    if shared_mem_window: self.
-    if local_mem: self.
-    if local_mem_tpc_bytes: self.
-
-
-
-
-
-
-
-
-
-  def
-
-
-    self.binded_device = device
-    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+    if compute_class: self.nvm(1, nv_gpu.NVC6C0_SET_OBJECT, compute_class)
+    if copy_class: self.nvm(4, nv_gpu.NVC6C0_SET_OBJECT, copy_class)
+    if local_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, *data64(local_mem_window))
+    if shared_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, *data64(shared_mem_window))
+    if local_mem: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, *data64(local_mem))
+    if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
+    return self
+
+  def wait(self, signal:NVSignal, value:sint=0):
+    self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
+    self.active_qmd = None
+    return self
+
+  def timestamp(self, signal:NVSignal): return self.signal(signal, 0)
+
+  def bind(self, dev:NVDevice):
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
-    for i, value in enumerate(self.
+    for i, value in enumerate(self._q): hw_view[i] = value

     # From now on, the queue is on the device for faster submission.
-    self.
+    self._q = hw_view

-  def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
+  def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
     if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
     else:
-
-
-
-      dev.cmdq_wptr = 0
+      cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
+      cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
+      dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)

-
-      cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
-      dev.cmdq_wptr += len(self.q) * 4
-
-    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
+
+    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
     dev.gpu_mmio[0x90 // 4] = gpfifo.token
     gpfifo.put_value += 1

-class NVComputeQueue(NVCommandQueue
-  def
-    self.
-
+class NVComputeQueue(NVCommandQueue):
+  def memory_barrier(self):
+    self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
+    self.active_qmd = None
+    return self

-  def
+  def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+    self.bind_args_state(args_state)

-  def _exec(self, prg, args_state, global_size, local_size):
     ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
     assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"

-
-    self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
-    self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+    qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update

-
-
+    self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
+    self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
+    self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
     qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

-    if
-      self.
-      self.
+    if self.active_qmd is None:
+      self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
+      self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
     else:
-
-
-
-
-
-
-
-
-
-
-  def _signal(self, signal, value=0):
-    if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
+      self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+      self.active_qmd.dependent_qmd0_action = 1
+      self.active_qmd.dependent_qmd0_prefetch = 1
+      self.active_qmd.dependent_qmd0_enable = 1
+
+    self.active_qmd = qmd
+    return self
+
+  def signal(self, signal:NVSignal, value:sint=0):
+    if self.active_qmd is not None:
       for i in range(2):
-        if getattr(
-          setattr(
-
-
-          self
-
-
-
-    self.
-
-    self
-
-  def
-
-
-
-
-
-
-
-
-
-    self.
-    self.
-
-
-
-
-
-  def
-
-
-
-  def _update_signal(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
-    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
-
-  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)
-
-class NVArgsState(HCQArgsState):
-  def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
-    super().__init__(ptr, prg, bufs, vals=vals)
-
-    if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
-    kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
-    to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
-    self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
-    self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
-
-  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
-  def update_var(self, index:int, val:int): self.vals[index] = val
+        if getattr(self.active_qmd, f'release{i}_enable') == 0:
+          setattr(self.active_qmd, f'release{i}_enable', 1)
+          self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
+          self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
+          return self
+
+    self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
+             (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)) # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+    self.nvm(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 0x0)
+    self.active_qmd = None
+    return self
+
+  def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.compute_gpfifo)
+
+class NVCopyQueue(NVCommandQueue):
+  def copy(self, dest:sint, src:sint, copy_size:int):
+    self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src), *data64(dest))
+    self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, copy_size)
+    self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+    return self
+
+  def signal(self, signal:NVSignal, value:sint=0):
+    self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
+    self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
+    return self
+
+  def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
+
+class NVArgsState(CLikeArgsState):
+  def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
+    if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
+    super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

 class NVProgram(HCQProgram):
-  def __init__(self,
-    self.
+  def __init__(self, dev:NVDevice, name:str, lib:bytes):
+    self.dev, self.name, self.lib = dev, name, lib

     if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
     else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

     # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-    self.lib_gpu = self.
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))

     self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
-    self.constbufs:
+    self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
     for sh in sections:
       if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
       if sh.name == f".text.{self.name}":
@@ -243,7 +217,7 @@ class NVProgram(HCQProgram):
         if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240

     # Ensure device has enough local memory to run the program
-    self.
+    self.dev._ensure_has_local_memory(self.lcmem_usage)

     # Apply relocs
     for apply_image_offset, rel_sym_offset, typ, _ in relocs:
@@ -256,15 +230,16 @@ class NVProgram(HCQProgram):
     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

     self.constbuffer_0 = [0] * 88
-    self.constbuffer_0[6:12] = [*data64_le(self.
+    self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]

     smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
-    self.qmd
+    self.qmd: ctypes.Structure = \
+      qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                    invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
                    cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
                    shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
                    max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
-                   barrier_count=1, shader_local_memory_high_size=self.
+                   barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
                    program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)

     for i,(addr,sz) in self.constbufs.items():
@@ -273,32 +248,32 @@ class NVProgram(HCQProgram):
       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
       self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

-    # Registers allocation granularity per warp is 256, warp
+    # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
     self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
-    super().__init__(NVArgsState, self.
+    super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))

   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.
+    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))

-  def __call__(self, *bufs, global_size:
-    if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.
-      raise RuntimeError("Too many resources requested for launch")
+  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
+    if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
+      raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")
     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
       raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
     return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

-class NVAllocator(HCQAllocator):
-  def _alloc(self, size:int, options:
-    if options.host: return self.
-    return self.
+class NVAllocator(HCQAllocator['NVDevice']):
+  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    if options.host: return self.dev._gpu_alloc(size, host=True, tag="user host memory")
+    return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")

-  def _free(self, opaque, options:
-    self.
-    self.
+  def _free(self, opaque:HCQBuffer, options:BufferSpec):
+    self.dev.synchronize()
+    self.dev._gpu_free(opaque)

-  def map(self, buf:HCQBuffer): self.
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)

 @dataclass
 class GPFifo:
@@ -309,119 +284,114 @@ class GPFifo:
   put_value: int = 0

 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class NVDevice(HCQCompiled):
+class NVDevice(HCQCompiled[NVSignal]):
   root = None
-  fd_ctl:
-  fd_uvm:
-  gpus_info: Union[
+  fd_ctl: HWInterface
+  fd_uvm: HWInterface
+  gpus_info: Union[list, ctypes.Array] = []
   signals_page: Any = None
-  signals_pool:
-
-
+  signals_pool: list[int] = []
+
+  # TODO: Need a proper allocator for va addresses
+  # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+  # VA space is 48bits.
+  low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x8000000000 if OSX else 0x1000000000, wrap=False)
+  uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
   host_object_enumerator: int = 0x1000

   def _new_gpu_fd(self):
-    fd_dev =
-    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+    fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
+    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
     return fd_dev

   def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
-    fd_dev = self._new_gpu_fd() if not system else
-    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
-      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.
+    fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
+      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
-
-
-
-
-
-    size = round_up(size,
-
-
-
-
-
-    flags=(nv_gpu.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
-    return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
-
-  def _gpu_free(self, mem):
-    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
-      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
+    return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
+
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
+    # Uncached memory is "system". Use huge pages only for gpu memory.
+    page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
+    size = round_up(size, page_size)
+    va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
+
+    if host:
+      va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+
+      flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
+        | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
+
+      NVDevice.host_object_enumerator += 1
+      made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
+        hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
+      nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+
+      if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
+      mem_handle = made.params.hObjectNew
+    else:
+      attr = ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contiguous else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27) \
+        | (nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE if page_size > 0x1000 else 0) << 23 | ((nv_gpu.NVOS32_ATTR_LOCATION_PCI if uncached else 0) << 25)
+
+      attr2 = ((nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO if uncached else nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES) << 2) \
+        | ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB if page_size > 0x1000 else 0) << 20) | nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC
+
+      fl = nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED | nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE \
+        | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | (nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM if not uncached else 0)
+
+      alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
+      alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
+        type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
+      mem_handle = rm_alloc(self.fd_ctl, alloc_func, self.root, self.nvdevice, alloc_params).hObjectNew
+
+    if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)
+
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
+
+  def _gpu_free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
+      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")

-    self._debug_mappings.pop((mem.va_addr, mem.size))
-    uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
-    if mem.has_cpu_mapping:
+    self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
+    uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
+    if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)

-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") ->
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

-    # NOTE: va_addr is set to make rawbufs
+    # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
     self._debug_mappings[(va_base, size)] = tag
-    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl
-
+    return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
+      hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
+      mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))

-  def _gpu_map(self, mem):
-    if self.gpu_uuid in mem.mapped_gpu_ids: return
-    mem.mapped_gpu_ids.append(self.gpu_uuid)
-    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+    mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")

   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
-    if force_low
-      NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
-      assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
-    else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
-    return res_va
+    return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)

   def _setup_nvclasses(self):
     classlist = memoryview(bytearray(100 * 4)).cast('I')
-    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.
+    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
     self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
     self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)

   def __init__(self, device:str=""):
     if NVDevice.root is None:
-      NVDevice.fd_ctl =
-      NVDevice.fd_uvm =
-      fd_uvm_2 =
+      NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+      NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+      self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
      NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
      uvm.initialize(self.fd_uvm)
-      with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
+      with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too

     nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
     visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
@@ -438,40 +408,40 @@ class NVDevice(HCQCompiled):

     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
                                                    vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
-    self.
-    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.
+    self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
     self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
     self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

     self._setup_nvclasses()
-    self._debug_mappings:
+    self._debug_mappings: dict[tuple[int, int], str] = dict()

     rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
       (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))

     vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
       flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
-    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.
+    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew

     raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
     self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))

     uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
-    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
+    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)

-    for dev in cast(
+    for dev in cast(list[NVDevice], self.devices):
       try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
       except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e

     if NVDevice.signals_page is None:
-      NVDevice.signals_page = self.
-      NVDevice.signals_pool = [
+      NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
+      NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
     else: self._gpu_map(NVDevice.signals_page)

     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
-    channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.
+    channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew

-    gpfifo_area = self._gpu_alloc(0x200000,
+    gpfifo_area = self._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000, tag="gpfifo")

     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
     ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
@@ -481,9 +451,9 @@ class NVDevice(HCQCompiled):

     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

-    self.cmdq_page:
-    self.
-    self.
+    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
+    self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I")

     self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
       'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
@@ -496,10 +466,10 @@ class NVDevice(HCQCompiled):
     self._setup_gpfifos()

   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
-    notifier = self.
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+    notifier = self._gpu_alloc(48 << 20, uncached=True)
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
@@ -507,13 +477,13 @@ class NVDevice(HCQCompiled):
     if enable_debug:
       self.debug_compute_obj, self.debug_channel = comp, gpfifo
       debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
-      self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.
+      self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew

     ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
     assert ws_token_params.workSubmitToken != -1

     channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
-    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
       hChannel=gpfifo, base=channel_base, length=0x4000000)

     return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
@@ -532,30 +502,25 @@ class NVDevice(HCQCompiled):
     NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
       .signal(self.timeline_signal, self.timeline_value).submit(self)

-    NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
-
-
+    cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
+      .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+      .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

     self.timeline_value += 2

   def _ensure_has_local_memory(self, required):
     if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

-    if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)
-
     self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
     bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+    self.shader_local_mem, ok = self._realloc(self.shader_local_mem, round_up(bytes_per_tpc*self.num_tpc_per_gpc*self.num_gpcs, 0x20000))

-
-
-    # If can't allocate a new size, reallocator the old buffer.
-    self.slm_per_thread = old_slm_per_thread
-    bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
-    self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+    # Realloc failed, restore the old value.
+    if not ok: self.slm_per_thread = old_slm_per_thread

-    NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
-
-
+    cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
+      .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+      .signal(self.timeline_signal, self.timeline_value).submit(self)
     self.timeline_value += 1

   def invalidate_caches(self):