tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +1 -1
- tinygrad/apps/llm.py +206 -0
- tinygrad/codegen/__init__.py +116 -0
- tinygrad/codegen/devectorizer.py +315 -172
- tinygrad/codegen/expander.py +8 -16
- tinygrad/codegen/gpudims.py +89 -0
- tinygrad/codegen/linearize.py +205 -203
- tinygrad/codegen/lowerer.py +92 -139
- tinygrad/codegen/opt/__init__.py +38 -0
- tinygrad/codegen/opt/heuristic.py +125 -0
- tinygrad/codegen/opt/kernel.py +510 -0
- tinygrad/{engine → codegen/opt}/search.py +51 -35
- tinygrad/codegen/opt/swizzler.py +134 -0
- tinygrad/codegen/opt/tc.py +127 -0
- tinygrad/codegen/quantize.py +67 -0
- tinygrad/device.py +122 -132
- tinygrad/dtype.py +152 -35
- tinygrad/engine/jit.py +81 -54
- tinygrad/engine/memory.py +46 -27
- tinygrad/engine/realize.py +82 -41
- tinygrad/engine/schedule.py +70 -445
- tinygrad/frontend/__init__.py +0 -0
- tinygrad/frontend/onnx.py +1253 -0
- tinygrad/frontend/torch.py +5 -0
- tinygrad/gradient.py +19 -27
- tinygrad/helpers.py +95 -47
- tinygrad/nn/__init__.py +7 -8
- tinygrad/nn/optim.py +72 -41
- tinygrad/nn/state.py +37 -23
- tinygrad/renderer/__init__.py +40 -60
- tinygrad/renderer/cstyle.py +143 -128
- tinygrad/renderer/llvmir.py +113 -62
- tinygrad/renderer/ptx.py +50 -32
- tinygrad/renderer/wgsl.py +27 -23
- tinygrad/runtime/autogen/am/am.py +5861 -0
- tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
- tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
- tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
- tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
- tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
- tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
- tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
- tinygrad/runtime/autogen/comgr.py +35 -9
- tinygrad/runtime/autogen/comgr_3.py +906 -0
- tinygrad/runtime/autogen/cuda.py +2419 -494
- tinygrad/runtime/autogen/hsa.py +57 -16
- tinygrad/runtime/autogen/ib.py +7171 -0
- tinygrad/runtime/autogen/io_uring.py +917 -118
- tinygrad/runtime/autogen/kfd.py +748 -26
- tinygrad/runtime/autogen/libc.py +613 -218
- tinygrad/runtime/autogen/libusb.py +1643 -0
- tinygrad/runtime/autogen/nv/nv.py +8602 -0
- tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
- tinygrad/runtime/autogen/opencl.py +2 -4
- tinygrad/runtime/autogen/sqtt.py +1789 -0
- tinygrad/runtime/autogen/vfio.py +3 -3
- tinygrad/runtime/autogen/webgpu.py +273 -264
- tinygrad/runtime/graph/cuda.py +3 -3
- tinygrad/runtime/graph/hcq.py +68 -29
- tinygrad/runtime/graph/metal.py +29 -13
- tinygrad/runtime/graph/remote.py +114 -0
- tinygrad/runtime/ops_amd.py +537 -320
- tinygrad/runtime/ops_cpu.py +108 -7
- tinygrad/runtime/ops_cuda.py +12 -14
- tinygrad/runtime/ops_disk.py +13 -10
- tinygrad/runtime/ops_dsp.py +47 -40
- tinygrad/runtime/ops_gpu.py +13 -11
- tinygrad/runtime/ops_hip.py +6 -9
- tinygrad/runtime/ops_llvm.py +35 -15
- tinygrad/runtime/ops_metal.py +29 -19
- tinygrad/runtime/ops_npy.py +5 -3
- tinygrad/runtime/ops_null.py +28 -0
- tinygrad/runtime/ops_nv.py +306 -234
- tinygrad/runtime/ops_python.py +62 -52
- tinygrad/runtime/ops_qcom.py +28 -39
- tinygrad/runtime/ops_remote.py +482 -0
- tinygrad/runtime/ops_webgpu.py +28 -28
- tinygrad/runtime/support/am/amdev.py +114 -249
- tinygrad/runtime/support/am/ip.py +211 -172
- tinygrad/runtime/support/amd.py +138 -0
- tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
- tinygrad/runtime/support/compiler_cuda.py +8 -11
- tinygrad/runtime/support/elf.py +2 -1
- tinygrad/runtime/support/hcq.py +184 -97
- tinygrad/runtime/support/ib.py +172 -0
- tinygrad/runtime/support/llvm.py +3 -4
- tinygrad/runtime/support/memory.py +251 -0
- tinygrad/runtime/support/nv/__init__.py +0 -0
- tinygrad/runtime/support/nv/ip.py +581 -0
- tinygrad/runtime/support/nv/nvdev.py +183 -0
- tinygrad/runtime/support/system.py +170 -0
- tinygrad/runtime/support/usb.py +268 -0
- tinygrad/runtime/support/webgpu.py +18 -0
- tinygrad/schedule/__init__.py +0 -0
- tinygrad/schedule/grouper.py +119 -0
- tinygrad/schedule/kernelize.py +368 -0
- tinygrad/schedule/multi.py +231 -0
- tinygrad/shape/shapetracker.py +40 -46
- tinygrad/shape/view.py +88 -52
- tinygrad/tensor.py +968 -542
- tinygrad/uop/__init__.py +117 -0
- tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
- tinygrad/uop/mathtraits.py +169 -0
- tinygrad/uop/ops.py +1021 -0
- tinygrad/uop/spec.py +228 -0
- tinygrad/{codegen → uop}/symbolic.py +239 -216
- tinygrad/uop/upat.py +163 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
- tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
- tinygrad/viz/index.html +203 -403
- tinygrad/viz/js/index.js +718 -0
- tinygrad/viz/js/worker.js +29 -0
- tinygrad/viz/serve.py +224 -102
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
- tinygrad-0.11.0.dist-info/RECORD +141 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/kernel.py +0 -693
- tinygrad/engine/multi.py +0 -161
- tinygrad/ops.py +0 -1003
- tinygrad/runtime/ops_cloud.py +0 -220
- tinygrad/runtime/support/allocator.py +0 -94
- tinygrad/spec.py +0 -155
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
- tinygrad/viz/perfetto.html +0 -178
- tinygrad-0.10.2.dist-info/RECORD +0 -99
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
@@ -1,18 +1,20 @@
 from __future__ import annotations
-import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
+import os, ctypes, contextlib, re, functools, mmap, struct, array, sys, weakref
 assert sys.platform != 'win32'
-from typing import
+from typing import cast, ClassVar
 from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
-from tinygrad.runtime.support.hcq import
-from tinygrad.ops import sint
-from tinygrad.device import BufferSpec
-from tinygrad.helpers import getenv, mv_address,
+from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
+from tinygrad.uop.ops import sint
+from tinygrad.device import BufferSpec
+from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, suppress_finalizing
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
-from tinygrad.runtime.autogen import nv_gpu
+from tinygrad.runtime.autogen import nv_gpu, pci
 from tinygrad.runtime.support.elf import elf_loader
+from tinygrad.runtime.support.nv.nvdev import NVDev, NVMemoryManager
+from tinygrad.runtime.support.system import System, PCIIfaceBase, MAP_FIXED
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import

 def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
@@ -20,33 +22,11 @@ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status
 NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
 NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

-def nv_iowr(fd:
+def nv_iowr(fd:FileIOInterface, nr, args):
   ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

-def
-  made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
-    pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
-  nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-  if made.status != 0:
-    if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
-    raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
-  return made
-
-def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
-  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
-    params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
-  nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-  if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
-  return params
-
-def make_rmctrl_type():
-  return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
-    for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", \
-    getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
-rmctrl = make_rmctrl_type()
-
-def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+def uvm_ioctl(cmd, sttyp, fd:FileIOInterface, **kwargs):
   ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
@@ -57,28 +37,41 @@ def make_uvm_type():
     for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
 uvm = make_uvm_type()

-
-  fields:
-  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
-  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
-  bits = sorted(bits, key=lambda x: x[1][1])
-  for i,(name, data) in enumerate(bits):
-    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
-    fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
-    if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
-      fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
-  return init_c_struct_t(tuple(fields))
-qmd_struct_t = make_qmd_struct_type()
-assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
-
-class NVSignal(HCQSignal):
-  def __init__(self, base_addr:int|None=None, **kwargs):
-    super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+class QMD:
+  fields: dict[str, dict[str, tuple[int, int]]] = {}

-  def
-
+  def __init__(self, dev:NVDevice, addr:int|None=None, **kwargs):
+    self.ver, self.sz = (5, 0x60) if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else (3, 0x40)
+
+    # Init fields from module
+    if (pref:="NVCEC0_QMDV05_00" if self.ver == 5 else "NVC6C0_QMDV03_00") not in QMD.fields:
+      QMD.fields[pref] = {**{name[len(pref)+1:]: dt for name,dt in nv_gpu.__dict__.items() if name.startswith(pref) and isinstance(dt, tuple)},
+        **{name[len(pref)+1:]+f"_{i}": dt(i) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith(pref) and callable(dt)}}
+
+    self.mv, self.pref = (memoryview(bytearray(self.sz * 4)) if addr is None else to_mv(addr, self.sz * 4)), pref
+    if kwargs: self.write(**kwargs)
+
+  def _rw_bits(self, hi:int, lo:int, value:int|None=None):
+    mask = ((1 << (width:=hi - lo + 1)) - 1) << (lo % 8)
+    num = int.from_bytes(self.mv[lo//8:hi//8+1], "little")

-
+    if value is None: return (num & mask) >> (lo % 8)
+
+    if value >= (1 << width): raise ValueError(f"{value:#x} does not fit.")
+    self.mv[lo//8:hi//8+1] = int((num & ~mask) | ((value << (lo % 8)) & mask)).to_bytes((hi//8 - lo//8 + 1), "little")
+
+  def write(self, **kwargs):
+    for k,val in kwargs.items(): self._rw_bits(*QMD.fields[self.pref][k.upper()], value=val) # type: ignore [misc]
+
+  def read(self, k, val=0): return self._rw_bits(*QMD.fields[self.pref][k.upper()])
+
+  def field_offset(self, k): return QMD.fields[self.pref][k.upper()][1] // 8
+
+  def set_constant_buf_addr(self, i, addr):
+    if self.ver < 4: self.write(**{f'constant_buffer_addr_upper_{i}':hi32(addr), f'constant_buffer_addr_lower_{i}':lo32(addr)})
+    else: self.write(**{f'constant_buffer_addr_upper_shifted6_{i}':hi32(addr >> 6), f'constant_buffer_addr_lower_shifted6_{i}':lo32(addr >> 6)})
+
+class NVCommandQueue(HWQueue[HCQSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
   def __init__(self):
     self.active_qmd = None
     super().__init__()
@@ -97,17 +90,17 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
     if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
     return self

-  def wait(self, signal:
+  def wait(self, signal:HCQSignal, value:sint=0):
     self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
     self.active_qmd = None
     return self

-  def timestamp(self, signal:
+  def timestamp(self, signal:HCQSignal): return self.signal(signal, 0)

   def bind(self, dev:NVDevice):
     self.binded_device = dev
     self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
-    hw_view =
+    hw_view = self.hw_page.cpu_view().view(fmt='I')
     for i, value in enumerate(self._q): hw_view[i] = value

     # From now on, the queue is on the device for faster submission.
@@ -123,48 +116,48 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
     gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count

-
+    System.memory_barrier()
     dev.gpu_mmio[0x90 // 4] = gpfifo.token
     gpfifo.put_value += 1

 class NVComputeQueue(NVCommandQueue):
   def memory_barrier(self):
     self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
-    self.active_qmd = None
+    self.active_qmd:QMD|None = None
     return self

   def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
     self.bind_args_state(args_state)

-
-
+    qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
+    qmd_buf.cpu_view().view(size=prg.qmd.mv.nbytes, fmt='B')[:] = prg.qmd.mv
+    assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"

-    qmd =
+    qmd = QMD(dev=prg.dev, addr=cast(int, qmd_buf.va_addr)) # Save qmd for later update

-    self.
-    self.
-    self.
-    qmd.
+    self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width' if qmd.ver<4 else 'grid_width'))
+    self.bind_sints_to_mem(*(local_size[:2]), mem=qmd_buf.cpu_view(), fmt='H', offset=qmd.field_offset('cta_thread_dimension0'))
+    self.bind_sints_to_mem(local_size[2], mem=qmd_buf.cpu_view(), fmt='B', offset=qmd.field_offset('cta_thread_dimension2'))
+    qmd.set_constant_buf_addr(0, args_state.buf.va_addr)

     if self.active_qmd is None:
-      self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A,
+      self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
       self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
     else:
-      self.active_qmd.dependent_qmd0_pointer
-      self.active_qmd.dependent_qmd0_action = 1
-      self.active_qmd.dependent_qmd0_prefetch = 1
-      self.active_qmd.dependent_qmd0_enable = 1
+      self.active_qmd.write(dependent_qmd0_pointer=qmd_buf.va_addr >> 8, dependent_qmd0_action=1, dependent_qmd0_prefetch=1, dependent_qmd0_enable=1)

-    self.active_qmd = qmd
+    self.active_qmd, self.active_qmd_buf = qmd, qmd_buf
     return self

-  def signal(self, signal:
+  def signal(self, signal:HCQSignal, value:sint=0):
     if self.active_qmd is not None:
       for i in range(2):
-        if
-
-        self.
-
+        if self.active_qmd.read(f'release{i}_enable') == 0:
+          self.active_qmd.write(**{f'release{i}_enable': 1})
+          self.bind_sints_to_mem(signal.value_addr, mem=self.active_qmd_buf.cpu_view(), fmt='Q', mask=0xfffffffff,
+            offset=self.active_qmd.field_offset(f'release{i}_address_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_addr_lower'))
+          self.bind_sints_to_mem(value, mem=self.active_qmd_buf.cpu_view(), fmt='Q',
+            offset=self.active_qmd.field_offset(f'release{i}_payload_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_payload_lower'))
          return self

     self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
@@ -177,12 +170,13 @@ class NVComputeQueue(NVCommandQueue):

 class NVCopyQueue(NVCommandQueue):
   def copy(self, dest:sint, src:sint, copy_size:int):
-
-
-
+    for off in range(0, copy_size, step:=(1 << 31)):
+      self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src+off), *data64(dest+off))
+      self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, min(copy_size-off, step))
+      self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
     return self

-  def signal(self, signal:
+  def signal(self, signal:HCQSignal, value:sint=0):
     self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
     self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
     return self
@@ -190,31 +184,34 @@ class NVCopyQueue(NVCommandQueue):
   def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)

 class NVArgsState(CLikeArgsState):
-  def __init__(self,
+  def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
     if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
-    super().__init__(
+    super().__init__(buf, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

 class NVProgram(HCQProgram):
   def __init__(self, dev:NVDevice, name:str, lib:bytes):
     self.dev, self.name, self.lib = dev, name, lib

+    # For MOCKGPU, the lib is PTX code, so some values are emulated.
+    cbuf0_size = 0 if not MOCKGPU else 0x160
+
     if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
     else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

     # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
+    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, buf_spec:=BufferSpec(cpu_access=True))

     self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
     self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
     for sh in sections:
       if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
-      if sh.name == f".text.{self.name}":
-        self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16)
+      if sh.name == f".text.{self.name}": self.prog_addr, self.prog_sz = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size
       elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
-      elif sh.name
-        for
-
-
+      elif sh.name.startswith(".nv.info"):
+        for typ, param, data in self._parse_elf_info(sh):
+          if sh.name == f".nv.info.{name}" and param == 0xa: cbuf0_size = struct.unpack_from("IH", data)[1] # EIATTR_PARAM_CBANK
+          elif sh.name == ".nv.info" and param == 0x12: self.lcmem_usage = struct.unpack_from("II", data)[1] + 0x240 # EIATTR_MIN_STACK_SIZE
+          elif sh.name == ".nv.info" and param == 0x2f: self.regs_usage = struct.unpack_from("II", data)[1] # EIATTR_REGCOUNT

     # Ensure device has enough local memory to run the program
     self.dev._ensure_has_local_memory(self.lcmem_usage)
@@ -229,33 +226,44 @@ class NVProgram(HCQProgram):

     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

-    self.constbuffer_0 = [0] *
-
+    self.constbuffer_0 = [0] * (cbuf0_size // 4)
+
+    if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A:
+      self.constbuffer_0[188:192], self.constbuffer_0[223] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window)], 0xfffdc0
+      qmd = {'qmd_major_version':5, 'qmd_type':nv_gpu.NVCEC0_QMDV05_00_QMD_TYPE_GRID_CTA, 'register_count':self.regs_usage,
+        'program_address_upper_shifted4':hi32(self.prog_addr>>4), 'program_address_lower_shifted4':lo32(self.prog_addr>>4),
+        'shared_memory_size_shifted7':self.shmem_usage>>7, 'shader_local_memory_high_size_shifted4':self.dev.slm_per_thread>>4}
+    else:
+      self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
+      qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'shader_local_memory_high_size':self.dev.slm_per_thread,
+        'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr), 'shared_memory_size':self.shmem_usage,
+        'register_count_v':self.regs_usage}

     smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
-
-
-
-
-
-
-
-      program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
+
+    self.qmd:QMD = QMD(dev, **qmd, qmd_group_id=0x3f, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+      invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, barrier_count=1,
+      cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, constant_buffer_invalidate_0=1,
+      min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg, max_sm_config_shared_mem_size=0x1a,
+      program_prefetch_size=min(self.prog_sz>>8, 0x1ff), sass_version=dev.sass_version,
+      program_prefetch_addr_upper_shifted=self.prog_addr>>40, program_prefetch_addr_lower_shifted=self.prog_addr>>8)

     for i,(addr,sz) in self.constbufs.items():
-      self.qmd.
-      self.qmd.
-      self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
-      self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
+      self.qmd.set_constant_buf_addr(i, addr)
+      self.qmd.write(**{f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})

     # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
     self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

-    # NV's kernargs is constbuffer
+    # NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
     super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
+    weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)

-  def
-
+  def _parse_elf_info(self, sh, start_off=0):
+    while start_off < sh.header.sh_size:
+      typ, param, sz = struct.unpack_from("BBH", sh.content, start_off)
+      yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz
+      start_off += (sz if typ == 0x4 else 0) + 4

   def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
     if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
@@ -266,31 +274,28 @@ class NVProgram(HCQProgram):

 class NVAllocator(HCQAllocator['NVDevice']):
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
-
-    return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")
+    return self.dev.iface.alloc(size, cpu_access=options.cpu_access, host=options.host)

+  @suppress_finalizing
   def _free(self, opaque:HCQBuffer, options:BufferSpec):
     self.dev.synchronize()
-    self.dev.
+    self.dev.iface.free(opaque)

-  def
+  def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)

 @dataclass
 class GPFifo:
-  ring:
+  ring: MMIOInterface
   controls: nv_gpu.AmpereAControlGPFifo
   entries_count: int
   token: int
   put_value: int = 0

-
-class NVDevice(HCQCompiled[NVSignal]):
+class NVKIface:
   root = None
-  fd_ctl:
-  fd_uvm:
-  gpus_info:
-  signals_page: Any = None
-  signals_pool: list[int] = []
+  fd_ctl: FileIOInterface
+  fd_uvm: FileIOInterface
+  gpus_info: list|ctypes.Array = []

   # TODO: Need a proper allocator for va addresses
   # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
@@ -299,34 +304,98 @@ class NVDevice(HCQCompiled[NVSignal]):
   uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
   host_object_enumerator: int = 0x1000

+  def __init__(self, dev, device_id):
+    if NVKIface.root is None:
+      NVKIface.fd_ctl = FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+      NVKIface.fd_uvm = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+      self.fd_uvm_2 = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+      NVKIface.root = self.rm_alloc(0, nv_gpu.NV01_ROOT_CLIENT, None, root=0)
+      uvm.initialize(self.fd_uvm)
+      with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
+
+      nv_iowr(NVKIface.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      NVKIface.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
+
+    self.dev, self.device_id = dev, device_id
+    if self.device_id >= len(NVKIface.gpus_info) or not NVKIface.gpus_info[self.device_id].valid:
+      raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
+
+    self.fd_dev = self._new_gpu_fd()
+    self.gpu_info = self.rm_control(self.root, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2,
+      nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVKIface.gpus_info[self.device_id].gpu_id))
+    self.gpu_minor = NVKIface.gpus_info[self.device_id].minor_number
+    self.gpu_instance = self.gpu_info.deviceInstance
+
+  def rm_alloc(self, parent, clss, params=None, root=None) -> int:
+    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_ALLOC, made:=nv_gpu.NVOS21_PARAMETERS(hRoot=root if root is not None else self.root,
+      hObjectParent=parent, hClass=clss, pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None))
+    if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
+    if made.status != 0: raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
+    return made.hObjectNew
+
+  def rm_control(self, obj, cmd, params=None):
+    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_CONTROL, made:=nv_gpu.NVOS54_PARAMETERS(hClient=self.root, hObject=obj, cmd=cmd,
+      paramsSize=ctypes.sizeof(params), params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None))
+    if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
+    return params
+
+  def setup_usermode(self):
+    clsinfo = self.rm_control(self.dev.nvdevice, nv_gpu.NV0080_CTRL_CMD_GPU_GET_CLASSLIST, nv_gpu.NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS(numClasses=100,
+      classList=mv_address(classlist:=memoryview(bytearray(100 * 4)).cast('I'))))
+    self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
+    self.usermode_class:int = next(c for c in [nv_gpu.HOPPER_USERMODE_A, nv_gpu.TURING_USERMODE_A] if c in self.nvclasses)
+    self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
+    self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
+    self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
+
+    usermode = self.rm_alloc(self.dev.subdevice, self.usermode_class)
+    return usermode, MMIOInterface(self._gpu_map_to_cpu(usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
+
+  def setup_vm(self, vaspace):
+    self.rm_control(self.dev.subdevice, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, raw_uuid:=nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(
+      flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16))
+    self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
+
+    uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
+    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)
+
+    for dev in cast(list[NVDevice], [d for pg in HCQCompiled.peer_groups.values() for d in pg if isinstance(d, NVDevice) and not d.is_nvd()]):
+      try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.iface.gpu_uuid)
+      except RuntimeError as e: raise RuntimeError(f"{e}. Make sure GPUs #{self.gpu_minor} & #{dev.iface.gpu_minor} have P2P enabled.") from e
+
+  def setup_gpfifo_vm(self, gpfifo):
+    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
+      hChannel=gpfifo, base=self._alloc_gpu_vaddr(0x4000000, force_low=True), length=0x4000000)
+
   def _new_gpu_fd(self):
-    fd_dev =
+    fd_dev = FileIOInterface(f"/dev/nvidia{NVKIface.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
     nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
     return fd_dev

   def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
-    fd_dev = self._new_gpu_fd() if not system else
+    fd_dev = self._new_gpu_fd() if not system else FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
     made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
-      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
+      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.dev.nvdevice, hMemory=memory_handle, length=size, flags=flags))
     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
     return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)

-  def
+  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, cpu_addr=None) -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
     page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
     size = round_up(size, page_size)
     va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)

     if host:
-      va_addr =
+      va_addr = cpu_addr or FileIOInterface.anon_mmap(va_addr, size, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, 0)

       flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
         | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)

-
-      made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
-        hObjectNew=
+      NVKIface.host_object_enumerator += 1
+      made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.dev.nvdevice, flags=flags,
+        hObjectNew=NVKIface.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
       nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)

       if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
@@ -344,169 +413,171 @@ class NVDevice(HCQCompiled[NVSignal]):
       alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
       alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
         type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
-      mem_handle = rm_alloc(self.
+      mem_handle = self.rm_alloc(self.dev.nvdevice, alloc_func, alloc_params)

       if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)

-    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host)

-  def
-    if mem.meta.hMemory >
-      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
+  def free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVKIface.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.dev.nvdevice, hObjectOld=mem.meta.hMemory)
       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")

-    self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
     uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
-    if mem.meta.has_cpu_mapping:
+    if mem.meta.has_cpu_mapping: FileIOInterface.munmap(cast(int, mem.va_addr), mem.size)

-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

     # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
-    self._debug_mappings[(va_base, size)] = tag
     return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
-      hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
-
+      hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid],
+      has_cpu_mapping=has_cpu_mapping), view=MMIOInterface(va_base, size, fmt='B') if has_cpu_mapping else None, owner=self.dev)

-  def
-    if
-
-
+  def map(self, mem:HCQBuffer):
+    if mem.owner is not None and mem.owner._is_cpu():
+      if not any(x.device.startswith("NV") for x in mem.mapped_devs): return self.alloc(mem.size, host=True, cpu_addr=mem.va_addr)
+      mem = mem.mappings[next(x for x in mem.mapped_devs if x.device.startswith("NV"))]
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False)

   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
-    return
+    return NVKIface.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVKIface.uvm_vaddr_allocator.alloc(size, alignment)

-
-
-    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
-    self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
-    self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
+class PCIIface(PCIIfaceBase):
+  gpus:ClassVar[list[str]] = []

-  def __init__(self,
-
-
-
-      self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
-      NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
-      uvm.initialize(self.fd_uvm)
-      with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
+  def __init__(self, dev, dev_id):
+    super().__init__(dev, dev_id, vendor=0x10de, devices=[0x2204, 0x2684, 0x2b85], bars=[0, 1], vram_bar=1,
+      va_start=NVMemoryManager.va_allocator.base, va_size=NVMemoryManager.va_allocator.size)
+    System.reserve_hugepages(64)

-
-
-
+    self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)
+    self.dev_impl:NVDev = NVDev(self.pci_dev.pcibus, self.pci_dev.map_bar(0, fmt='I'), self.pci_dev.map_bar(1),
+      self.pci_dev.read_config(pci.PCI_VENDOR_ID, 4), self.pci_dev.read_config(pci.PCI_SUBSYSTEM_VENDOR_ID, 4),
+      self.pci_dev.read_config(pci.PCI_REVISION_ID, 1), self.pci_dev.bar_info)
+    self.root, self.gpu_instance, self.p2p_base_addr = 0xc1000000, 0, self.pci_dev.bar_info[1][0]
+    self.rm_alloc(0, nv_gpu.NV01_ROOT, nv_gpu.NV0000_ALLOC_PARAMETERS())

-
+    # Setup classes for the GPU
+    self.gpfifo_class, self.compute_class, self.dma_class = (gsp:=self.dev_impl.gsp).gpfifo_class, gsp.compute_class, gsp.dma_class

-
-
+  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
+    # Force use of huge pages for large allocations. NVDev will attempt to use huge pages in any case,
+    # but if the size is not aligned, the tail will be allocated with 4KB pages, increasing TLB pressure.
+    page_size = (2 << 20) if size >= (8 << 20) and not uncached and not host else (4 << 10)
+    return super().alloc(round_up(size, page_size), host=host, uncached=uncached, cpu_access=cpu_access, contiguous=contiguous, **kwargs)

-
-
-
+  def setup_usermode(self): return 0xce000000, self.pci_dev.map_bar(bar=0, fmt='I', off=0xbb0000, size=0x10000)
+  def setup_vm(self, vaspace): pass
+  def setup_gpfifo_vm(self, gpfifo): pass

-
-
-    self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
-    self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
-    self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
-    self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
+  def rm_alloc(self, parent, clss, params=None, root=None) -> int: return self.dev_impl.gsp.rpc_rm_alloc(parent, clss, params, self.root)
+  def rm_control(self, obj, cmd, params=None): return self.dev_impl.gsp.rpc_rm_control(obj, cmd, params, self.root)

-
-    self._debug_mappings: dict[tuple[int, int], str] = dict()
+  def device_fini(self): self.dev_impl.fini()

-
-
+class NVDevice(HCQCompiled[HCQSignal]):
+  def is_nvd(self) -> bool: return isinstance(self.iface, PCIIface)

-
-
-
+  def __init__(self, device:str=""):
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    self.iface = self._select_iface(NVKIface, PCIIface)

-
-
+    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.iface.gpu_instance, hClientShare=self.iface.root,
+      vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
+    self.nvdevice = self.iface.rm_alloc(self.iface.root, nv_gpu.NV01_DEVICE_0, device_params)
+    self.subdevice = self.iface.rm_alloc(self.nvdevice, nv_gpu.NV20_SUBDEVICE_0, nv_gpu.NV2080_ALLOC_PARAMETERS())
+    self.usermode, self.gpu_mmio = self.iface.setup_usermode()

-
-
+    self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, nv_gpu.NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff,
+      flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | \
+      (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX))))

-
-
-
+    vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
+      flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
+    vaspace = self.iface.rm_alloc(self.nvdevice, nv_gpu.FERMI_VASPACE_A, vaspace_params)

-
-      NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
-      NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
-    else: self._gpu_map(NVDevice.signals_page)
+    self.iface.setup_vm(vaspace)

     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
-    channel_group = rm_alloc(self.
+    channel_group = self.iface.rm_alloc(self.nvdevice, nv_gpu.KEPLER_CHANNEL_GROUP_A, channel_params)

-    gpfifo_area = self.
+    gpfifo_area = self.iface.alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000)

     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
-    ctxshare = rm_alloc(
+    ctxshare = self.iface.rm_alloc(channel_group, nv_gpu.FERMI_CONTEXT_SHARE_A, ctxshare_params)

-    self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000,
-    self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)
+    self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, compute=True)
+    self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000, compute=False)
+    self.iface.rm_control(channel_group, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1))

-
-
-    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_page:HCQBuffer = self.iface.alloc(0x200000, cpu_access=True)
     self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
-    self.cmdq
+    self.cmdq = MMIOInterface(cast(int, self.cmdq_page.va_addr), 0x200000, fmt='I')

     self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
       'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
-
+
+    # FIXME: no idea how to convert this for blackwells
+    self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
+    self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)

     compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
     super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
-      functools.partial(NVProgram, self),
+      functools.partial(NVProgram, self), HCQSignal, NVComputeQueue, NVCopyQueue)

     self._setup_gpfifos()

-  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400,
-    notifier = self.
+  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, compute=False) -> GPFifo:
+    notifier = self.iface.alloc(48 << 20, uncached=True)
     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
       hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
-    gpfifo =
-    comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
-    rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+    gpfifo = self.iface.rm_alloc(channel_group, self.iface.gpfifo_class, params)

-    if
-      self.debug_compute_obj, self.debug_channel =
-      debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
-      self.debugger = rm_alloc(self.
+    if compute:
+      self.debug_compute_obj, self.debug_channel = self.iface.rm_alloc(gpfifo, self.iface.compute_class), gpfifo
+      debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.iface.root, hClass3dObject=self.debug_compute_obj)
+      self.debugger = self.iface.rm_alloc(self.nvdevice, nv_gpu.GT200_DEBUGGER, debugger_params)
+    else: self.iface.rm_alloc(gpfifo, self.iface.dma_class)

-    ws_token_params =
-
-
-    channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
-    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
-      hChannel=gpfifo, base=channel_base, length=0x4000000)
+    ws_token_params = self.iface.rm_control(gpfifo, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN,
+      nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1))
+    self.iface.setup_gpfifo_vm(gpfifo)

-    return GPFifo(ring=
+    return GPFifo(ring=MMIOInterface(gpfifo_area.va_addr + offset, entries*8, fmt='Q'), entries_count=entries, token=ws_token_params.workSubmitToken,
       controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))

   def _query_gpu_info(self, *reqs):
-    nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(),None)) for r in reqs]
+    nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(), None)) for r in reqs]
+
+    if self.is_nvd():
+      x = self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_INTERNAL_STATIC_KGR_GET_INFO,
+        nv_gpu.NV2080_CTRL_INTERNAL_STATIC_GR_GET_INFO_PARAMS())
+      return [x.engineInfo[0].infoList[nvr].data for nvr in nvrs]
+
     infos = (nv_gpu.NV2080_CTRL_GR_INFO*len(nvrs))(*[nv_gpu.NV2080_CTRL_GR_INFO(index=nvr) for nvr in nvrs])
-
+    self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_GR_GET_INFO,
+      nv_gpu.NV2080_CTRL_GR_GET_INFO_PARAMS(grInfoListSize=len(infos), grInfoList=ctypes.addressof(infos)))
     return [x.data for x in infos]

   def _setup_gpfifos(self):
+    self.slm_per_thread, self.shader_local_mem = 0, None
+
     # Set windows addresses to not collide with other allocated buffers.
-    self.shared_mem_window, self.local_mem_window
+    self.shared_mem_window, self.local_mem_window = 0x729400000000, 0x729300000000

-    NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
-      .signal(self.timeline_signal, self.
+    NVComputeQueue().setup(compute_class=self.iface.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+      .signal(self.timeline_signal, self.next_timeline()).submit(self)

-
-
-
+    NVCopyQueue().wait(self.timeline_signal, self.timeline_value - 1) \
+      .setup(copy_class=self.iface.dma_class) \
+      .signal(self.timeline_signal, self.next_timeline()).submit(self)

-    self.
+    self.synchronize()

   def _ensure_has_local_memory(self, required):
     if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return
@@ -520,30 +591,31 @@ class NVDevice(HCQCompiled[NVSignal]):

     cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
       .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
-      .signal(self.timeline_signal, self.
-    self.timeline_value += 1
+      .signal(self.timeline_signal, self.next_timeline()).submit(self)

   def invalidate_caches(self):
-
-
-
+    if self.is_nvd(): self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_INTERNAL_BUS_FLUSH_WITH_SYSMEMBAR, None)
+    else:
+      self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_FB_FLUSH_GPU_CACHE, nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_PARAMS(
+        flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
+        (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4))))

   def on_device_hang(self):
     # Prepare fault report.
     # TODO: Restore the GPU using NV83DE_CTRL_CMD_CLEAR_ALL_SM_ERROR_STATES if needed.

     report = []
-    sm_errors =
+    sm_errors = self.iface.rm_control(self.debugger, nv_gpu.NV83DE_CTRL_CMD_DEBUG_READ_ALL_SM_ERROR_STATES,
+      nv_gpu.NV83DE_CTRL_DEBUG_READ_ALL_SM_ERROR_STATES_PARAMS(hTargetChannel=self.debug_channel, numSMsToRead=100))

     if sm_errors.mmuFault.valid:
-
-
-
+      mmu = self.iface.rm_control(self.debugger, nv_gpu.NV83DE_CTRL_CMD_DEBUG_READ_MMU_FAULT_INFO,
+        nv_gpu.NV83DE_CTRL_DEBUG_READ_MMU_FAULT_INFO_PARAMS())
+      for i in range(mmu.count):
+        pfinfo = mmu.mmuFaultInfoList[i]
         report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
-      if DEBUG >= 5:
-        report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]
     else:
       for i, e in enumerate(sm_errors.smErrorStateArray):
-        if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
+        if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr:#x} warp_pc={e.hwwWarpEsrPc64:#x}"]

     raise RuntimeError("\n".join(report))