tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +11 -6
- tinygrad/codegen/kernel.py +308 -175
- tinygrad/codegen/linearize.py +95 -0
- tinygrad/codegen/lowerer.py +143 -0
- tinygrad/codegen/transcendental.py +257 -0
- tinygrad/codegen/uopgraph.py +506 -0
- tinygrad/device.py +72 -171
- tinygrad/dtype.py +122 -47
- tinygrad/engine/jit.py +184 -87
- tinygrad/{lazy.py → engine/lazy.py} +74 -66
- tinygrad/engine/memory.py +51 -0
- tinygrad/engine/realize.py +86 -61
- tinygrad/engine/schedule.py +366 -317
- tinygrad/engine/search.py +58 -47
- tinygrad/function.py +59 -58
- tinygrad/helpers.py +120 -102
- tinygrad/multi.py +82 -78
- tinygrad/nn/__init__.py +116 -67
- tinygrad/nn/datasets.py +12 -5
- tinygrad/nn/optim.py +1 -1
- tinygrad/nn/state.py +91 -6
- tinygrad/ops.py +1126 -143
- tinygrad/renderer/__init__.py +47 -23
- tinygrad/renderer/cstyle.py +338 -265
- tinygrad/renderer/llvmir.py +125 -143
- tinygrad/renderer/ptx.py +225 -0
- tinygrad/runtime/autogen/adreno.py +17904 -0
- tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/io_uring.py +97 -63
- tinygrad/runtime/autogen/kfd.py +60 -47
- tinygrad/runtime/autogen/kgsl.py +1386 -0
- tinygrad/runtime/autogen/libc.py +5462 -0
- tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/autogen/opencl.py +11 -11
- tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
- tinygrad/runtime/graph/clang.py +3 -3
- tinygrad/runtime/graph/cuda.py +11 -15
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +71 -43
- tinygrad/runtime/ops_amd.py +244 -323
- tinygrad/runtime/ops_clang.py +12 -5
- tinygrad/runtime/ops_cloud.py +220 -0
- tinygrad/runtime/ops_cuda.py +42 -99
- tinygrad/runtime/ops_disk.py +25 -26
- tinygrad/runtime/ops_dsp.py +181 -0
- tinygrad/runtime/ops_gpu.py +29 -16
- tinygrad/runtime/ops_hip.py +68 -0
- tinygrad/runtime/ops_llvm.py +15 -10
- tinygrad/runtime/ops_metal.py +147 -64
- tinygrad/runtime/ops_nv.py +356 -397
- tinygrad/runtime/ops_python.py +78 -79
- tinygrad/runtime/ops_qcom.py +405 -0
- tinygrad/runtime/support/__init__.py +0 -0
- tinygrad/runtime/support/compiler_cuda.py +77 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/runtime/support/hcq.py +539 -0
- tinygrad/shape/shapetracker.py +40 -50
- tinygrad/shape/view.py +102 -63
- tinygrad/tensor.py +1109 -365
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
- tinygrad-0.10.0.dist-info/RECORD +77 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad/codegen/uops.py +0 -451
- tinygrad/engine/graph.py +0 -100
- tinygrad/renderer/assembly.py +0 -269
- tinygrad/shape/symbolic.py +0 -327
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
@@ -1,25 +1,24 @@
 from __future__ import annotations
-import os, ctypes, contextlib,
-
+import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
+assert sys.platform != 'win32'
+from typing import Tuple, List, Any, cast, Union, Dict, Type
 from dataclasses import dataclass
-from tinygrad.
-from tinygrad.
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
+from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
+from tinygrad.device import BufferOptions
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
-from tinygrad.runtime.
-
-
-if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
-
-
-
-
-
-
-
-if MOCKGPU:=getenv("MOCKGPU"):
-  import extra.mockgpu.mockgpu # noqa: F401
-  libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
-  libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
+from tinygrad.runtime.autogen import nv_gpu, libc
+from tinygrad.runtime.support.elf import elf_loader
+if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
+if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
+
+def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
+
+NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
+NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}
 
 def nv_iowr(fd, nr, args):
   ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
@@ -27,117 +26,105 @@ def nv_iowr(fd, nr, args):
 
 def rm_alloc(fd, clss, root, parant, params):
   made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
-                                  pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.
+                                  pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
   nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-  if made.status != 0:
+  if made.status != 0:
+    if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
+    raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
   return made
 
-def rm_control(
-  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
-                                  params=ctypes.cast(ctypes.byref(params), ctypes.
+def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
+  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
+                                  params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
   nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-  if made.status != 0: raise RuntimeError(f"rm_control returned {
-  return
+  if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
+  return params
+
+def make_rmctrl_type():
+  return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
+    for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", \
+      getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
+rmctrl = make_rmctrl_type()
 
 def uvm_ioctl(cmd, sttyp, fd, **kwargs):
   ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
-  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {
+  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
   return made
 
 def make_uvm_type():
-
-
-    for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
-  return type("NVUVM", (object, ), fxns)
+  return type("NVUVM", (object,), {name.replace("UVM_", "").lower(): functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
+    for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
 uvm = make_uvm_type()
 
 def make_qmd_struct_type():
-  fields = []
+  fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
   bits = sorted(bits, key=lambda x: x[1][1])
   for i,(name, data) in enumerate(bits):
-    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0:
+    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
     fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
+    if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
+      fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
   return init_c_struct_t(tuple(fields))
 qmd_struct_t = make_qmd_struct_type()
 assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
 
 def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
-def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
-
-class NVCompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
-    cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-    if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-    super().__init__(f"compile_nv_{self.arch}")
-  def compile(self, src:str) -> bytes:
-    cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-    if status != 0:
-      raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
-    return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
-
-class HWQueue:
-  def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
-  def __del__(self):
-    if self.binded_device is not None:
-      self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
-      self.binded_device._gpu_free(self.hw_page)
 
-
-
-
-
-
-  def
-
-  def
-
+class NVSignal(HCQSignal):
+  def __init__(self, value=0, is_timeline=False):
+    self._signal = NVDevice.signals_pool.pop()
+    self.signal_addr = mv_address(self._signal)
+    super().__init__(value)
+  def __del__(self): NVDevice.signals_pool.append(self._signal)
+  def _get_value(self) -> int: return self._signal[0]
+  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
+  def _set_value(self, new_value:int): self._signal[0] = new_value
+
+class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
+  def __del__(self):
+    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
+
+  @hcq_command
+  def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
+    if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
+    if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
+    if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
+    if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
+    if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
+    if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff]
+
+  def _wait(self, signal, value=0):
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
               (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-    return self._mark_command_end()
 
-  def
+  def _update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
+    if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))
 
-  def
-    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
-              (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
-    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-    return self._mark_command_end()
+  def _timestamp(self, signal): return self._signal(signal, 0)
 
-  def
-  def update_wait(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
-    if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
-    return self
-
-  def bind(self, device: NVDevice):
+  def bind(self, device):
     self.binded_device = device
-    self.hw_page = device.
-    hw_view = to_mv(self.hw_page.
+    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self.q): hw_view[i] = value

    # From now on, the queue is on the device for faster submission.
    self.q = hw_view # type: ignore

-  def
-    if
-
-    if dev == self.binded_device: cmdq_addr = self.hw_page.base
+  def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
+    if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
    else:
-      if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.
-        assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.
+      if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
+        assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
              gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
        dev.cmdq_wptr = 0

      dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
-      cmdq_addr = dev.cmdq_page.
+      cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
      dev.cmdq_wptr += len(self.q) * 4

    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
@@ -145,37 +132,26 @@ class HWQueue:
    dev.gpu_mmio[0x90 // 4] = gpfifo.token
    gpfifo.put_value += 1

-class HWComputeQueue
+class NVComputeQueue(NVCommandQueue, HWComputeQueue):
  def __init__(self):
+    self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
    super().__init__()
-    self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}

-  def
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
-    return self._mark_command_end()
+  def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]

-  def
-    ctypes.memmove(qmd_addr:=(
-
-
-    self.
+  def _exec(self, prg, args_state, global_size, local_size):
+    ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+    assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"
+
+    self.cmd_idx_to_qmd[self._cur_cmd_idx()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+    self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+    self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')

    qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
    qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
-    qmd.constant_buffer_addr_lower_0 =
-
-    if
-      qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
-      qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
-      qmd.release0_payload_lower = signal_value & 0xffffffff
-      qmd.release0_payload_upper = signal_value >> 32
-      qmd.release0_enable = 1
-
-    if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
-      self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+    qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
+
+    if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is None:
      self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
      self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
    else:
@@ -183,180 +159,146 @@ class HWComputeQueue(HWQueue):
      prev_qmd.dependent_qmd0_action = 1
      prev_qmd.dependent_qmd0_prefetch = 1
      prev_qmd.dependent_qmd0_enable = 1
-    return self._mark_command_end()

-  def
+  def _update_exec(self, cmd_idx, global_size, local_size):
    # Patch the exec cmd with new launch dims
-    self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
-    self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+    if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+    if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+
+  def _signal(self, signal, value=0):
+    if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
+      for i in range(2):
+        if getattr(prev_qmd, f'release{i}_enable') == 0:
+          setattr(prev_qmd, f'release{i}_enable', 1)
+          setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
+          setattr(prev_qmd, f'release{i}_payload', value)
+          self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
+          self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
+          return
+
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
+              (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
+
+  def _update_signal(self, cmd_idx, signal=None, value=None):
+    if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
+    if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
+    if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)

-  def
+  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)

-class HWCopyQueue
-  def
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *
+class NVCopyQueue(NVCommandQueue, HWCopyQueue):
+  def _copy(self, dest, src, copy_size):
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-    return self._mark_command_end()

-  def
-
+  def _update_copy(self, cmd_idx, dest=None, src=None):
+    if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
+    if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
+
+  def _signal(self, signal, value=0):
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-    return self._mark_command_end()

-  def
-    if signal is not None: self.
-    if value is not None: self.
-
+  def _update_signal(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
+    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+
+  def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)

-
+class NVArgsState(HCQArgsState):
+  def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+    super().__init__(ptr, prg, bufs, vals=vals)

-
-
+    if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
+    kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
+    to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
+    self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
+    self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
+
+  def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+  def update_var(self, index:int, val:int): self.vals[index] = val
+
+class NVProgram(HCQProgram):
  def __init__(self, device:NVDevice, name:str, lib:bytes):
    self.device, self.name, self.lib = device, name, lib
-    if DEBUG >= 6:
-      try:
-        fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-        with open(fn + ".cubin", "wb") as f: f.write(lib)
-        print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
-      except Exception as e: print("failed to disasm cubin", str(e))
-
-    self.rel_info, self.global_init, self.shmem_usage = None, None, 0
-    constant_buffers_data = {}
-
-    if MOCKGPU:
-      self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
-      constant_buffers_data[0] = memoryview(bytearray(0x190))
-    else:
-      _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
-      sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
-      shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
-      for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
-        section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
-        if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
-        elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
-          self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-          self.registers_usage = sh_info >> 24
-        if match := re.match(r'\.nv\.constant(\d+)', section_name):
-          constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-        if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-        elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
-        elif section_name == ".nv.info":
-          section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-          for i in range(sh_size // 12):
-            if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
-              raise RuntimeError("too high local memory")

-
-
+    if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
+    else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

-    # Load program and constant buffers (if any)
    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-    self.
-
-    self.
+    self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
+
+    self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
+    self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
+    for sh in sections:
+      if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
+      if sh.name == f".text.{self.name}":
+        self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16)
+      elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
+      elif sh.name == ".nv.info":
+        for off in range(0, sh.header.sh_size, 12):
+          typ, _, val = struct.unpack_from("III", sh.content, off)
+          if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240
+
+    # Ensure device has enough local memory to run the program
+    self.device._ensure_has_local_memory(self.lcmem_usage)
+
+    # Apply relocs
+    for apply_image_offset, rel_sym_offset, typ, _ in relocs:
+      # These types are CUDA-specific, applying them here
+      if typ == 2: image[apply_image_offset:apply_image_offset+8] = struct.pack('<Q', self.lib_gpu.va_addr + rel_sym_offset) # R_CUDA_64
+      elif typ == 0x38: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) & 0xffffffff)
+      elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
+      else: raise RuntimeError(f"unknown NV reloc {typ}")
+
+    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

    self.constbuffer_0 = [0] * 88
-    self.constbuffer_0[6:12] = [*
+    self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]

-
+    smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
    self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                            invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
-                            cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
-                            shared_memory_size=
-                            max_sm_config_shared_mem_size=0x1a, register_count_v=self.
-                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.
-
-
-
-
-
-
-    self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
-    self.kernargs_offset = 0x160
-
-    # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
-    if 0 in constant_buffers_data: constant_buffers_data.pop(0)
-
-    off = round_up(self.program.nbytes, 128)
-
-    if self.rel_info is not None:
-      assert self.global_init is not None
-      global_init_addr = self.lib_gpu.base + off
-      for rel_i in range(0, len(self.rel_info), 4):
-        if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
-        elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
-        else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
-
-    HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
-    for st in range(0, len(self.program), 4095):
-      HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
-
-    if self.global_init is not None:
-      HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
-      off += round_up(self.global_init.nbytes, 128)
-      if 4 in constant_buffers_data: # >= 12.4
-        # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
-        assert constant_buffers_data[4].nbytes == 8
-        constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
-
-    for i,data in constant_buffers_data.items():
-      self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
-      self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
-      self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
+                            cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
+                            shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
+                            max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
+                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
+                            program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
+
+    for i,(addr,sz) in self.constbufs.items():
+      self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32)
+      self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (addr) & 0xffffffff)
+      self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
      self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

-
-
+    # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
+    self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

-
-    self.device.
-    self.device.synchronize()
+    # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
+    super().__init__(NVArgsState, self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))

  def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.
+    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))

-  def __call__(self, *
-    if prod(local_size) > 1024 or self.max_threads < prod(local_size)
+  def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.device).slm_per_thread:
+      raise RuntimeError("Too many resources requested for launch")
    if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
-      raise RuntimeError("Invalid global/local dims")
+      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
+    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

-
-
-
-
-    if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
-    kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
-
-    sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
-    queue = HWComputeQueue()
-    queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-    if wait or PROFILE: queue.timestamp(sig_st)
-    queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
-    queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
-    if wait or PROFILE: queue.timestamp(sig_en)
-    queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
-    self.device.kernargs_ptr += self.kernargs_alloc_size
-
-    if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
-    if wait:
-      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-      return (sig_en[1] - sig_st[1]) / 1e9
-
-class NVAllocator(HCQCompatAllocator):
-  def __init__(self, device:NVDevice): super().__init__(device)
-
-  def _alloc(self, size:int, options:BufferOptions):
-    if options.host: return self.device._gpu_host_alloc(size)
-    return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
+class NVAllocator(HCQAllocator):
+  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+    if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
+    return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")

  def _free(self, opaque, options:BufferOptions):
    self.device.synchronize()
-
-
+    self.device._gpu_free(opaque)
+
+  def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)

 @dataclass
 class GPFifo:
@@ -367,19 +309,19 @@ class GPFifo:
  put_value: int = 0

 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class NVDevice(
+class NVDevice(HCQCompiled):
  root = None
  fd_ctl: int = -1
  fd_uvm: int = -1
-  gpus_info =
-  signals_page:Any = None
+  gpus_info: Union[List, ctypes.Array] = []
+  signals_page: Any = None
  signals_pool: List[Any] = []
-
+  low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+  uvm_vaddr: int = 0x2000000000 # 0x2000000000+
  host_object_enumerator: int = 0x1000
-  devices: List[NVDevice] = []

  def _new_gpu_fd(self):
-    fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+    fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
    nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
    return fd_dev

@@ -388,10 +330,12 @@ class NVDevice(HCQCompatCompiled):
    made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
      params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
-    if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
-
+    if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
+    res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+    os.close(fd_dev)
+    return res

-  def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
+  def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
    size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
    alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
      attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
@@ -402,11 +346,11 @@ class NVDevice(HCQCompatCompiled):
            nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
    mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew

-    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
+    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
    if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
-    return self._gpu_uvm_map(va_addr, size, mem_handle)
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)

-  def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
+  def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
    alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
      attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
      attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
@@ -414,54 +358,62 @@ class NVDevice(HCQCompatCompiled):
      nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
    mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew

-    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
+    if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
    if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)

-    return self._gpu_uvm_map(va_addr, size, mem_handle)
+    return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)

-  def _gpu_host_alloc(self, size):
-    va_base = self._alloc_gpu_vaddr(
-    libc.mmap(va_base,
-
+  def _gpu_host_alloc(self, size, tag=""):
+    va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
+    mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+    assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"

-  def _gpu_free(self, mem):
-    made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
-    nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
-    if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
-    uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
-
-  def _gpu_host_free(self, mem):
-    uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
-    libc.munmap(mem.base, mem.length)
-
-  def _map_to_gpu(self, va_base, size):
    NVDevice.host_object_enumerator += 1
    flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
            (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
    made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
-      hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=
+      hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
    nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
-    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
-    return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)

-
+    if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
+    return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
+
+  def _gpu_free(self, mem):
+    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
+      if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
+
+    self._debug_mappings.pop((mem.va_addr, mem.size))
+    uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
+    if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
    if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
-
-      nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
+    attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

-    # NOTE: va_addr is set to make rawbufs compatable with
+    # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
+    self._debug_mappings[(va_base, size)] = tag
    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-
+      gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)

  def _gpu_map(self, mem):
-    if self.gpu_uuid in
-    mem.
-
-
-  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
-
+    if self.gpu_uuid in mem.mapped_gpu_ids: return
+    mem.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+
+  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
+    if force_low:
+      NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
+      assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
+    else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
    return res_va

+  def _setup_nvclasses(self):
+    classlist = memoryview(bytearray(100 * 4)).cast('I')
+    clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
+    self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
+    self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
+
  def __init__(self, device:str=""):
    if NVDevice.root is None:
      NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
@@ -471,155 +423,162 @@ class NVDevice(HCQCompatCompiled):
|
|
471
423
|
uvm.initialize(self.fd_uvm)
|
472
424
|
with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
|
473
425
|
|
474
|
-
NVDevice.
|
475
|
-
|
426
|
+
nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
|
427
|
+
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
428
|
+
NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
|
476
429
|
|
477
|
-
# TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
|
478
430
|
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
479
|
-
self.fd_dev = self._new_gpu_fd()
|
480
431
|
|
481
|
-
|
482
|
-
|
483
|
-
rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
|
484
|
-
device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
|
485
|
-
self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A
|
432
|
+
if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
|
433
|
+
raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
|
486
434
|
|
487
|
-
|
435
|
+
self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
|
436
|
+
self.gpu_minor = NVDevice.gpus_info[self.device_id].minor_number
|
437
|
+
self.fd_dev = self._new_gpu_fd()
|
438
|
+
|
439
|
+
device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
|
488
440
|
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
|
489
441
|
self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
|
490
442
|
self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
|
491
443
|
self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
|
492
444
|
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
|
493
445
|
|
494
|
-
|
446
|
+
self._setup_nvclasses()
|
447
|
+
self._debug_mappings: Dict[Tuple[int, int], str] = dict()
|
448
|
+
|
449
|
+
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
|
495
450
|
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
|
496
|
-
rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)
|
497
451
|
|
498
452
|
vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
|
499
453
|
flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
|
500
454
|
vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
|
501
455
|
|
502
|
-
|
503
|
-
|
504
|
-
self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
|
456
|
+
raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
|
457
|
+
self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
|
505
458
|
|
506
|
-
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=
|
507
|
-
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=
|
508
|
-
hClient=self.root, hVaSpace=vaspace)
|
459
|
+
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
|
460
|
+
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
|
509
461
|
|
510
|
-
for dev in self.devices:
|
511
|
-
uvm.enable_peer_access(self.fd_uvm, gpuUuidA=
|
462
|
+
for dev in cast(List[NVDevice], self.devices):
|
463
|
+
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
|
464
|
+
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
|
512
465
|
|
513
466
|
if NVDevice.signals_page is None:
|
514
467
|
NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
|
515
|
-
NVDevice.signals_pool = [to_mv(self.signals_page.
|
468
|
+
NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
|
516
469
|
else: self._gpu_map(NVDevice.signals_page)
|
517
470
|
|
518
471
|
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
|
519
472
|
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
|
520
473
|
|
521
|
-
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
|
474
|
+
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")
|
522
475
|
|
523
476
|
ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
|
524
477
|
ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
|
525
478
|
|
526
|
-
self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
|
479
|
+
self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, enable_debug=True)
|
527
480
|
self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)
|
528
481
|
|
529
|
-
|
530
|
-
rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
|
531
|
-
|
532
|
-
self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
|
482
|
+
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
|
533
483
|
|
534
|
-
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
|
535
|
-
self.cmdq: memoryview = to_mv(self.cmdq_page.
|
484
|
+
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
|
485
|
+
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
|
536
486
|
self.cmdq_wptr: int = 0 # in bytes
|
537
487
|
|
538
|
-
self.
|
539
|
-
|
488
|
+
self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
|
489
|
+
'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
|
490
|
+
self.arch: str = f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
|
540
491
|
|
541
|
-
|
492
|
+
compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
|
493
|
+
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
|
494
|
+
functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
542
495
|
|
543
|
-
|
544
|
-
functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])
|
496
|
+
self._setup_gpfifos()
|
545
497
|
|
546
|
-
|
547
|
-
self.
|
548
|
-
|
549
|
-
|
+  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
+    notifier = self._gpu_system_alloc(48 << 20)
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+      gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+    gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
+    comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
+    rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)

-
-
+    if enable_debug:
+      self.debug_compute_obj, self.debug_channel = comp, gpfifo
+      debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
+      self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.device, debugger_params).hObjectNew

-
-
+    ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
+    assert ws_token_params.workSubmitToken != -1

-
-
+    channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
+    uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+      hChannel=gpfifo, base=channel_base, length=0x4000000)

-
-
-    self._set_signal(sig := self.signals_pool.pop(), value)
-    return sig
+    return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+      controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))

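The userdOffset and GPFifo fields above imply a simple carving of gpfifo_area: `entries` eight-byte ring slots starting at `offset`, immediately followed by the USERD control block (AmpereAControlGPFifo). A small illustrative check of those offsets, mirroring the two calls in __init__ (compute at 0x0, DMA at 0x100000, 0x10000 entries each); this is a sketch, not tinygrad code:

    # Illustrative sketch: ring and USERD control-block offsets inside gpfifo_area.
    GPFIFO_ENTRY_SIZE = 8  # each GPFIFO entry is a 64-bit word

    def gpfifo_layout(offset: int, entries: int):
      ring_start = offset
      userd_start = offset + entries * GPFIFO_ENTRY_SIZE  # matches userdOffset above
      return ring_start, userd_start

    assert gpfifo_layout(0x000000, 0x10000) == (0x000000, 0x080000)  # compute gpfifo
    assert gpfifo_layout(0x100000, 0x10000) == (0x100000, 0x180000)  # dma gpfifo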
-
-
-
-
-
-    raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+  def _query_gpu_info(self, *reqs):
+    nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(),None)) for r in reqs]
+    infos = (nv_gpu.NV2080_CTRL_GR_INFO*len(nvrs))(*[nv_gpu.NV2080_CTRL_GR_INFO(index=nvr) for nvr in nvrs])
+    rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=len(infos), grInfoList=ctypes.addressof(infos))
+    return [x.data for x in infos]

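_query_gpu_info resolves each requested field against two possible constant prefixes in the autogenerated bindings, trying NV2080_CTRL_GR_INFO_INDEX_<NAME> first and falling back to NV2080_CTRL_GR_INFO_INDEX_LITTER_<NAME>. A sketch of that lookup against a stand-in namespace with made-up index values:

    # Sketch of the two-step getattr fallback; FakeNvGpu and its values are made up.
    class FakeNvGpu:
      NV2080_CTRL_GR_INFO_INDEX_SM_VERSION = 0x23       # hypothetical index value
      NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS = 0x01  # hypothetical index value

    def resolve_index(ns, req: str):
      return getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_' + req.upper(),
                     getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_LITTER_' + req.upper(), None))

    assert resolve_index(FakeNvGpu, 'sm_version') == 0x23
    assert resolve_index(FakeNvGpu, 'num_gpcs') == 0x01
    assert resolve_index(FakeNvGpu, 'unknown_field') is None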
-  def
+  def _setup_gpfifos(self):
+    # Set windows addresses to not collide with other allocated buffers.
+    self.shared_mem_window, self.local_mem_window, self.slm_per_thread, self.shader_local_mem = 0xfe000000, 0xff000000, 0, None

-
-
-    self.cmdq_wptr = 0
+    NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+      .signal(self.timeline_signal, self.timeline_value).submit(self)

-
-
+    NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
+      .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+      .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

-
-    notifier = self._gpu_system_alloc(48 << 20)
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
-      gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
-    gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
-    rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
-    rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+    self.timeline_value += 2

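The two setup submissions above are ordered through the timeline signal: the compute queue signals the current timeline_value, the copy queue waits on that value and signals timeline_value + 1, and the host then advances the counter by two. A toy model of that ordering in plain Python (not the HCQ queue API):

    # Toy timeline model: each submission signals a monotonically increasing value.
    timeline_value, signaled = 1, set()

    signaled.add(timeline_value)          # compute queue: .signal(sig, timeline_value)
    assert timeline_value in signaled     # copy queue: .wait(sig, timeline_value) can proceed
    signaled.add(timeline_value + 1)      # copy queue: .signal(sig, timeline_value + 1)
    timeline_value += 2                   # host: the next submission starts from a fresh value

    assert timeline_value - 1 in signaled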
-
-
-    assert ws_token_params.workSubmitToken != -1
+  def _ensure_has_local_memory(self, required):
+    if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

-
-    uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
-      hChannel=gpfifo, base=channel_base, length=0x4000000)
+    if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)

-
-
+    self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
+    bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)

-
-
-
-
-
+    try: self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+    except MemoryError:
+      # If can't allocate a new size, reallocator the old buffer.
+      self.slm_per_thread = old_slm_per_thread
+      bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+      self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))

-
-
-
-    queue = HWComputeQueue()
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
-    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
-    queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+    NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
+      .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+      .signal(self.timeline_signal, self.timeline_value).submit(self)
     self.timeline_value += 1
-    self.synchronize()

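The shader-local-memory sizing above rounds the per-thread requirement up to 32 bytes, multiplies by 32 threads per warp and aligns to 0x200, scales by warps per SM and SMs per TPC with a 0x8000 alignment, then scales by TPC and GPC counts with a final 0x20000 alignment. A worked example with hypothetical GPU geometry (the counts below are illustrative, not queried values):

    # Worked example of the bytes_per_tpc / total sizing; the GPU geometry is hypothetical.
    def round_up(x: int, a: int) -> int: return (x + a - 1) // a * a

    num_gpcs, num_tpc_per_gpc, num_sm_per_tpc, max_warps_per_sm = 7, 6, 2, 48
    required = 0x400                          # requested bytes of local memory per thread

    slm_per_thread = round_up(required, 32)   # 0x400, already 32-byte aligned
    bytes_per_tpc = round_up(round_up(slm_per_thread * 32, 0x200) * max_warps_per_sm * num_sm_per_tpc, 0x8000)
    total = round_up(bytes_per_tpc * num_tpc_per_gpc * num_gpcs, 0x20000)

    assert bytes_per_tpc == 0x300000 and total == 0x7e00000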
-  def
-
-
-
-
-
+  def invalidate_caches(self):
+    rmctrl.fb_flush_gpu_cache(self.fd_ctl, self.root, self.subdevice,
+      flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
+        (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4)))
+
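invalidate_caches composes its flags argument from three sub-fields at fixed bit positions: write-back at bit 2, invalidate at bit 3, and the flush mode starting at bit 4. A sketch of the resulting value under the assumption that each of the three *_YES / *_FULL_CACHE constants equals 1 (the real values come from the autogenerated nv_gpu module):

    # Sketch of the flags composition; the constant values below are assumptions.
    WRITE_BACK_YES = 1         # assumed value of ..._FLAGS_WRITE_BACK_YES
    INVALIDATE_YES = 1         # assumed value of ..._FLAGS_INVALIDATE_YES
    FLUSH_MODE_FULL_CACHE = 1  # assumed value of ..._FLAGS_FLUSH_MODE_FULL_CACHE

    flags = (WRITE_BACK_YES << 2) | (INVALIDATE_YES << 3) | (FLUSH_MODE_FULL_CACHE << 4)
    assert flags == 0b1_1100   # bits 2, 3 and 4 set under these assumptions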
+  def on_device_hang(self):
+    # Prepare fault report.
+    # TODO: Restore the GPU using NV83DE_CTRL_CMD_CLEAR_ALL_SM_ERROR_STATES if needed.
+
+    report = []
+    sm_errors = rmctrl.debug_read_all_sm_error_states(self.fd_ctl, self.root, self.debugger, hTargetChannel=self.debug_channel, numSMsToRead=100)
+
+    if sm_errors.mmuFault.valid:
+      mmu_info = rmctrl.debug_read_mmu_fault_info(self.fd_ctl, self.root, self.debugger)
+      for i in range(mmu_info.count):
+        pfinfo = mmu_info.mmuFaultInfoList[i]
+        report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
+      if DEBUG >= 5:
+        report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]
+    else:
+      for i, e in enumerate(sm_errors.smErrorStateArray):
+        if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
+
+    raise RuntimeError("\n".join(report))
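For a sense of what the raised error contains, here is a sketch of one MMU-fault report line built from made-up fault data; the lookup tables only mirror NV_PFAULT_FAULT_TYPE and NV_PFAULT_ACCESS_TYPE in shape, not in their real entries:

    # Sketch of a single report line; the tables and fault values are made up.
    FAULT_TYPE = {0: "PDE", 1: "PDE_SIZE"}   # placeholder for NV_PFAULT_FAULT_TYPE
    ACCESS_TYPE = {0: "READ", 1: "WRITE"}    # placeholder for NV_PFAULT_ACCESS_TYPE

    fault_address, fault_type, access_type = 0x7f3a_0000, 0, 1
    line = f"MMU fault: 0x{fault_address:X} | {FAULT_TYPE[fault_type]} | {ACCESS_TYPE[access_type]}"
    assert line == "MMU fault: 0x7F3A0000 | PDE | WRITE"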