tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (56)
  1. tinygrad/codegen/kernel.py +248 -115
  2. tinygrad/codegen/lowerer.py +215 -0
  3. tinygrad/codegen/transcendental.py +310 -0
  4. tinygrad/codegen/uopgraph.py +622 -0
  5. tinygrad/codegen/uops.py +235 -393
  6. tinygrad/device.py +428 -69
  7. tinygrad/dtype.py +18 -4
  8. tinygrad/engine/graph.py +19 -32
  9. tinygrad/engine/jit.py +148 -70
  10. tinygrad/engine/realize.py +127 -51
  11. tinygrad/engine/schedule.py +259 -216
  12. tinygrad/engine/search.py +29 -22
  13. tinygrad/function.py +9 -0
  14. tinygrad/helpers.py +87 -49
  15. tinygrad/lazy.py +34 -35
  16. tinygrad/multi.py +41 -36
  17. tinygrad/nn/__init__.py +39 -22
  18. tinygrad/nn/state.py +3 -3
  19. tinygrad/ops.py +63 -62
  20. tinygrad/renderer/__init__.py +43 -21
  21. tinygrad/renderer/assembly.py +104 -106
  22. tinygrad/renderer/cstyle.py +87 -60
  23. tinygrad/renderer/llvmir.py +21 -30
  24. tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
  25. tinygrad/runtime/autogen/cuda.py +6 -162
  26. tinygrad/runtime/autogen/kfd.py +32 -0
  27. tinygrad/runtime/autogen/libc.py +4260 -0
  28. tinygrad/runtime/autogen/nvrtc.py +579 -0
  29. tinygrad/runtime/graph/clang.py +2 -2
  30. tinygrad/runtime/graph/cuda.py +8 -11
  31. tinygrad/runtime/graph/hcq.py +120 -107
  32. tinygrad/runtime/graph/metal.py +18 -15
  33. tinygrad/runtime/ops_amd.py +197 -305
  34. tinygrad/runtime/ops_clang.py +2 -2
  35. tinygrad/runtime/ops_cuda.py +36 -94
  36. tinygrad/runtime/ops_disk.py +3 -7
  37. tinygrad/runtime/ops_gpu.py +4 -2
  38. tinygrad/runtime/ops_hip.py +70 -0
  39. tinygrad/runtime/ops_metal.py +38 -27
  40. tinygrad/runtime/ops_nv.py +283 -363
  41. tinygrad/runtime/ops_python.py +26 -30
  42. tinygrad/runtime/support/compiler_cuda.py +78 -0
  43. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
  44. tinygrad/runtime/support/elf.py +38 -0
  45. tinygrad/shape/shapetracker.py +5 -14
  46. tinygrad/shape/symbolic.py +4 -8
  47. tinygrad/shape/view.py +34 -22
  48. tinygrad/tensor.py +399 -97
  49. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
  50. tinygrad-0.9.2.dist-info/RECORD +70 -0
  51. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
  52. tinygrad/codegen/linearizer.py +0 -528
  53. tinygrad-0.9.1.dist-info/RECORD +0 -63
  54. /tinygrad/runtime/{driver → support}/__init__.py +0 -0
  55. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
  56. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,19 @@
  from __future__ import annotations
- import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
- from typing import Tuple, List, Any
+ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, time, array, decimal
+ from typing import Tuple, List, Any, cast, Union, Dict, Type
  from dataclasses import dataclass
- from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
- from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
+ from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
+   HCQArgsState, HCQProgram, HCQSignal, BufferOptions
+ from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+ from tinygrad.renderer.assembly import PTXRenderer
  from tinygrad.renderer.cstyle import NVRenderer
- from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
- import tinygrad.runtime.autogen.cuda as cuda
- import tinygrad.runtime.autogen.nv_gpu as nv_gpu
- if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
- libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
- libc.munmap.restype = ctypes.c_int
-
- if MOCKGPU:=getenv("MOCKGPU"):
-   import extra.mockgpu.mockgpu # noqa: F401
-   libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
-   libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+ from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler, nv_disassemble
+ from tinygrad.runtime.autogen import nv_gpu, libc
+ from tinygrad.runtime.support.elf import elf_loader
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
+ if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
+
+ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"

  def nv_iowr(fd, nr, args):
    ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
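Note: the request word that nv_iowr assembles by hand is the kernel's _IOWR() ioctl encoding. A minimal standalone sketch of the same packing, assuming the common Linux layout and keeping the 0x1FFF size mask used above (the stock kernel macro masks the size with 0x3FFF):

    import ctypes

    def _IOWR(magic: str, nr: int, args) -> int:
      # dir=3 (read|write) in bits 30-31, payload size in bits 16-29,
      # magic byte in bits 8-15, command number in bits 0-7
      return (3 << 30) | ((ctypes.sizeof(args) & 0x1FFF) << 16) | ((ord(magic) & 0xFF) << 8) | (nr & 0xFF)

    class Demo(ctypes.Structure):
      _fields_ = [("status", ctypes.c_uint32)]

    assert _IOWR('F', 0x2A, Demo) == (3 << 30) | (4 << 16) | (ord('F') << 8) | 0x2A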
@@ -27,117 +21,113 @@ def nv_iowr(fd, nr, args):

  def rm_alloc(fd, clss, root, parant, params):
    made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
-     pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+     pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
    nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-   if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
+   if made.status != 0: raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
    return made

- def rm_control(fd, cmd, client, obj, params):
-   made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
-     params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+ def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
+   made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
+     params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
    nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-   if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
-   return made
+   if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
+   return params
+
+ def make_rmctrl_type():
+   return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
+     for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and
+       (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", getattr(nv_gpu, name+"_PARAMS", None)))})
+ rmctrl = make_rmctrl_type()

  def uvm_ioctl(cmd, sttyp, fd, **kwargs):
    ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
    if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
-   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {made.rmStatus}: {nv_gpu.nv_status_codes.get(made.rmStatus, 'Unknown error')}")
+   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
    return made

  def make_uvm_type():
-   fxns = {name.replace("UVM_", "").lower():
-     functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
-     for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
-   return type("NVUVM", (object, ), fxns)
+   return type("NVUVM", (object,), {name.replace("UVM_", "").lower(): functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
+     for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
  uvm = make_uvm_type()

  def make_qmd_struct_type():
-   fields = []
+   fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
    bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
    bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
    bits = sorted(bits, key=lambda x: x[1][1])
    for i,(name, data) in enumerate(bits):
-     if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
+     if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
      fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
+     if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
+       fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
    return init_c_struct_t(tuple(fields))
  qmd_struct_t = make_qmd_struct_type()
  assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

  def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
- def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
- def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
-
- class NVCompiler(Compiler):
-   def __init__(self, arch:str):
-     self.arch = arch
-     #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
-     cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-     super().__init__(f"compile_nv_{self.arch}")
-   def compile(self, src:str) -> bytes:
-     cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-     status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-     if status != 0:
-       raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
-     return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
-
- class HWQueue:
-   def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
+
+
+ class NVSignal(HCQSignal):
+   def __init__(self, value=0):
+     self._signal = NVDevice.signals_pool.pop()
+     self.signal_addr = mv_address(self._signal)
+     super().__init__(value)
+   def __del__(self): NVDevice.signals_pool.append(self._signal)
+   def _get_value(self) -> int: return self._signal[0]
+   def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
+   def _set_value(self, new_value:int): self._signal[0] = new_value
+   def wait(self, value:int, timeout:int=10000):
+     start_time = time.time() * 1000
+     while time.time() * 1000 - start_time < timeout:
+       if self._signal[0] >= value: return
+     raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+
+ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
    def __del__(self):
      if self.binded_device is not None:
        self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
        self.binded_device._gpu_free(self.hw_page)

-   def _mark_command_end(self):
-     self.cmd_offsets.append(len(self.q))
-     return self
-   def __len__(self): return len(self.cmd_offsets) - 1
-
-   def memory_barrier(self): return self._mark_command_end()
-
-   def wait(self, signal, value=0):
-     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
+   @hcq_command
+   def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
+     if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
+     if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
+     if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
+     if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
+     if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
+     if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0x40]
+
+   def _wait(self, signal, value=0):
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
       (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-     return self._mark_command_end()
-
-   def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)

-   def signal(self, signal, value=0, timestamp=False):
-     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
-       (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
-     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-     return self._mark_command_end()
+   def _update_wait(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
+     if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))

-   def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
-   def update_wait(self, cmd_idx, signal=None, value=None):
-     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
-     if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
-     return self
+   def _timestamp(self, signal): return self._signal(signal, 0)

-   def bind(self, device: NVDevice):
+   def bind(self, device):
      self.binded_device = device
-     self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
-     hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
+     self.hw_page = cast(NVDevice, device)._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
+     hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
      for i, value in enumerate(self.q): hw_view[i] = value

      # From now on, the queue is on the device for faster submission.
      self.q = hw_view # type: ignore

-   def _submit(self, dev, gpfifo:GPFifo):
+   def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
      if len(self.q) == 0: return

-     if dev == self.binded_device: cmdq_addr = self.hw_page.base
+     if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
      else:
-       if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
-         assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
+       if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
+         assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
            gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
          dev.cmdq_wptr = 0

        dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
-       cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
+       cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
        dev.cmdq_wptr += len(self.q) * 4

      gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
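Note: the last line above packs an entire GPFIFO entry into a single 64-bit word. A sketch of the encode plus a matching decode, assuming the bit layout exactly as the expression uses it (a 4-byte-aligned command buffer address in the low bits, a flag at bit 41, dword count from bit 42):

    def pack_gpfifo_entry(cmdq_addr: int, num_dwords: int) -> int:
      # cmdq_addr//4 << 2 reproduces a 4-byte-aligned address in bits 2 and up
      return (cmdq_addr // 4 << 2) | (num_dwords << 42) | (1 << 41)

    def unpack_gpfifo_entry(entry: int) -> tuple:
      # mask off the bit-41 flag and the length field to recover (address, dwords)
      return entry & ((1 << 41) - 1) & ~0b11, entry >> 42

    assert unpack_gpfifo_entry(pack_gpfifo_entry(0x2000, 32)) == (0x2000, 32)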
@@ -145,37 +135,26 @@ class HWQueue:
      dev.gpu_mmio[0x90 // 4] = gpfifo.token
      gpfifo.put_value += 1

- class HWComputeQueue(HWQueue):
+ class NVComputeQueue(NVCommandQueue, HWComputeQueue):
    def __init__(self):
+     self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
      super().__init__()
-     self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}

-   def copy_from_cpu(self, gpuaddr, data):
-     self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
-     self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
-     self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
-     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
-     return self._mark_command_end()
+   def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]

-   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0):
-     ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
-     self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
-     self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
-     self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+   def _exec(self, prg, args_state, global_size, local_size):
+     cmd_idx = len(self) - 1
+
+     ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+     self.cmd_idx_to_qmd[cmd_idx] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+     self.cmd_idx_to_global_dims[cmd_idx] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+     self.cmd_idx_to_local_dims[cmd_idx] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')

      qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
      qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
-     qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
-     qmd.constant_buffer_addr_upper_0 = kernargs >> 32
-     if signal is not None:
-       qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
-       qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
-       qmd.release0_payload_lower = signal_value & 0xffffffff
-       qmd.release0_payload_upper = signal_value >> 32
-       qmd.release0_enable = 1
-
-     if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
-       self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+     qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
+
+     if (prev_qmd:=self.cmd_idx_to_qmd.get(cmd_idx - 1)) is None:
        self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
        self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
      else:
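Note: _exec keeps live memoryview windows over the QMD's launch-dimension words (the to_mv(...) casts above), so _update_exec can later patch launch dims with plain slice writes instead of re-encoding the struct. A self-contained sketch of the pattern using only ctypes; the 0x30 offset is illustrative, not the real NVC6C0_QMDV03_00 bit position:

    import ctypes

    qmd_buf = (ctypes.c_char * 0x100)()  # stand-in for the QMD placed in kernargs memory
    # keep a window over three 32-bit "global dim" words for later in-place patching
    global_dims = (ctypes.c_uint32 * 3).from_address(ctypes.addressof(qmd_buf) + 0x30)
    global_dims[:] = [64, 1, 1]          # a later update is just a slice write
    assert qmd_buf.raw[0x30:0x34] == (64).to_bytes(4, 'little')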
@@ -183,180 +162,145 @@ class HWComputeQueue(HWQueue):
        prev_qmd.dependent_qmd0_action = 1
        prev_qmd.dependent_qmd0_prefetch = 1
        prev_qmd.dependent_qmd0_enable = 1
-     return self._mark_command_end()

-   def update_exec(self, cmd_idx, global_size, local_size):
+   def _update_exec(self, cmd_idx, global_size, local_size):
      # Patch the exec cmd with new launch dims
-     self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
-     self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+     if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+     if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+
+   def _signal(self, signal, value=0):
+     if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 2)) is not None:
+       for i in range(2):
+         if getattr(prev_qmd, f'release{i}_enable') == 0:
+           setattr(prev_qmd, f'release{i}_enable', 1)
+           setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
+           setattr(prev_qmd, f'release{i}_payload', value)
+           self.cmd_idx_to_qmd[len(self) - 1] = prev_qmd
+           self.cmd_idx_to_signal_id[len(self) - 1] = i
+           return
+
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
+       (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
+
+   def _update_signal(self, cmd_idx, signal=None, value=None):
+     if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
+     if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
+     if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)

-   def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)
+   def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)

- class HWCopyQueue(HWQueue):
-   def copy(self, dest, src, copy_size):
-     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
+ class NVCopyQueue(NVCommandQueue, HWCopyQueue):
+   def _copy(self, dest, src, copy_size):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
      self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
      self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-     return self._mark_command_end()

-   def signal(self, signal, value=0):
-     self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
+   def _update_copy(self, cmd_idx, dest=None, src=None):
+     if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
+     if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
+
+   def _signal(self, signal, value=0):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *data64(signal.signal_addr), value, 4]
      self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-     return self._mark_command_end()

-   def update_signal(self, cmd_idx, signal=None, value=None):
-     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
-     if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
-     return self
+   def _update_signal(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
+     if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+
+   def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)

-   def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)
+ class NVArgsState(HCQArgsState):
+   def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+     super().__init__(ptr, prg, bufs, vals=vals)

- SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
- class NVProgram:
+     if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
+     kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
+     to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
+     self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
+     self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
+
+   def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
+   def update_var(self, index:int, val:int): self.vals[index] = val
+
+ class NVProgram(HCQProgram):
    def __init__(self, device:NVDevice, name:str, lib:bytes):
      self.device, self.name, self.lib = device, name, lib
-     if DEBUG >= 6:
-       try:
-         fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-         with open(fn + ".cubin", "wb") as f: f.write(lib)
-         print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
-       except Exception as e: print("failed to disasm cubin", str(e))
-
-     self.rel_info, self.global_init, self.shmem_usage = None, None, 0
-     constant_buffers_data = {}
-
-     if MOCKGPU:
-       self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
-       constant_buffers_data[0] = memoryview(bytearray(0x190))
-     else:
-       _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
-       sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
-       shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
-       for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
-         section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
-         if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
-         elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
-           self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-           self.registers_usage = sh_info >> 24
-         if match := re.match(r'\.nv\.constant(\d+)', section_name):
-           constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-         if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-         elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
-         elif section_name == ".nv.info":
-           section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
-           for i in range(sh_size // 12):
-             if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
-               raise RuntimeError("too high local memory")
+     if DEBUG >= 6: nv_disassemble(lib)

-     # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
-     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
+     if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
+     else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

-     # Load program and constant buffers (if any)
      # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
-     self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
-       round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
-     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
+     self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
+
+     self.program_addr, self.program_sz, self.registers_usage, self.shmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0
+     self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
+     for sh in sections:
+       if sh.name == f".nv.shared.{self.name}": self.shmem_usage = sh.header.sh_size
+       if sh.name == f".text.{self.name}":
+         self.program_addr, self.program_sz, self.registers_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, sh.header.sh_info>>24
+       elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
+       elif sh.name == ".nv.info":
+         for off in range(0, sh.header.sh_size, 12):
+           typ, _, val = struct.unpack_from("III", sh.content, off)
+           if typ & 0xffff == 0x1204: self.device._ensure_has_local_memory(val + 0x240)
+
+     # Apply relocs
+     for apply_image_offset, rel_sym_offset, typ, _ in relocs:
+       # These types are CUDA-specific, applying them here
+       if typ == 2: image[apply_image_offset:apply_image_offset+8] = struct.pack('<Q', self.lib_gpu.va_addr + rel_sym_offset) # R_CUDA_64
+       elif typ == 0x38: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) & 0xffffffff)
+       elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
+       else: raise RuntimeError(f"unknown NV reloc {typ}")
+
+     ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

      self.constbuffer_0 = [0] * 88
-     self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
+     self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]

      smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
      self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
        invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
-       cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
+       cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
        shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
        max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-       barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
-       program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
-       program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
-       constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
-
-     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follow. Kernargs also appends QMD at the end of the kernel.
-     self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
-     self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
-     self.kernargs_offset = 0x160
-
-     # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
-     if 0 in constant_buffers_data: constant_buffers_data.pop(0)
-
-     off = round_up(self.program.nbytes, 128)
-
-     if self.rel_info is not None:
-       assert self.global_init is not None
-       global_init_addr = self.lib_gpu.base + off
-       for rel_i in range(0, len(self.rel_info), 4):
-         if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
-         elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
-         else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
-
-     HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
-     for st in range(0, len(self.program), 4095):
-       HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
-
-     if self.global_init is not None:
-       HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
-       off += round_up(self.global_init.nbytes, 128)
-       if 4 in constant_buffers_data: # >= 12.4
-         # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
-         assert constant_buffers_data[4].nbytes == 8
-         constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
-
-     for i,data in constant_buffers_data.items():
-       self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
-       self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
-       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
+       barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program_sz>>8,
+       program_address=self.program_addr, sass_version=0x89,
+       program_prefetch_addr_lower_shifted=self.program_addr>>8, program_prefetch_addr_upper_shifted=self.program_addr>>40)
+
+     for i,(addr,sz) in self.constbufs.items():
+       self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32)
+       self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (addr) & 0xffffffff)
+       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
        self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

-       HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
-       off += round_up(data.nbytes, 128)
+     # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
+     self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32

-     HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-     self.device.timeline_value += 1
-     self.device.synchronize()
+     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follow. Kernargs also appends QMD at the end of the kernel.
+     super().__init__(NVArgsState, self.device, self.name,
+       kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8), kernargs_args_offset=0x160)

    def __del__(self):
-     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)
+     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))

-   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+   def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
      if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
      if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
-       raise RuntimeError("Invalid global/local dims")
+       raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
+     return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

-     if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
-       self.device.kernargs_ptr = self.device.kernargs_page.base
-
-     # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
-     if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
-     kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
-
-     sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
-
-     queue = HWComputeQueue()
-     queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-     if wait or PROFILE: queue.timestamp(sig_st)
-     queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
-     queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
-     if wait or PROFILE: queue.timestamp(sig_en)
-     queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-     self.device.timeline_value += 1
-     self.device.kernargs_ptr += self.kernargs_alloc_size
-
-     if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
-     if wait:
-       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-       return (sig_en[1] - sig_st[1]) / 1e9
-
- class NVAllocator(HCQCompatAllocator):
-   def __init__(self, device:NVDevice): super().__init__(device)
-
-   def _alloc(self, size:int, options:BufferOptions):
+ class NVAllocator(HCQAllocator):
+   def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
      if options.host: return self.device._gpu_host_alloc(size)
      return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))

    def _free(self, opaque, options:BufferOptions):
      self.device.synchronize()
-     if options.host: self.device._gpu_host_free(opaque)
-     else: self.device._gpu_free(opaque)
+     self.device._gpu_free(opaque)
+
+   def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)

  @dataclass
  class GPFifo:
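Note: the max_threads formula in NVProgram can be checked by hand. Per the comment in the code, registers are allocated per warp with a granularity of 256, warps are allocated in groups of 4, and the register file holds 65536 registers. A worked example with a hypothetical kernel using 40 registers per thread:

    def round_up(x, a): return (x + a - 1) // a * a

    registers_usage = 40                                 # hypothetical kernel
    regs_per_warp = round_up(registers_usage * 32, 256)  # 1280 registers per 32-thread warp
    warps = (65536 // regs_per_warp) // 4 * 4            # 51 fitting warps, rounded down to 48
    assert warps * 32 == 1536                            # what the max_threads expression yields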
@@ -367,19 +311,18 @@ class GPFifo:
    put_value: int = 0

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(HCQCompatCompiled):
+ class NVDevice(HCQCompiled):
    root = None
    fd_ctl: int = -1
    fd_uvm: int = -1
-   gpus_info = None
+   gpus_info:Union[List, ctypes.Array] = []
    signals_page:Any = None
    signals_pool: List[Any] = []
    uvm_vaddr: int = 0x1000000000
    host_object_enumerator: int = 0x1000
-   devices: List[NVDevice] = []

    def _new_gpu_fd(self):
-     fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+     fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
      nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
      return fd_dev

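Note: GPFifo.controls (filled in by _new_gpu_fifo further down) and qmd_struct_t.from_address both use the same ctypes idiom: overlay a Structure on memory that is already mapped, so field reads and writes hit the device-visible bytes directly. A sketch with a stand-in struct (DemoControls is hypothetical, not the real AmpereAControlGPFifo layout):

    import ctypes

    class DemoControls(ctypes.Structure):
      _fields_ = [("GPGet", ctypes.c_uint32), ("GPPut", ctypes.c_uint32)]

    backing = (ctypes.c_uint32 * 2)(3, 7)  # pretend this is the mapped USERD area
    ctrl = DemoControls.from_address(ctypes.addressof(backing))
    ctrl.GPPut = 8                         # the write lands in the backing memory
    assert (backing[0], backing[1]) == (3, 8)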
@@ -388,8 +331,10 @@ class NVDevice(HCQCompatCompiled):
      made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
        params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
      nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
-     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
-     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
+     res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+     os.close(fd_dev)
+     return res

    def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
      size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
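Note: the added os.close(fd_dev) is safe because under POSIX a memory mapping stays valid after the file descriptor it was created from is closed. A quick demonstration with an ordinary file:

    import mmap, os, tempfile

    fd, path = tempfile.mkstemp()
    os.write(fd, b"hello")
    m = mmap.mmap(fd, 5)
    os.close(fd)              # the mapping outlives the descriptor
    assert m[:5] == b"hello"
    m.close(); os.unlink(path)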
@@ -404,7 +349,7 @@

      if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
      if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
-     return self._gpu_uvm_map(va_addr, size, mem_handle)
+     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)

    def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
      alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
@@ -417,51 +362,56 @@
      if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
      if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)

-     return self._gpu_uvm_map(va_addr, size, mem_handle)
+     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)

    def _gpu_host_alloc(self, size):
-     va_base = self._alloc_gpu_vaddr(sz:=round_up(size, 4 << 10))
-     libc.mmap(va_base, sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
-     return self._map_to_gpu(va_base, sz)
-
-   def _gpu_free(self, mem):
-     made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
-     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
-     if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
-     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+     va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
+     mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+     assert mapped_addr == va_base, f"Not mmapped at correct address {va_base=} != {mapped_addr=}"

-   def _gpu_host_free(self, mem):
-     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
-     libc.munmap(mem.base, mem.length)
-
-   def _map_to_gpu(self, va_base, size):
      NVDevice.host_object_enumerator += 1
      flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
        (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
      made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
-       hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1), fd=-1)
+       hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
      nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
-     if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
-     return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)

-   def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+     if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
+     return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True)
+
+   def _gpu_free(self, mem):
+     if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+       made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
+       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
+       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
+
+     uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
+     if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+
+   def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
      if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
      gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
        nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))

-     # NOTE: va_addr is set to make rawbufs compatible with AMD.
+     # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
      return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-       gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size)
+       gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)

    def _gpu_map(self, mem):
-     if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
-     mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
-     return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
+     if self.gpu_uuid in mem.mapped_gpu_ids: return
+     mem.mapped_gpu_ids.append(self.gpu_uuid)
+     self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False)

    def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
      NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
      return res_va

+   def _setup_nvclasses(self):
+     classlist = memoryview(bytearray(100 * 4)).cast('I')
+     clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
+     self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
+     self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
+
    def __init__(self, device:str=""):
      if NVDevice.root is None:
        NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
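Note: _alloc_gpu_vaddr above is a process-wide bump allocator over the UVM address range: round the cursor up to the requested alignment, hand out the next size bytes, never free. A sketch, with the 0x1000000000 base taken from the class attribute:

    def round_up(x, a): return (x + a - 1) // a * a

    class VaddrBump:
      cursor = 0x1000000000                  # NVDevice.uvm_vaddr's starting value
      @classmethod
      def alloc(cls, size, alignment=4 << 10):
        cls.cursor = (res_va := round_up(cls.cursor, alignment)) + size
        return res_va

    a = VaddrBump.alloc(0x123)               # 0x1000000000
    b = VaddrBump.alloc(0x1000, alignment=2 << 20)
    assert b % (2 << 20) == 0 and b > a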
@@ -471,48 +421,48 @@
        uvm.initialize(self.fd_uvm)
        with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too

-       NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
-       nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
+       nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
+       visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+       NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info

-     # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
      self.device_id = int(device.split(":")[1]) if ":" in device else 0
-     self.fd_dev = self._new_gpu_fd()

-     assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
-     gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
-     rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
-     device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
-     self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A
+     if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
+       raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
+     self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+     self.fd_dev = self._new_gpu_fd()

-     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
+     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
        vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
      self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
      self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
      self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
      self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

-     boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
+     self._setup_nvclasses()
+
+     rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
        (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
-     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)

      vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
        flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
      vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew

-     gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
-     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
-     self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
+     raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
+     self.gpu_uuid = (ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)])

      uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
      uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
        hClient=self.root, hVaSpace=vaspace)

      for dev in self.devices:
-       uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))
+       uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid),
+         gpuUuidB=nv_gpu.struct_nv_uuid(uuid=cast(NVDevice, dev).gpu_uuid))

      if NVDevice.signals_page is None:
        NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
-       NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
+       NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
      else: self._gpu_map(NVDevice.signals_page)

      channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
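Note: the signals page above is carved into 16-byte slots, each viewed as two uint64s; NVSignal._get_value reads the first (the semaphore payload) and _get_timestamp the second (a GPU-written timestamp that the code scales by 1/1000). A sketch of the slot slicing with a plain bytearray standing in for the mapped page:

    page = bytearray(16 * 4)  # stand-in for the mapped signals page
    pool = [memoryview(page)[off:off + 16].cast("Q") for off in range(0, len(page), 16)]
    pool[2][0] = 42           # set the semaphore value of slot 2
    assert pool[2][0] == 42 and pool[3][0] == 0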
@@ -526,100 +476,70 @@
      self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
      self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)

-     en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
-     rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
-
-     self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
+     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

      self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
-     self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
+     self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
      self.cmdq_wptr: int = 0 # in bytes

-     self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
-     self.kernargs_ptr: int = self.kernargs_page.base
-
-     self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
-
-     super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
-       functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])
-
-     self._cmdq_setup_compute_gpfifo()
-     self._cmdq_setup_dma_gpfifo()
-
-     NVDevice.devices.append(self)
-
-   @classmethod
-   def _read_signal(self, sig): return sig[0]
-
-   @classmethod
-   def _read_timestamp(self, sig): return sig[1]
-
-   @classmethod
-   def _set_signal(self, sig, value): sig[0] = value
+     sm_info = nv_gpu.NV2080_CTRL_GR_INFO(index=nv_gpu.NV2080_CTRL_GR_INFO_INDEX_SM_VERSION)
+     rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=1, grInfoList=ctypes.addressof(sm_info))
+     self.arch: str = f"sm_{(sm_info.data>>8)&0xff}{(val>>4) if (val:=sm_info.data&0xff) > 0xf else val}"

-   @classmethod
-   def _get_signal(self, value=0, **kwargs) -> memoryview:
-     self._set_signal(sig := self.signals_pool.pop(), value)
-     return sig
+     compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
+     super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
+       functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue, timeline_signals=(NVSignal(), NVSignal()))

-   @classmethod
-   def _wait_signal(self, signal, value=0, timeout=10000):
-     start_time = time.time() * 1000
-     while time.time() * 1000 - start_time < timeout:
-       if signal[0] >= value: return
-     raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
-
-   def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
-
-   def synchronize(self):
-     NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
-     self.cmdq_wptr = 0
-
-     if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
-     if PROFILE: self._prof_process_events()
+     self._setup_gpfifos()

    def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
      notifier = self._gpu_system_alloc(48 << 20)
      params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
-       gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
        hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
      gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
-     rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
+     rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None)
      rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)

-     ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
-     rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
+     ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
      assert ws_token_params.workSubmitToken != -1

      channel_base = self._alloc_gpu_vaddr(0x4000000)
      uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
        hChannel=gpfifo, base=channel_base, length=0x4000000)

-     return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
-       controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))
+     return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+       controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))
+
+   def _setup_gpfifos(self):
+     # Set window addresses to not collide with other allocated buffers.
+     self.shared_mem_window, self.local_mem_window, self.slm_per_thread = 0xfe000000, 0xff000000, 0
+
+     NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+       .signal(self.timeline_signal, self.timeline_value).submit(self)

-   def _cmdq_setup_compute_gpfifo(self):
-     self.slm_per_thread = 0x900
+     NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
+       .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+       .signal(self.timeline_signal, self.timeline_value + 1).submit(self)
+
+     self.timeline_value += 2
+
+   def _ensure_has_local_memory(self, required):
+     if self.slm_per_thread >= required: return
+
+     self.synchronize()
+     if hasattr(self, 'shader_local_mem'): self._gpu_free(self.shader_local_mem) # type: ignore # pylint: disable=access-member-before-definition
+
+     self.slm_per_thread = round_up(required, 32)
      bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
      bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
-     self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base
+     self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True)

-     # Set window addresses to not collide with other allocated buffers.
-     self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000
-
-     queue = HWComputeQueue()
-     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
-     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
-     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
-     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
-     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
-     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+     NVComputeQueue().setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+       .signal(self.timeline_signal, self.timeline_value).submit(self)
      self.timeline_value += 1
-     self.synchronize()

-   def _cmdq_setup_dma_gpfifo(self):
-     queue = HWCopyQueue()
-     queue.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
-     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
-     self.timeline_value += 1
-     self.synchronize()
+   def invalidate_caches(self):
+     rmctrl.fb_flush_gpu_cache(self.fd_ctl, self.root, self.subdevice,
+       flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
         (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4)))
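Note: the self.arch computation added above can be checked by hand. As the expression reads it, the SM_VERSION query returns the major revision in bits 8-15 and the minor in the low byte, shifting the minor down a nibble when it exceeds 0xf. A worked example (0x0809 is the value an sm_89 Ada part would report under this reading; the other inputs are hypothetical):

    def sm_arch(data: int) -> str:
      return f"sm_{(data >> 8) & 0xff}{(val >> 4) if (val := data & 0xff) > 0xf else val}"

    assert sm_arch(0x0809) == "sm_89"  # major 8, minor 9
    assert sm_arch(0x0704) == "sm_74"  # minor <= 0xf passes through unchanged
    assert sm_arch(0x0890) == "sm_89"  # minor 0x90 > 0xf is shifted down a nibble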