tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
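For readers who want to reproduce this kind of file-level comparison locally, the sketch below is one way to do it. It is illustrative only, not part of this page or of tinygrad; the wheels/ paths and the pip download commands in the comments are assumptions.

# Hypothetical reproduction sketch (not part of this diff or of tinygrad).
# Assumes both wheels were fetched first, e.g.:
#   pip download tinygrad==0.10.2 --no-deps -d wheels/
#   pip download tinygrad==0.11.0 --no-deps -d wheels/
import zipfile

def wheel_files(path: str) -> set[str]:
    # A wheel is a zip archive; its member names give the file list.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())

old = wheel_files("wheels/tinygrad-0.10.2-py3-none-any.whl")
new = wheel_files("wheels/tinygrad-0.11.0-py3-none-any.whl")
print("added:  ", len(new - old))
print("removed:", len(old - new))
print("present in both:", len(old & new))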
Files changed (131)
  1. tinygrad/__init__.py +1 -1
  2. tinygrad/apps/llm.py +206 -0
  3. tinygrad/codegen/__init__.py +116 -0
  4. tinygrad/codegen/devectorizer.py +315 -172
  5. tinygrad/codegen/expander.py +8 -16
  6. tinygrad/codegen/gpudims.py +89 -0
  7. tinygrad/codegen/linearize.py +205 -203
  8. tinygrad/codegen/lowerer.py +92 -139
  9. tinygrad/codegen/opt/__init__.py +38 -0
  10. tinygrad/codegen/opt/heuristic.py +125 -0
  11. tinygrad/codegen/opt/kernel.py +510 -0
  12. tinygrad/{engine → codegen/opt}/search.py +51 -35
  13. tinygrad/codegen/opt/swizzler.py +134 -0
  14. tinygrad/codegen/opt/tc.py +127 -0
  15. tinygrad/codegen/quantize.py +67 -0
  16. tinygrad/device.py +122 -132
  17. tinygrad/dtype.py +152 -35
  18. tinygrad/engine/jit.py +81 -54
  19. tinygrad/engine/memory.py +46 -27
  20. tinygrad/engine/realize.py +82 -41
  21. tinygrad/engine/schedule.py +70 -445
  22. tinygrad/frontend/__init__.py +0 -0
  23. tinygrad/frontend/onnx.py +1253 -0
  24. tinygrad/frontend/torch.py +5 -0
  25. tinygrad/gradient.py +19 -27
  26. tinygrad/helpers.py +95 -47
  27. tinygrad/nn/__init__.py +7 -8
  28. tinygrad/nn/optim.py +72 -41
  29. tinygrad/nn/state.py +37 -23
  30. tinygrad/renderer/__init__.py +40 -60
  31. tinygrad/renderer/cstyle.py +143 -128
  32. tinygrad/renderer/llvmir.py +113 -62
  33. tinygrad/renderer/ptx.py +50 -32
  34. tinygrad/renderer/wgsl.py +27 -23
  35. tinygrad/runtime/autogen/am/am.py +5861 -0
  36. tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  37. tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  38. tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  39. tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  40. tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  41. tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  42. tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  43. tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  44. tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
  45. tinygrad/runtime/autogen/comgr.py +35 -9
  46. tinygrad/runtime/autogen/comgr_3.py +906 -0
  47. tinygrad/runtime/autogen/cuda.py +2419 -494
  48. tinygrad/runtime/autogen/hsa.py +57 -16
  49. tinygrad/runtime/autogen/ib.py +7171 -0
  50. tinygrad/runtime/autogen/io_uring.py +917 -118
  51. tinygrad/runtime/autogen/kfd.py +748 -26
  52. tinygrad/runtime/autogen/libc.py +613 -218
  53. tinygrad/runtime/autogen/libusb.py +1643 -0
  54. tinygrad/runtime/autogen/nv/nv.py +8602 -0
  55. tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  56. tinygrad/runtime/autogen/opencl.py +2 -4
  57. tinygrad/runtime/autogen/sqtt.py +1789 -0
  58. tinygrad/runtime/autogen/vfio.py +3 -3
  59. tinygrad/runtime/autogen/webgpu.py +273 -264
  60. tinygrad/runtime/graph/cuda.py +3 -3
  61. tinygrad/runtime/graph/hcq.py +68 -29
  62. tinygrad/runtime/graph/metal.py +29 -13
  63. tinygrad/runtime/graph/remote.py +114 -0
  64. tinygrad/runtime/ops_amd.py +537 -320
  65. tinygrad/runtime/ops_cpu.py +108 -7
  66. tinygrad/runtime/ops_cuda.py +12 -14
  67. tinygrad/runtime/ops_disk.py +13 -10
  68. tinygrad/runtime/ops_dsp.py +47 -40
  69. tinygrad/runtime/ops_gpu.py +13 -11
  70. tinygrad/runtime/ops_hip.py +6 -9
  71. tinygrad/runtime/ops_llvm.py +35 -15
  72. tinygrad/runtime/ops_metal.py +29 -19
  73. tinygrad/runtime/ops_npy.py +5 -3
  74. tinygrad/runtime/ops_null.py +28 -0
  75. tinygrad/runtime/ops_nv.py +306 -234
  76. tinygrad/runtime/ops_python.py +62 -52
  77. tinygrad/runtime/ops_qcom.py +28 -39
  78. tinygrad/runtime/ops_remote.py +482 -0
  79. tinygrad/runtime/ops_webgpu.py +28 -28
  80. tinygrad/runtime/support/am/amdev.py +114 -249
  81. tinygrad/runtime/support/am/ip.py +211 -172
  82. tinygrad/runtime/support/amd.py +138 -0
  83. tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
  84. tinygrad/runtime/support/compiler_cuda.py +8 -11
  85. tinygrad/runtime/support/elf.py +2 -1
  86. tinygrad/runtime/support/hcq.py +184 -97
  87. tinygrad/runtime/support/ib.py +172 -0
  88. tinygrad/runtime/support/llvm.py +3 -4
  89. tinygrad/runtime/support/memory.py +251 -0
  90. tinygrad/runtime/support/nv/__init__.py +0 -0
  91. tinygrad/runtime/support/nv/ip.py +581 -0
  92. tinygrad/runtime/support/nv/nvdev.py +183 -0
  93. tinygrad/runtime/support/system.py +170 -0
  94. tinygrad/runtime/support/usb.py +268 -0
  95. tinygrad/runtime/support/webgpu.py +18 -0
  96. tinygrad/schedule/__init__.py +0 -0
  97. tinygrad/schedule/grouper.py +119 -0
  98. tinygrad/schedule/kernelize.py +368 -0
  99. tinygrad/schedule/multi.py +231 -0
  100. tinygrad/shape/shapetracker.py +40 -46
  101. tinygrad/shape/view.py +88 -52
  102. tinygrad/tensor.py +968 -542
  103. tinygrad/uop/__init__.py +117 -0
  104. tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
  105. tinygrad/uop/mathtraits.py +169 -0
  106. tinygrad/uop/ops.py +1021 -0
  107. tinygrad/uop/spec.py +228 -0
  108. tinygrad/{codegen → uop}/symbolic.py +239 -216
  109. tinygrad/uop/upat.py +163 -0
  110. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  111. tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  112. tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  113. tinygrad/viz/index.html +203 -403
  114. tinygrad/viz/js/index.js +718 -0
  115. tinygrad/viz/js/worker.js +29 -0
  116. tinygrad/viz/serve.py +224 -102
  117. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
  118. tinygrad-0.11.0.dist-info/RECORD +141 -0
  119. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
  120. tinygrad/codegen/kernel.py +0 -693
  121. tinygrad/engine/multi.py +0 -161
  122. tinygrad/ops.py +0 -1003
  123. tinygrad/runtime/ops_cloud.py +0 -220
  124. tinygrad/runtime/support/allocator.py +0 -94
  125. tinygrad/spec.py +0 -155
  126. tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  127. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  128. tinygrad/viz/perfetto.html +0 -178
  129. tinygrad-0.10.2.dist-info/RECORD +0 -99
  130. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
  131. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,20 @@
  from __future__ import annotations
- import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
+ import os, ctypes, contextlib, re, functools, mmap, struct, array, sys, weakref
  assert sys.platform != 'win32'
- from typing import Any, cast, Union, Type
+ from typing import cast, ClassVar
  from dataclasses import dataclass
  from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
- from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
- from tinygrad.ops import sint
- from tinygrad.device import BufferSpec, CPUProgram
- from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX
+ from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
+ from tinygrad.uop.ops import sint
+ from tinygrad.device import BufferSpec
+ from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, suppress_finalizing
  from tinygrad.renderer.ptx import PTXRenderer
  from tinygrad.renderer.cstyle import NVRenderer
  from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
- from tinygrad.runtime.autogen import nv_gpu
+ from tinygrad.runtime.autogen import nv_gpu, pci
  from tinygrad.runtime.support.elf import elf_loader
+ from tinygrad.runtime.support.nv.nvdev import NVDev, NVMemoryManager
+ from tinygrad.runtime.support.system import System, PCIIfaceBase, MAP_FIXED
  if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import

  def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
@@ -20,33 +22,11 @@ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status
  NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
  NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

- def nv_iowr(fd:HWInterface, nr, args):
+ def nv_iowr(fd:FileIOInterface, nr, args):
  ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

- def rm_alloc(fd, clss, root, parant, params):
- made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
- pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
- nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
- if made.status != 0:
- if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
- raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
- return made
-
- def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
- made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
- params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
- nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
- if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
- return params
-
- def make_rmctrl_type():
- return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
- for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", \
- getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
- rmctrl = make_rmctrl_type()
-
- def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+ def uvm_ioctl(cmd, sttyp, fd:FileIOInterface, **kwargs):
  ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
  if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
@@ -57,28 +37,41 @@ def make_uvm_type():
  for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
  uvm = make_uvm_type()

- def make_qmd_struct_type():
- fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
- bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
- bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
- bits = sorted(bits, key=lambda x: x[1][1])
- for i,(name, data) in enumerate(bits):
- if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
- fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
- if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
- fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
- return init_c_struct_t(tuple(fields))
- qmd_struct_t = make_qmd_struct_type()
- assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
-
- class NVSignal(HCQSignal):
- def __init__(self, base_addr:int|None=None, **kwargs):
- super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+ class QMD:
+ fields: dict[str, dict[str, tuple[int, int]]] = {}

- def __del__(self):
- if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
+ def __init__(self, dev:NVDevice, addr:int|None=None, **kwargs):
+ self.ver, self.sz = (5, 0x60) if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else (3, 0x40)
+
+ # Init fields from module
+ if (pref:="NVCEC0_QMDV05_00" if self.ver == 5 else "NVC6C0_QMDV03_00") not in QMD.fields:
+ QMD.fields[pref] = {**{name[len(pref)+1:]: dt for name,dt in nv_gpu.__dict__.items() if name.startswith(pref) and isinstance(dt, tuple)},
+ **{name[len(pref)+1:]+f"_{i}": dt(i) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith(pref) and callable(dt)}}
+
+ self.mv, self.pref = (memoryview(bytearray(self.sz * 4)) if addr is None else to_mv(addr, self.sz * 4)), pref
+ if kwargs: self.write(**kwargs)
+
+ def _rw_bits(self, hi:int, lo:int, value:int|None=None):
+ mask = ((1 << (width:=hi - lo + 1)) - 1) << (lo % 8)
+ num = int.from_bytes(self.mv[lo//8:hi//8+1], "little")

- class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
+ if value is None: return (num & mask) >> (lo % 8)
+
+ if value >= (1 << width): raise ValueError(f"{value:#x} does not fit.")
+ self.mv[lo//8:hi//8+1] = int((num & ~mask) | ((value << (lo % 8)) & mask)).to_bytes((hi//8 - lo//8 + 1), "little")
+
+ def write(self, **kwargs):
+ for k,val in kwargs.items(): self._rw_bits(*QMD.fields[self.pref][k.upper()], value=val) # type: ignore [misc]
+
+ def read(self, k, val=0): return self._rw_bits(*QMD.fields[self.pref][k.upper()])
+
+ def field_offset(self, k): return QMD.fields[self.pref][k.upper()][1] // 8
+
+ def set_constant_buf_addr(self, i, addr):
+ if self.ver < 4: self.write(**{f'constant_buffer_addr_upper_{i}':hi32(addr), f'constant_buffer_addr_lower_{i}':lo32(addr)})
+ else: self.write(**{f'constant_buffer_addr_upper_shifted6_{i}':hi32(addr >> 6), f'constant_buffer_addr_lower_shifted6_{i}':lo32(addr >> 6)})
+
+ class NVCommandQueue(HWQueue[HCQSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
  def __init__(self):
  self.active_qmd = None
  super().__init__()
@@ -97,17 +90,17 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
  if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
  return self

- def wait(self, signal:NVSignal, value:sint=0):
+ def wait(self, signal:HCQSignal, value:sint=0):
  self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
  self.active_qmd = None
  return self

- def timestamp(self, signal:NVSignal): return self.signal(signal, 0)
+ def timestamp(self, signal:HCQSignal): return self.signal(signal, 0)

  def bind(self, dev:NVDevice):
  self.binded_device = dev
  self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
- hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+ hw_view = self.hw_page.cpu_view().view(fmt='I')
  for i, value in enumerate(self._q): hw_view[i] = value

  # From now on, the queue is on the device for faster submission.
@@ -123,48 +116,48 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
  gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
  gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count

- if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
+ System.memory_barrier()
  dev.gpu_mmio[0x90 // 4] = gpfifo.token
  gpfifo.put_value += 1

  class NVComputeQueue(NVCommandQueue):
  def memory_barrier(self):
  self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
- self.active_qmd = None
+ self.active_qmd:QMD|None = None
  return self

  def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
  self.bind_args_state(args_state)

- ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
- assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"
+ qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
+ qmd_buf.cpu_view().view(size=prg.qmd.mv.nbytes, fmt='B')[:] = prg.qmd.mv
+ assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"

- qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+ qmd = QMD(dev=prg.dev, addr=cast(int, qmd_buf.va_addr)) # Save qmd for later update

- self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
- self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
- self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
- qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
+ self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width' if qmd.ver<4 else 'grid_width'))
+ self.bind_sints_to_mem(*(local_size[:2]), mem=qmd_buf.cpu_view(), fmt='H', offset=qmd.field_offset('cta_thread_dimension0'))
+ self.bind_sints_to_mem(local_size[2], mem=qmd_buf.cpu_view(), fmt='B', offset=qmd.field_offset('cta_thread_dimension2'))
+ qmd.set_constant_buf_addr(0, args_state.buf.va_addr)

  if self.active_qmd is None:
- self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
+ self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
  self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
  else:
- self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
- self.active_qmd.dependent_qmd0_action = 1
- self.active_qmd.dependent_qmd0_prefetch = 1
- self.active_qmd.dependent_qmd0_enable = 1
+ self.active_qmd.write(dependent_qmd0_pointer=qmd_buf.va_addr >> 8, dependent_qmd0_action=1, dependent_qmd0_prefetch=1, dependent_qmd0_enable=1)

- self.active_qmd = qmd
+ self.active_qmd, self.active_qmd_buf = qmd, qmd_buf
  return self

- def signal(self, signal:NVSignal, value:sint=0):
+ def signal(self, signal:HCQSignal, value:sint=0):
  if self.active_qmd is not None:
  for i in range(2):
- if getattr(self.active_qmd, f'release{i}_enable') == 0:
- setattr(self.active_qmd, f'release{i}_enable', 1)
- self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
- self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
+ if self.active_qmd.read(f'release{i}_enable') == 0:
+ self.active_qmd.write(**{f'release{i}_enable': 1})
+ self.bind_sints_to_mem(signal.value_addr, mem=self.active_qmd_buf.cpu_view(), fmt='Q', mask=0xfffffffff,
+ offset=self.active_qmd.field_offset(f'release{i}_address_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_addr_lower'))
+ self.bind_sints_to_mem(value, mem=self.active_qmd_buf.cpu_view(), fmt='Q',
+ offset=self.active_qmd.field_offset(f'release{i}_payload_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_payload_lower'))
  return self

  self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
@@ -177,12 +170,13 @@ class NVComputeQueue(NVCommandQueue):

  class NVCopyQueue(NVCommandQueue):
  def copy(self, dest:sint, src:sint, copy_size:int):
- self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src), *data64(dest))
- self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, copy_size)
- self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+ for off in range(0, copy_size, step:=(1 << 31)):
+ self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src+off), *data64(dest+off))
+ self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, min(copy_size-off, step))
+ self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
  return self

- def signal(self, signal:NVSignal, value:sint=0):
+ def signal(self, signal:HCQSignal, value:sint=0):
  self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
  self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
  return self
@@ -190,31 +184,34 @@ class NVCopyQueue(NVCommandQueue):
  def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)

  class NVArgsState(CLikeArgsState):
- def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
+ def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
  if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
- super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
+ super().__init__(buf, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

  class NVProgram(HCQProgram):
  def __init__(self, dev:NVDevice, name:str, lib:bytes):
  self.dev, self.name, self.lib = dev, name, lib

+ # For MOCKGPU, the lib is PTX code, so some values are emulated.
+ cbuf0_size = 0 if not MOCKGPU else 0x160
+
  if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
  else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

  # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
- self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
+ self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, buf_spec:=BufferSpec(cpu_access=True))

  self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
  self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
  for sh in sections:
  if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
- if sh.name == f".text.{self.name}":
- self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16)
+ if sh.name == f".text.{self.name}": self.prog_addr, self.prog_sz = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size
  elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
- elif sh.name == ".nv.info":
- for off in range(0, sh.header.sh_size, 12):
- typ, _, val = struct.unpack_from("III", sh.content, off)
- if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240
+ elif sh.name.startswith(".nv.info"):
+ for typ, param, data in self._parse_elf_info(sh):
+ if sh.name == f".nv.info.{name}" and param == 0xa: cbuf0_size = struct.unpack_from("IH", data)[1] # EIATTR_PARAM_CBANK
+ elif sh.name == ".nv.info" and param == 0x12: self.lcmem_usage = struct.unpack_from("II", data)[1] + 0x240 # EIATTR_MIN_STACK_SIZE
+ elif sh.name == ".nv.info" and param == 0x2f: self.regs_usage = struct.unpack_from("II", data)[1] # EIATTR_REGCOUNT

  # Ensure device has enough local memory to run the program
  self.dev._ensure_has_local_memory(self.lcmem_usage)
@@ -229,33 +226,44 @@ class NVProgram(HCQProgram):

  ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

- self.constbuffer_0 = [0] * 88
- self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
+ self.constbuffer_0 = [0] * (cbuf0_size // 4)
+
+ if dev.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A:
+ self.constbuffer_0[188:192], self.constbuffer_0[223] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window)], 0xfffdc0
+ qmd = {'qmd_major_version':5, 'qmd_type':nv_gpu.NVCEC0_QMDV05_00_QMD_TYPE_GRID_CTA, 'register_count':self.regs_usage,
+ 'program_address_upper_shifted4':hi32(self.prog_addr>>4), 'program_address_lower_shifted4':lo32(self.prog_addr>>4),
+ 'shared_memory_size_shifted7':self.shmem_usage>>7, 'shader_local_memory_high_size_shifted4':self.dev.slm_per_thread>>4}
+ else:
+ self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
+ qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'shader_local_memory_high_size':self.dev.slm_per_thread,
+ 'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr), 'shared_memory_size':self.shmem_usage,
+ 'register_count_v':self.regs_usage}

  smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
- self.qmd: ctypes.Structure = \
- qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
- invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
- cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
- shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
- max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
- barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
- program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
+
+ self.qmd:QMD = QMD(dev, **qmd, qmd_group_id=0x3f, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+ invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, barrier_count=1,
+ cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, constant_buffer_invalidate_0=1,
+ min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg, max_sm_config_shared_mem_size=0x1a,
+ program_prefetch_size=min(self.prog_sz>>8, 0x1ff), sass_version=dev.sass_version,
+ program_prefetch_addr_upper_shifted=self.prog_addr>>40, program_prefetch_addr_lower_shifted=self.prog_addr>>8)

  for i,(addr,sz) in self.constbufs.items():
- self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32)
- self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (addr) & 0xffffffff)
- self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
- self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
+ self.qmd.set_constant_buf_addr(i, addr)
+ self.qmd.write(**{f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})

  # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
  self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

- # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
+ # NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
  super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
+ weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)

- def __del__(self):
- if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))
+ def _parse_elf_info(self, sh, start_off=0):
+ while start_off < sh.header.sh_size:
+ typ, param, sz = struct.unpack_from("BBH", sh.content, start_off)
+ yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz
+ start_off += (sz if typ == 0x4 else 0) + 4

  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
  if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
@@ -266,31 +274,28 @@ class NVProgram(HCQProgram):

  class NVAllocator(HCQAllocator['NVDevice']):
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
- if options.host: return self.dev._gpu_alloc(size, host=True, tag="user host memory")
- return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")
+ return self.dev.iface.alloc(size, cpu_access=options.cpu_access, host=options.host)

+ @suppress_finalizing
  def _free(self, opaque:HCQBuffer, options:BufferSpec):
  self.dev.synchronize()
- self.dev._gpu_free(opaque)
+ self.dev.iface.free(opaque)

- def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
+ def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)

  @dataclass
  class GPFifo:
- ring: memoryview
+ ring: MMIOInterface
  controls: nv_gpu.AmpereAControlGPFifo
  entries_count: int
  token: int
  put_value: int = 0

- MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(HCQCompiled[NVSignal]):
+ class NVKIface:
  root = None
- fd_ctl: HWInterface
- fd_uvm: HWInterface
- gpus_info: Union[list, ctypes.Array] = []
- signals_page: Any = None
- signals_pool: list[int] = []
+ fd_ctl: FileIOInterface
+ fd_uvm: FileIOInterface
+ gpus_info: list|ctypes.Array = []

  # TODO: Need a proper allocator for va addresses
  # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
@@ -299,34 +304,98 @@ class NVDevice(HCQCompiled[NVSignal]):
  uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
  host_object_enumerator: int = 0x1000

+ def __init__(self, dev, device_id):
+ if NVKIface.root is None:
+ NVKIface.fd_ctl = FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ NVKIface.fd_uvm = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ self.fd_uvm_2 = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ NVKIface.root = self.rm_alloc(0, nv_gpu.NV01_ROOT_CLIENT, None, root=0)
+ uvm.initialize(self.fd_uvm)
+ with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
+
+ nv_iowr(NVKIface.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+ NVKIface.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
+
+ self.dev, self.device_id = dev, device_id
+ if self.device_id >= len(NVKIface.gpus_info) or not NVKIface.gpus_info[self.device_id].valid:
+ raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
+
+ self.fd_dev = self._new_gpu_fd()
+ self.gpu_info = self.rm_control(self.root, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2,
+ nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVKIface.gpus_info[self.device_id].gpu_id))
+ self.gpu_minor = NVKIface.gpus_info[self.device_id].minor_number
+ self.gpu_instance = self.gpu_info.deviceInstance
+
+ def rm_alloc(self, parent, clss, params=None, root=None) -> int:
+ nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_ALLOC, made:=nv_gpu.NVOS21_PARAMETERS(hRoot=root if root is not None else self.root,
+ hObjectParent=parent, hClass=clss, pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None))
+ if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
+ if made.status != 0: raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
+ return made.hObjectNew
+
+ def rm_control(self, obj, cmd, params=None):
+ nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_CONTROL, made:=nv_gpu.NVOS54_PARAMETERS(hClient=self.root, hObject=obj, cmd=cmd,
+ paramsSize=ctypes.sizeof(params), params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None))
+ if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
+ return params
+
+ def setup_usermode(self):
+ clsinfo = self.rm_control(self.dev.nvdevice, nv_gpu.NV0080_CTRL_CMD_GPU_GET_CLASSLIST, nv_gpu.NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS(numClasses=100,
+ classList=mv_address(classlist:=memoryview(bytearray(100 * 4)).cast('I'))))
+ self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
+ self.usermode_class:int = next(c for c in [nv_gpu.HOPPER_USERMODE_A, nv_gpu.TURING_USERMODE_A] if c in self.nvclasses)
+ self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
+ self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
+ self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
+
+ usermode = self.rm_alloc(self.dev.subdevice, self.usermode_class)
+ return usermode, MMIOInterface(self._gpu_map_to_cpu(usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
+
+ def setup_vm(self, vaspace):
+ self.rm_control(self.dev.subdevice, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, raw_uuid:=nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(
+ flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16))
+ self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
+
+ uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
+ uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)
+
+ for dev in cast(list[NVDevice], [d for pg in HCQCompiled.peer_groups.values() for d in pg if isinstance(d, NVDevice) and not d.is_nvd()]):
+ try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.iface.gpu_uuid)
+ except RuntimeError as e: raise RuntimeError(f"{e}. Make sure GPUs #{self.gpu_minor} & #{dev.iface.gpu_minor} have P2P enabled.") from e
+
+ def setup_gpfifo_vm(self, gpfifo):
+ uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
+ hChannel=gpfifo, base=self._alloc_gpu_vaddr(0x4000000, force_low=True), length=0x4000000)
+
  def _new_gpu_fd(self):
- fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
+ fd_dev = FileIOInterface(f"/dev/nvidia{NVKIface.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
  nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
  return fd_dev

  def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
- fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ fd_dev = self._new_gpu_fd() if not system else FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
  made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
- params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
+ params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.dev.nvdevice, hMemory=memory_handle, length=size, flags=flags))
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
  if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
  return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)

- def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, cpu_addr=None) -> HCQBuffer:
  # Uncached memory is "system". Use huge pages only for gpu memory.
  page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
  size = round_up(size, page_size)
  va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)

  if host:
- va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+ va_addr = cpu_addr or FileIOInterface.anon_mmap(va_addr, size, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, 0)

  flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
  | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)

- NVDevice.host_object_enumerator += 1
- made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
- hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
+ NVKIface.host_object_enumerator += 1
+ made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.dev.nvdevice, flags=flags,
+ hObjectNew=NVKIface.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
  nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)

  if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
@@ -344,169 +413,171 @@ class NVDevice(HCQCompiled[NVSignal]):
344
413
  alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
345
414
  alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
346
415
  type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
347
- mem_handle = rm_alloc(self.fd_ctl, alloc_func, self.root, self.nvdevice, alloc_params).hObjectNew
416
+ mem_handle = self.rm_alloc(self.dev.nvdevice, alloc_func, alloc_params)
348
417
 
349
418
  if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)
350
419
 
351
- return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
420
+ return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host)
352
421
 
353
- def _gpu_free(self, mem:HCQBuffer):
354
- if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
355
- made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
422
+ def free(self, mem:HCQBuffer):
423
+ if mem.meta.hMemory > NVKIface.host_object_enumerator: # not a host object, clear phys mem.
424
+ made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.dev.nvdevice, hObjectOld=mem.meta.hMemory)
356
425
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
357
426
  if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
358
427
 
359
- self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
360
428
  uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
361
- if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)
429
+ if mem.meta.has_cpu_mapping: FileIOInterface.munmap(cast(int, mem.va_addr), mem.size)
362
430
 
363
- def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
431
+ def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> HCQBuffer:
364
432
  if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
365
433
  attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
366
434
 
367
435
  # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
368
- self._debug_mappings[(va_base, size)] = tag
369
436
  return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
370
- hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
371
- mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
437
+ hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid],
438
+ has_cpu_mapping=has_cpu_mapping), view=MMIOInterface(va_base, size, fmt='B') if has_cpu_mapping else None, owner=self.dev)
372
439
 
373
- def _gpu_map(self, mem:HCQBuffer):
374
- if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
375
- mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
376
- self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")
440
+ def map(self, mem:HCQBuffer):
441
+ if mem.owner is not None and mem.owner._is_cpu():
442
+ if not any(x.device.startswith("NV") for x in mem.mapped_devs): return self.alloc(mem.size, host=True, cpu_addr=mem.va_addr)
443
+ mem = mem.mappings[next(x for x in mem.mapped_devs if x.device.startswith("NV"))]
444
+ self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False)
377
445
 
378
446
  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
379
- return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
447
+ return NVKIface.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVKIface.uvm_vaddr_allocator.alloc(size, alignment)
380
448
 
381
- def _setup_nvclasses(self):
382
- classlist = memoryview(bytearray(100 * 4)).cast('I')
383
- clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
384
- self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
385
- self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
449
+ class PCIIface(PCIIfaceBase):
450
+ gpus:ClassVar[list[str]] = []
386
451
 
387
- def __init__(self, device:str=""):
388
- if NVDevice.root is None:
389
- NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
390
- NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
391
- self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
392
- NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
393
- uvm.initialize(self.fd_uvm)
394
- with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
452
+ def __init__(self, dev, dev_id):
453
+ super().__init__(dev, dev_id, vendor=0x10de, devices=[0x2204, 0x2684, 0x2b85], bars=[0, 1], vram_bar=1,
454
+ va_start=NVMemoryManager.va_allocator.base, va_size=NVMemoryManager.va_allocator.size)
455
+ System.reserve_hugepages(64)
395
456
 
396
- nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
397
- visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
398
- NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
457
+ self.pci_dev.write_config(pci.PCI_COMMAND, self.pci_dev.read_config(pci.PCI_COMMAND, 2) | pci.PCI_COMMAND_MASTER, 2)
458
+ self.dev_impl:NVDev = NVDev(self.pci_dev.pcibus, self.pci_dev.map_bar(0, fmt='I'), self.pci_dev.map_bar(1),
459
+ self.pci_dev.read_config(pci.PCI_VENDOR_ID, 4), self.pci_dev.read_config(pci.PCI_SUBSYSTEM_VENDOR_ID, 4),
460
+ self.pci_dev.read_config(pci.PCI_REVISION_ID, 1), self.pci_dev.bar_info)
461
+ self.root, self.gpu_instance, self.p2p_base_addr = 0xc1000000, 0, self.pci_dev.bar_info[1][0]
462
+ self.rm_alloc(0, nv_gpu.NV01_ROOT, nv_gpu.NV0000_ALLOC_PARAMETERS())
399
463
 
400
- self.device_id = int(device.split(":")[1]) if ":" in device else 0
464
+ # Setup classes for the GPU
465
+ self.gpfifo_class, self.compute_class, self.dma_class = (gsp:=self.dev_impl.gsp).gpfifo_class, gsp.compute_class, gsp.dma_class
401
466
 
402
- if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
403
- raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
467
+ def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
468
+ # Force use of huge pages for large allocations. NVDev will attempt to use huge pages in any case,
469
+ # but if the size is not aligned, the tail will be allocated with 4KB pages, increasing TLB pressure.
470
+ page_size = (2 << 20) if size >= (8 << 20) and not uncached and not host else (4 << 10)
471
+ return super().alloc(round_up(size, page_size), host=host, uncached=uncached, cpu_access=cpu_access, contiguous=contiguous, **kwargs)
404
472
 
405
- self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
406
- self.gpu_minor = NVDevice.gpus_info[self.device_id].minor_number
407
- self.fd_dev = self._new_gpu_fd()
473
+ def setup_usermode(self): return 0xce000000, self.pci_dev.map_bar(bar=0, fmt='I', off=0xbb0000, size=0x10000)
474
+ def setup_vm(self, vaspace): pass
475
+ def setup_gpfifo_vm(self, gpfifo): pass
408
476
 
409
- device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
410
- vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
411
- self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
412
- self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
413
- self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
414
- self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
477
+ def rm_alloc(self, parent, clss, params=None, root=None) -> int: return self.dev_impl.gsp.rpc_rm_alloc(parent, clss, params, self.root)
478
+ def rm_control(self, obj, cmd, params=None): return self.dev_impl.gsp.rpc_rm_control(obj, cmd, params, self.root)
415
479
 
416
- self._setup_nvclasses()
417
- self._debug_mappings: dict[tuple[int, int], str] = dict()
480
+ def device_fini(self): self.dev_impl.fini()
418
481
 
419
- rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
420
- (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
482
+ class NVDevice(HCQCompiled[HCQSignal]):
483
+ def is_nvd(self) -> bool: return isinstance(self.iface, PCIIface)
421
484
 
422
- vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
423
- flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
424
- vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew
485
+ def __init__(self, device:str=""):
486
+ self.device_id = int(device.split(":")[1]) if ":" in device else 0
487
+ self.iface = self._select_iface(NVKIface, PCIIface)
425
488
 
426
- raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
427
- self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
489
+ device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.iface.gpu_instance, hClientShare=self.iface.root,
490
+ vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
491
+ self.nvdevice = self.iface.rm_alloc(self.iface.root, nv_gpu.NV01_DEVICE_0, device_params)
492
+ self.subdevice = self.iface.rm_alloc(self.nvdevice, nv_gpu.NV20_SUBDEVICE_0, nv_gpu.NV2080_ALLOC_PARAMETERS())
493
+ self.usermode, self.gpu_mmio = self.iface.setup_usermode()
428
494
 
429
- uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
430
- uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)
495
+ self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, nv_gpu.NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff,
496
+ flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | \
497
+ (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX))))
431
498
 
432
- for dev in cast(list[NVDevice], self.devices):
433
- try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
434
- except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
499
+ vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
500
+ flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
501
+ vaspace = self.iface.rm_alloc(self.nvdevice, nv_gpu.FERMI_VASPACE_A, vaspace_params)
435
502
 
436
- if NVDevice.signals_page is None:
437
- NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
438
- NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
439
- else: self._gpu_map(NVDevice.signals_page)
503
+ self.iface.setup_vm(vaspace)
440
504
 
441
505
  channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
442
- channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew
506
+ channel_group = self.iface.rm_alloc(self.nvdevice, nv_gpu.KEPLER_CHANNEL_GROUP_A, channel_params)
443
507
 
444
- gpfifo_area = self._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000, tag="gpfifo")
508
+ gpfifo_area = self.iface.alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000)
445
509
 
446
510
  ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
447
- ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
511
+ ctxshare = self.iface.rm_alloc(channel_group, nv_gpu.FERMI_CONTEXT_SHARE_A, ctxshare_params)
448
512
 
449
- self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, enable_debug=True)
450
- self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)
513
+ self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, compute=True)
514
+ self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000, compute=False)
515
+ self.iface.rm_control(channel_group, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1))
451
516
 
452
- rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
453
-
454
- self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
517
+ self.cmdq_page:HCQBuffer = self.iface.alloc(0x200000, cpu_access=True)
455
518
  self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
456
- self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I")
519
+ self.cmdq = MMIOInterface(cast(int, self.cmdq_page.va_addr), 0x200000, fmt='I')
457
520
 
458
521
  self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
459
522
  'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
460
- self.arch: str = f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
523
+
524
+ # FIXME: no idea how to convert this for blackwells
525
+ self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
526
+ self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)
461
527
 
462
528
  compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
463
529
  super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
464
- functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
530
+ functools.partial(NVProgram, self), HCQSignal, NVComputeQueue, NVCopyQueue)
465
531
 
466
532
  self._setup_gpfifos()
467
533
 
- def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
- notifier = self._gpu_alloc(48 << 20, uncached=True)
+ def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, compute=False) -> GPFifo:
+ notifier = self.iface.alloc(48 << 20, uncached=True)
  params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
  gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
  hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
- gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
- comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
- rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+ gpfifo = self.iface.rm_alloc(channel_group, self.iface.gpfifo_class, params)
 
- if enable_debug:
- self.debug_compute_obj, self.debug_channel = comp, gpfifo
- debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
- self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew
+ if compute:
+ self.debug_compute_obj, self.debug_channel = self.iface.rm_alloc(gpfifo, self.iface.compute_class), gpfifo
+ debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.iface.root, hClass3dObject=self.debug_compute_obj)
+ self.debugger = self.iface.rm_alloc(self.nvdevice, nv_gpu.GT200_DEBUGGER, debugger_params)
+ else: self.iface.rm_alloc(gpfifo, self.iface.dma_class)
 
- ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
- assert ws_token_params.workSubmitToken != -1
-
- channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
- uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
- hChannel=gpfifo, base=channel_base, length=0x4000000)
+ ws_token_params = self.iface.rm_control(gpfifo, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN,
+ nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1))
+ self.iface.setup_gpfifo_vm(gpfifo)
 
- return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+ return GPFifo(ring=MMIOInterface(gpfifo_area.va_addr + offset, entries*8, fmt='Q'), entries_count=entries, token=ws_token_params.workSubmitToken,
  controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))
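The userdOffset arithmetic above implies a simple layout inside the shared gpfifo_area: a ring of `entries` 8-byte GP entries starting at `offset`, with the USERD/control block placed directly after the ring. A small illustrative check (the helper name is made up for the example):

```python
# Illustrative layout check for the GPFIFO parameters above: 8 bytes per GP entry,
# USERD/controls placed immediately after the ring (userdOffset = entries*8 + offset).
def gpfifo_layout(offset:int, entries:int) -> dict[str,int]:
  ring_bytes = entries * 8
  return {"ring_base": offset, "ring_bytes": ring_bytes, "userd_base": offset + ring_bytes}

# e.g. the compute FIFO created earlier: offset=0, entries=0x10000 -> 512 KiB ring, USERD at 0x80000
assert gpfifo_layout(0, 0x10000) == {"ring_base": 0, "ring_bytes": 0x80000, "userd_base": 0x80000}
# and the dma FIFO at offset=0x100000
assert gpfifo_layout(0x100000, 0x10000)["userd_base"] == 0x180000
```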
 
  def _query_gpu_info(self, *reqs):
- nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(),None)) for r in reqs]
+ nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(), None)) for r in reqs]
+
+ if self.is_nvd():
+ x = self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_INTERNAL_STATIC_KGR_GET_INFO,
+ nv_gpu.NV2080_CTRL_INTERNAL_STATIC_GR_GET_INFO_PARAMS())
+ return [x.engineInfo[0].infoList[nvr].data for nvr in nvrs]
+
  infos = (nv_gpu.NV2080_CTRL_GR_INFO*len(nvrs))(*[nv_gpu.NV2080_CTRL_GR_INFO(index=nvr) for nvr in nvrs])
- rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=len(infos), grInfoList=ctypes.addressof(infos))
+ self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_GR_GET_INFO,
+ nv_gpu.NV2080_CTRL_GR_GET_INFO_PARAMS(grInfoListSize=len(infos), grInfoList=ctypes.addressof(infos)))
  return [x.data for x in infos]
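The nested getattr in `_query_gpu_info` is a two-prefix name lookup with a `None` fallback; a self-contained illustration against a hypothetical namespace (the attribute used here is made up for the example):

```python
# Illustration of the two-level getattr fallback used above (hypothetical namespace).
import types
ns = types.SimpleNamespace(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS=3)

def gr_info_index(name:str):
  return getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_'+name.upper(),
                 getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+name.upper(), None))

assert gr_info_index('num_gpcs') == 3          # resolved through the LITTER_ fallback
assert gr_info_index('missing_field') is None  # neither prefix matched
```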
 
  def _setup_gpfifos(self):
+ self.slm_per_thread, self.shader_local_mem = 0, None
+
  # Set windows addresses to not collide with other allocated buffers.
- self.shared_mem_window, self.local_mem_window, self.slm_per_thread, self.shader_local_mem = 0xfe000000, 0xff000000, 0, None
+ self.shared_mem_window, self.local_mem_window = 0x729400000000, 0x729300000000
 
- NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
- .signal(self.timeline_signal, self.timeline_value).submit(self)
+ NVComputeQueue().setup(compute_class=self.iface.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+ .signal(self.timeline_signal, self.next_timeline()).submit(self)
 
- cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
- .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
- .signal(self.timeline_signal, self.timeline_value + 1).submit(self)
+ NVCopyQueue().wait(self.timeline_signal, self.timeline_value - 1) \
+ .setup(copy_class=self.iface.dma_class) \
+ .signal(self.timeline_signal, self.next_timeline()).submit(self)
 
- self.timeline_value += 2
+ self.synchronize()
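The setup now signals `self.next_timeline()` and finishes with `self.synchronize()` instead of bumping `timeline_value` by hand. A rough sketch of the assumed timeline semantics follows; this is a toy model, not the actual HCQ base-class implementation, and its details may differ from tinygrad's:

```python
# Toy model of the timeline-signal bookkeeping assumed above; the real logic lives
# in tinygrad's HCQ base classes and may differ in detail.
class ToyTimeline:
  def __init__(self):
    self.timeline_value = 1      # next value to hand out
    self.signaled = 0            # highest value the "GPU" has signaled so far
  def next_timeline(self) -> int:
    self.timeline_value += 1
    return self.timeline_value - 1            # value the submitted queue will signal
  def synchronize(self):
    self.signaled = self.timeline_value - 1   # pretend all submitted work completed

t = ToyTimeline()
first, second = t.next_timeline(), t.next_timeline()
assert (first, second) == (1, 2)
t.synchronize()
assert t.signaled == 2
```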
 
  def _ensure_has_local_memory(self, required):
  if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return
@@ -520,30 +591,31 @@ class NVDevice(HCQCompiled[NVSignal]):
 
  cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
  .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
- .signal(self.timeline_signal, self.timeline_value).submit(self)
- self.timeline_value += 1
+ .signal(self.timeline_signal, self.next_timeline()).submit(self)
 
  def invalidate_caches(self):
- rmctrl.fb_flush_gpu_cache(self.fd_ctl, self.root, self.subdevice,
- flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
- (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4)))
+ if self.is_nvd(): self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_INTERNAL_BUS_FLUSH_WITH_SYSMEMBAR, None)
+ else:
+ self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_FB_FLUSH_GPU_CACHE, nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_PARAMS(
+ flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
+ (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4))))
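The flags expression packs three NV2080 FB-flush fields at bit offsets 2, 3 and 4. A purely illustrative helper; the values passed in below are placeholders, not the real NV2080 constants:

```python
# Purely illustrative: the FB-flush flags above are three fields OR'd together at
# bit offsets 2, 3 and 4. The arguments are placeholders, not the real constants.
def pack_flush_flags(write_back:int, invalidate:int, flush_mode:int) -> int:
  return (write_back << 2) | (invalidate << 3) | (flush_mode << 4)

# if every field value happened to be 1, the packed word would be 0b11100
assert pack_flush_flags(1, 1, 1) == 0b11100
```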
 
  def on_device_hang(self):
  # Prepare fault report.
  # TODO: Restore the GPU using NV83DE_CTRL_CMD_CLEAR_ALL_SM_ERROR_STATES if needed.
 
  report = []
- sm_errors = rmctrl.debug_read_all_sm_error_states(self.fd_ctl, self.root, self.debugger, hTargetChannel=self.debug_channel, numSMsToRead=100)
+ sm_errors = self.iface.rm_control(self.debugger, nv_gpu.NV83DE_CTRL_CMD_DEBUG_READ_ALL_SM_ERROR_STATES,
+ nv_gpu.NV83DE_CTRL_DEBUG_READ_ALL_SM_ERROR_STATES_PARAMS(hTargetChannel=self.debug_channel, numSMsToRead=100))
 
  if sm_errors.mmuFault.valid:
- mmu_info = rmctrl.debug_read_mmu_fault_info(self.fd_ctl, self.root, self.debugger)
- for i in range(mmu_info.count):
- pfinfo = mmu_info.mmuFaultInfoList[i]
+ mmu = self.iface.rm_control(self.debugger, nv_gpu.NV83DE_CTRL_CMD_DEBUG_READ_MMU_FAULT_INFO,
+ nv_gpu.NV83DE_CTRL_DEBUG_READ_MMU_FAULT_INFO_PARAMS())
+ for i in range(mmu.count):
+ pfinfo = mmu.mmuFaultInfoList[i]
  report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
- if DEBUG >= 5:
- report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]
  else:
  for i, e in enumerate(sm_errors.smErrorStateArray):
- if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
+ if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr:#x} warp_pc={e.hwwWarpEsrPc64:#x}"]
 
549
621
  raise RuntimeError("\n".join(report))