tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (74)
  1. tinygrad/__init__.py +11 -6
  2. tinygrad/codegen/kernel.py +308 -175
  3. tinygrad/codegen/linearize.py +95 -0
  4. tinygrad/codegen/lowerer.py +143 -0
  5. tinygrad/codegen/transcendental.py +257 -0
  6. tinygrad/codegen/uopgraph.py +506 -0
  7. tinygrad/device.py +72 -171
  8. tinygrad/dtype.py +122 -47
  9. tinygrad/engine/jit.py +184 -87
  10. tinygrad/{lazy.py → engine/lazy.py} +74 -66
  11. tinygrad/engine/memory.py +51 -0
  12. tinygrad/engine/realize.py +86 -61
  13. tinygrad/engine/schedule.py +366 -317
  14. tinygrad/engine/search.py +58 -47
  15. tinygrad/function.py +59 -58
  16. tinygrad/helpers.py +120 -102
  17. tinygrad/multi.py +82 -78
  18. tinygrad/nn/__init__.py +116 -67
  19. tinygrad/nn/datasets.py +12 -5
  20. tinygrad/nn/optim.py +1 -1
  21. tinygrad/nn/state.py +91 -6
  22. tinygrad/ops.py +1126 -143
  23. tinygrad/renderer/__init__.py +47 -23
  24. tinygrad/renderer/cstyle.py +338 -265
  25. tinygrad/renderer/llvmir.py +125 -143
  26. tinygrad/renderer/ptx.py +225 -0
  27. tinygrad/runtime/autogen/adreno.py +17904 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
  29. tinygrad/runtime/autogen/cuda.py +6 -162
  30. tinygrad/runtime/autogen/io_uring.py +97 -63
  31. tinygrad/runtime/autogen/kfd.py +60 -47
  32. tinygrad/runtime/autogen/kgsl.py +1386 -0
  33. tinygrad/runtime/autogen/libc.py +5462 -0
  34. tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  35. tinygrad/runtime/autogen/nvrtc.py +579 -0
  36. tinygrad/runtime/autogen/opencl.py +11 -11
  37. tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  38. tinygrad/runtime/graph/clang.py +3 -3
  39. tinygrad/runtime/graph/cuda.py +11 -15
  40. tinygrad/runtime/graph/hcq.py +120 -107
  41. tinygrad/runtime/graph/metal.py +71 -43
  42. tinygrad/runtime/ops_amd.py +244 -323
  43. tinygrad/runtime/ops_clang.py +12 -5
  44. tinygrad/runtime/ops_cloud.py +220 -0
  45. tinygrad/runtime/ops_cuda.py +42 -99
  46. tinygrad/runtime/ops_disk.py +25 -26
  47. tinygrad/runtime/ops_dsp.py +181 -0
  48. tinygrad/runtime/ops_gpu.py +29 -16
  49. tinygrad/runtime/ops_hip.py +68 -0
  50. tinygrad/runtime/ops_llvm.py +15 -10
  51. tinygrad/runtime/ops_metal.py +147 -64
  52. tinygrad/runtime/ops_nv.py +356 -397
  53. tinygrad/runtime/ops_python.py +78 -79
  54. tinygrad/runtime/ops_qcom.py +405 -0
  55. tinygrad/runtime/support/__init__.py +0 -0
  56. tinygrad/runtime/support/compiler_cuda.py +77 -0
  57. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
  58. tinygrad/runtime/support/elf.py +38 -0
  59. tinygrad/runtime/support/hcq.py +539 -0
  60. tinygrad/shape/shapetracker.py +40 -50
  61. tinygrad/shape/view.py +102 -63
  62. tinygrad/tensor.py +1109 -365
  63. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
  64. tinygrad-0.10.0.dist-info/RECORD +77 -0
  65. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
  66. tinygrad/codegen/linearizer.py +0 -528
  67. tinygrad/codegen/uops.py +0 -451
  68. tinygrad/engine/graph.py +0 -100
  69. tinygrad/renderer/assembly.py +0 -269
  70. tinygrad/shape/symbolic.py +0 -327
  71. tinygrad-0.9.1.dist-info/RECORD +0 -63
  72. /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
  73. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
  74. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,24 @@
  from __future__ import annotations
- import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
- from typing import Tuple, List, Any
+ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
+ assert sys.platform != 'win32'
+ from typing import Tuple, List, Any, cast, Union, Dict, Type
  from dataclasses import dataclass
- from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
- from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
+ from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
+ from tinygrad.device import BufferOptions
+ from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+ from tinygrad.renderer.ptx import PTXRenderer
  from tinygrad.renderer.cstyle import NVRenderer
- from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
- import tinygrad.runtime.autogen.cuda as cuda
- import tinygrad.runtime.autogen.nv_gpu as nv_gpu
- if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
- libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
- libc.munmap.restype = ctypes.c_int
-
- if MOCKGPU:=getenv("MOCKGPU"):
- import extra.mockgpu.mockgpu # noqa: F401
- libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
- libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+ from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
+ from tinygrad.runtime.autogen import nv_gpu, libc
+ from tinygrad.runtime.support.elf import elf_loader
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
+ if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
+
+ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
+
+ NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
+ NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}
 
  def nv_iowr(fd, nr, args):
  ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
@@ -27,117 +26,105 @@ def nv_iowr(fd, nr, args):
27
26
 
28
27
  def rm_alloc(fd, clss, root, parant, params):
29
28
  made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
30
- pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
29
+ pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
31
30
  nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
32
- if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
31
+ if made.status != 0:
32
+ if made.status == nv_gpu.NV_ERR_NO_MEMORY: raise MemoryError(f"rm_alloc returned {get_error_str(made.status)}")
33
+ raise RuntimeError(f"rm_alloc returned {get_error_str(made.status)}")
33
34
  return made
34
35
 
35
- def rm_control(fd, cmd, client, obj, params):
36
- made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
37
- params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
36
+ def rm_control(cmd, sttyp, fd, client, obj, **kwargs):
37
+ made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params:=sttyp(**kwargs)),
38
+ params=ctypes.cast(ctypes.byref(params), ctypes.c_void_p) if params is not None else None)
38
39
  nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
39
- if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
40
- return made
40
+ if made.status != 0: raise RuntimeError(f"rm_control returned {get_error_str(made.status)}")
41
+ return params
42
+
43
+ def make_rmctrl_type():
44
+ return type("NVRMCTRL", (object,), {name[name.find("_CTRL_CMD_")+10:].lower(): functools.partial(rm_control, dt, sttyp)
45
+ for name,dt in nv_gpu.__dict__.items() if name.find("_CTRL_CMD_")>=0 and (sttyp:=getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_")+"_PARAMS", \
46
+ getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
47
+ rmctrl = make_rmctrl_type()
41
48
 
42
49
  def uvm_ioctl(cmd, sttyp, fd, **kwargs):
43
50
  ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
44
51
  if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
45
- if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {made.rmStatus}: {nv_gpu.nv_status_codes.get(made.rmStatus, 'Unknown error')}")
52
+ if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
46
53
  return made
47
54
 
48
55
  def make_uvm_type():
49
- fxns = {name.replace("UVM_", "").lower():
50
- functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
51
- for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
52
- return type("NVUVM", (object, ), fxns)
56
+ return type("NVUVM", (object,), {name.replace("UVM_", "").lower(): functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
57
+ for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
53
58
  uvm = make_uvm_type()
54
59
 
55
60
  def make_qmd_struct_type():
56
- fields = []
61
+ fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
57
62
  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
58
63
  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
59
64
  bits = sorted(bits, key=lambda x: x[1][1])
60
65
  for i,(name, data) in enumerate(bits):
61
- if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
66
+ if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
62
67
  fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
68
+ if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
69
+ fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
63
70
  return init_c_struct_t(tuple(fields))
64
71
  qmd_struct_t = make_qmd_struct_type()
65
72
  assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
66
73
 
67
74
  def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
68
- def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
69
- def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
70
-
71
- class NVCompiler(Compiler):
72
- def __init__(self, arch:str):
73
- self.arch = arch
74
- #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
75
- cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
76
- self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
77
- if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
78
- super().__init__(f"compile_nv_{self.arch}")
79
- def compile(self, src:str) -> bytes:
80
- cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
81
- status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
82
-
83
- if status != 0:
84
- raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
85
- return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
86
-
87
- class HWQueue:
88
- def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
89
- def __del__(self):
90
- if self.binded_device is not None:
91
- self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
92
- self.binded_device._gpu_free(self.hw_page)
93
75
 
94
- def _mark_command_end(self):
95
- self.cmd_offsets.append(len(self.q))
96
- return self
97
- def __len__(self): return len(self.cmd_offsets) - 1
98
-
99
- def memory_barrier(self): return self._mark_command_end()
100
-
101
- def wait(self, signal, value=0):
102
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
76
+ class NVSignal(HCQSignal):
77
+ def __init__(self, value=0, is_timeline=False):
78
+ self._signal = NVDevice.signals_pool.pop()
79
+ self.signal_addr = mv_address(self._signal)
80
+ super().__init__(value)
81
+ def __del__(self): NVDevice.signals_pool.append(self._signal)
82
+ def _get_value(self) -> int: return self._signal[0]
83
+ def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
84
+ def _set_value(self, new_value:int): self._signal[0] = new_value
85
+
86
+ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
87
+ def __del__(self):
88
+ if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
89
+
90
+ @hcq_command
91
+ def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
92
+ if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
93
+ if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
94
+ if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
95
+ if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
96
+ if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
97
+ if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff]
98
+
99
+ def _wait(self, signal, value=0):
100
+ self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
103
101
  (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
104
- return self._mark_command_end()
105
102
 
106
- def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)
103
+ def _update_wait(self, cmd_idx, signal=None, value=None):
104
+ if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
105
+ if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))
107
106
 
108
- def signal(self, signal, value=0, timestamp=False):
109
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
110
- (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
111
- self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
112
- return self._mark_command_end()
107
+ def _timestamp(self, signal): return self._signal(signal, 0)
113
108
 
114
- def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
115
- def update_wait(self, cmd_idx, signal=None, value=None):
116
- if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
117
- if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
118
- return self
119
-
120
- def bind(self, device: NVDevice):
109
+ def bind(self, device):
121
110
  self.binded_device = device
122
- self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
123
- hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
111
+ self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
112
+ hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
124
113
  for i, value in enumerate(self.q): hw_view[i] = value
125
114
 
126
115
  # From now on, the queue is on the device for faster submission.
127
116
  self.q = hw_view # type: ignore
128
117
 
129
- def _submit(self, dev, gpfifo:GPFifo):
130
- if len(self.q) == 0: return
131
-
132
- if dev == self.binded_device: cmdq_addr = self.hw_page.base
118
+ def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
119
+ if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
133
120
  else:
134
- if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
135
- assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
121
+ if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
122
+ assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
136
123
  gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
137
124
  dev.cmdq_wptr = 0
138
125
 
139
126
  dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
140
- cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
127
+ cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
141
128
  dev.cmdq_wptr += len(self.q) * 4
142
129
 
143
130
  gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
@@ -145,37 +132,26 @@ class HWQueue:
  dev.gpu_mmio[0x90 // 4] = gpfifo.token
  gpfifo.put_value += 1
 
- class HWComputeQueue(HWQueue):
+ class NVComputeQueue(NVCommandQueue, HWComputeQueue):
  def __init__(self):
+ self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
  super().__init__()
- self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}
 
- def copy_from_cpu(self, gpuaddr, data):
- self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
- self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
- self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
- self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
- return self._mark_command_end()
+ def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
 
- def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0):
- ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
- self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
- self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
- self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+ def _exec(self, prg, args_state, global_size, local_size):
+ ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+ assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"
+
+ self.cmd_idx_to_qmd[self._cur_cmd_idx()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+ self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+ self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
 
  qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
  qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
- qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
- qmd.constant_buffer_addr_upper_0 = kernargs >> 32
- if signal is not None:
- qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
- qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
- qmd.release0_payload_lower = signal_value & 0xffffffff
- qmd.release0_payload_upper = signal_value >> 32
- qmd.release0_enable = 1
-
- if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
- self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+ qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
+
+ if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is None:
  self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
  self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
  else:
@@ -183,180 +159,146 @@ class HWComputeQueue(HWQueue):
183
159
  prev_qmd.dependent_qmd0_action = 1
184
160
  prev_qmd.dependent_qmd0_prefetch = 1
185
161
  prev_qmd.dependent_qmd0_enable = 1
186
- return self._mark_command_end()
187
162
 
188
- def update_exec(self, cmd_idx, global_size, local_size):
163
+ def _update_exec(self, cmd_idx, global_size, local_size):
189
164
  # Patch the exec cmd with new launch dims
190
- self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
191
- self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
165
+ if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
166
+ if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
167
+
168
+ def _signal(self, signal, value=0):
169
+ if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
170
+ for i in range(2):
171
+ if getattr(prev_qmd, f'release{i}_enable') == 0:
172
+ setattr(prev_qmd, f'release{i}_enable', 1)
173
+ setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
174
+ setattr(prev_qmd, f'release{i}_payload', value)
175
+ self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
176
+ self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
177
+ return
178
+
179
+ self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
180
+ (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
181
+ self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
182
+
183
+ def _update_signal(self, cmd_idx, signal=None, value=None):
184
+ if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
185
+ if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
186
+ if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)
192
187
 
193
- def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)
188
+ def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
194
189
 
195
- class HWCopyQueue(HWQueue):
196
- def copy(self, dest, src, copy_size):
197
- self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
190
+ class NVCopyQueue(NVCommandQueue, HWCopyQueue):
191
+ def _copy(self, dest, src, copy_size):
192
+ self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
198
193
  self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
199
194
  self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
200
- return self._mark_command_end()
201
195
 
202
- def signal(self, signal, value=0):
203
- self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
196
+ def _update_copy(self, cmd_idx, dest=None, src=None):
197
+ if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
198
+ if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
199
+
200
+ def _signal(self, signal, value=0):
201
+ self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
204
202
  self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
205
- return self._mark_command_end()
206
203
 
207
- def update_signal(self, cmd_idx, signal=None, value=None):
208
- if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
209
- if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
210
- return self
204
+ def _update_signal(self, cmd_idx, signal=None, value=None):
205
+ if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
206
+ if value is not None: self._patch(cmd_idx, offset=3, data=[value])
207
+
208
+ def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)
211
209
 
212
- def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)
210
+ class NVArgsState(HCQArgsState):
211
+ def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
212
+ super().__init__(ptr, prg, bufs, vals=vals)
213
213
 
214
- SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
215
- class NVProgram:
214
+ if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
215
+ kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
216
+ to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
217
+ self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
218
+ self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
219
+
220
+ def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
221
+ def update_var(self, index:int, val:int): self.vals[index] = val
222
+
223
+ class NVProgram(HCQProgram):
216
224
  def __init__(self, device:NVDevice, name:str, lib:bytes):
217
225
  self.device, self.name, self.lib = device, name, lib
218
- if DEBUG >= 6:
219
- try:
220
- fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
221
- with open(fn + ".cubin", "wb") as f: f.write(lib)
222
- print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
223
- except Exception as e: print("failed to disasm cubin", str(e))
224
-
225
- self.rel_info, self.global_init, self.shmem_usage = None, None, 0
226
- constant_buffers_data = {}
227
-
228
- if MOCKGPU:
229
- self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
230
- constant_buffers_data[0] = memoryview(bytearray(0x190))
231
- else:
232
- _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
233
- sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
234
- shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
235
- for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
236
- section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
237
- if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
238
- elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
239
- self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
240
- self.registers_usage = sh_info >> 24
241
- if match := re.match(r'\.nv\.constant(\d+)', section_name):
242
- constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
243
- if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
244
- elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
245
- elif section_name == ".nv.info":
246
- section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
247
- for i in range(sh_size // 12):
248
- if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
249
- raise RuntimeError("too high local memory")
250
226
 
251
- # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
252
- self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
227
+ if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
228
+ else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
253
229
 
254
- # Load program and constant buffers (if any)
255
230
  # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
256
- self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
257
- round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
258
- self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
231
+ self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
232
+
233
+ self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
234
+ self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
235
+ for sh in sections:
236
+ if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
237
+ if sh.name == f".text.{self.name}":
238
+ self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16)
239
+ elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
240
+ elif sh.name == ".nv.info":
241
+ for off in range(0, sh.header.sh_size, 12):
242
+ typ, _, val = struct.unpack_from("III", sh.content, off)
243
+ if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240
244
+
245
+ # Ensure device has enough local memory to run the program
246
+ self.device._ensure_has_local_memory(self.lcmem_usage)
247
+
248
+ # Apply relocs
249
+ for apply_image_offset, rel_sym_offset, typ, _ in relocs:
250
+ # These types are CUDA-specific, applying them here
251
+ if typ == 2: image[apply_image_offset:apply_image_offset+8] = struct.pack('<Q', self.lib_gpu.va_addr + rel_sym_offset) # R_CUDA_64
252
+ elif typ == 0x38: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) & 0xffffffff)
253
+ elif typ == 0x39: image[apply_image_offset+4:apply_image_offset+8] = struct.pack('<I', (self.lib_gpu.va_addr + rel_sym_offset) >> 32)
254
+ else: raise RuntimeError(f"unknown NV reloc {typ}")
255
+
256
+ ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
259
257
 
260
258
  self.constbuffer_0 = [0] * 88
261
- self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
259
+ self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]
262
260
 
263
- smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
261
+ smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
264
262
  self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
265
263
  invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
266
- cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
267
- shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
268
- max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
269
- barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
270
- program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
271
- program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
272
- constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
273
-
274
- # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
275
- self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
276
- self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
277
- self.kernargs_offset = 0x160
278
-
279
- # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
280
- if 0 in constant_buffers_data: constant_buffers_data.pop(0)
281
-
282
- off = round_up(self.program.nbytes, 128)
283
-
284
- if self.rel_info is not None:
285
- assert self.global_init is not None
286
- global_init_addr = self.lib_gpu.base + off
287
- for rel_i in range(0, len(self.rel_info), 4):
288
- if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
289
- elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
290
- else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
291
-
292
- HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
293
- for st in range(0, len(self.program), 4095):
294
- HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
295
-
296
- if self.global_init is not None:
297
- HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
298
- off += round_up(self.global_init.nbytes, 128)
299
- if 4 in constant_buffers_data: # >= 12.4
300
- # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
301
- assert constant_buffers_data[4].nbytes == 8
302
- constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
303
-
304
- for i,data in constant_buffers_data.items():
305
- self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
306
- self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
307
- self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
264
+ cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
265
+ shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
266
+ max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
267
+ barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
268
+ program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
269
+
270
+ for i,(addr,sz) in self.constbufs.items():
271
+ self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32)
272
+ self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (addr) & 0xffffffff)
273
+ self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
308
274
  self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
309
275
 
310
- HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
311
- off += round_up(data.nbytes, 128)
276
+ # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
277
+ self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32
312
278
 
313
- HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
314
- self.device.timeline_value += 1
315
- self.device.synchronize()
279
+ # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
280
+ super().__init__(NVArgsState, self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
316
281
 
317
282
  def __del__(self):
318
- if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)
283
+ if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
319
284
 
320
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
321
- if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
285
+ def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
286
+ if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.device).slm_per_thread:
287
+ raise RuntimeError("Too many resources requested for launch")
322
288
  if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
323
- raise RuntimeError("Invalid global/local dims")
289
+ raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
290
+ return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
324
291
 
325
- if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
326
- self.device.kernargs_ptr = self.device.kernargs_page.base
327
-
328
- # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
329
- if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
330
- kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
331
-
332
- sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
333
-
334
- queue = HWComputeQueue()
335
- queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
336
- if wait or PROFILE: queue.timestamp(sig_st)
337
- queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
338
- queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
339
- if wait or PROFILE: queue.timestamp(sig_en)
340
- queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
341
- self.device.timeline_value += 1
342
- self.device.kernargs_ptr += self.kernargs_alloc_size
343
-
344
- if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
345
- if wait:
346
- self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
347
- return (sig_en[1] - sig_st[1]) / 1e9
348
-
349
- class NVAllocator(HCQCompatAllocator):
350
- def __init__(self, device:NVDevice): super().__init__(device)
351
-
352
- def _alloc(self, size:int, options:BufferOptions):
353
- if options.host: return self.device._gpu_host_alloc(size)
354
- return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
292
+ class NVAllocator(HCQAllocator):
293
+ def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
294
+ if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
295
+ return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")
355
296
 
356
297
  def _free(self, opaque, options:BufferOptions):
357
298
  self.device.synchronize()
358
- if options.host: self.device._gpu_host_free(opaque)
359
- else: self.device._gpu_free(opaque)
299
+ self.device._gpu_free(opaque)
300
+
301
+ def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
360
302
 
361
303
  @dataclass
362
304
  class GPFifo:
@@ -367,19 +309,19 @@ class GPFifo:
  put_value: int = 0
 
  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(HCQCompatCompiled):
+ class NVDevice(HCQCompiled):
  root = None
  fd_ctl: int = -1
  fd_uvm: int = -1
- gpus_info = None
- signals_page:Any = None
+ gpus_info: Union[List, ctypes.Array] = []
+ signals_page: Any = None
  signals_pool: List[Any] = []
- uvm_vaddr: int = 0x1000000000
+ low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+ uvm_vaddr: int = 0x2000000000 # 0x2000000000+
  host_object_enumerator: int = 0x1000
- devices: List[NVDevice] = []
 
  def _new_gpu_fd(self):
- fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+ fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
  nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
  return fd_dev
 
@@ -388,10 +330,12 @@ class NVDevice(HCQCompatCompiled):
  made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
  params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
- if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
- return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+ if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
+ res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+ os.close(fd_dev)
+ return res
 
- def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
+ def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
  size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
  alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
  attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
@@ -402,11 +346,11 @@ class NVDevice(HCQCompatCompiled):
  nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
  mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
 
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
+ if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
  if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
- return self._gpu_uvm_map(va_addr, size, mem_handle)
+ return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
 
- def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
+ def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
  alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
  attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
  attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
@@ -414,54 +358,62 @@ class NVDevice(HCQCompatCompiled):
414
358
  nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
415
359
  mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
416
360
 
417
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
361
+ if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
418
362
  if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
419
363
 
420
- return self._gpu_uvm_map(va_addr, size, mem_handle)
364
+ return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
421
365
 
422
- def _gpu_host_alloc(self, size):
423
- va_base = self._alloc_gpu_vaddr(sz:=round_up(size, 4 << 10))
424
- libc.mmap(va_base, sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
425
- return self._map_to_gpu(va_base, sz)
366
+ def _gpu_host_alloc(self, size, tag=""):
367
+ va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
368
+ mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
369
+ assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
426
370
 
427
- def _gpu_free(self, mem):
428
- made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
429
- nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
430
- if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
431
- uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
432
-
433
- def _gpu_host_free(self, mem):
434
- uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
435
- libc.munmap(mem.base, mem.length)
436
-
437
- def _map_to_gpu(self, va_base, size):
438
371
  NVDevice.host_object_enumerator += 1
439
372
  flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
440
373
  (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
441
374
  made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
442
- hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1), fd=-1)
375
+ hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
443
376
  nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
444
- if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
445
- return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)
446
377
 
447
- def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
378
+ if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
379
+ return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
380
+
381
+ def _gpu_free(self, mem):
382
+ if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
383
+ nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
384
+ if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
385
+
386
+ self._debug_mappings.pop((mem.va_addr, mem.size))
387
+ uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
388
+ if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
389
+
390
+ def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
448
391
  if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
449
- gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
450
- nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
392
+ attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
451
393
 
452
- # NOTE: va_addr is set to make rawbufs compatable with AMD.
394
+ # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
395
+ self._debug_mappings[(va_base, size)] = tag
453
396
  return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
454
- gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size)
397
+ gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
455
398
 
456
399
  def _gpu_map(self, mem):
457
- if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
458
- mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
459
- return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
460
-
461
- def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
462
- NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
400
+ if self.gpu_uuid in mem.mapped_gpu_ids: return
401
+ mem.mapped_gpu_ids.append(self.gpu_uuid)
402
+ self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
403
+
404
+ def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
405
+ if force_low:
406
+ NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
407
+ assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
408
+ else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
463
409
  return res_va
464
410
 
411
+ def _setup_nvclasses(self):
412
+ classlist = memoryview(bytearray(100 * 4)).cast('I')
413
+ clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
414
+ self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
415
+ self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
416
+
465
417
  def __init__(self, device:str=""):
466
418
  if NVDevice.root is None:
467
419
  NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
@@ -471,155 +423,162 @@ class NVDevice(HCQCompatCompiled):
471
423
  uvm.initialize(self.fd_uvm)
472
424
  with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
473
425
 
474
- NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
475
- nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
426
+ nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
427
+ visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
428
+ NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
476
429
 
477
- # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
478
430
  self.device_id = int(device.split(":")[1]) if ":" in device else 0
479
- self.fd_dev = self._new_gpu_fd()
480
431
 
481
- assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
482
- gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
483
- rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
484
- device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
485
- self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A
432
+ if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
433
+ raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
486
434
 
487
- device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
435
+ self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
436
+ self.gpu_minor = NVDevice.gpus_info[self.device_id].minor_number
437
+ self.fd_dev = self._new_gpu_fd()
438
+
439
+ device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
488
440
  vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
489
441
  self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
490
442
  self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
491
443
  self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
492
444
  self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
493
445
 
494
- boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
446
+ self._setup_nvclasses()
447
+ self._debug_mappings: Dict[Tuple[int, int], str] = dict()
448
+
449
+ rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
495
450
  (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
496
- rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)

  vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
  flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
  vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew

- gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
- rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
- self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
+ raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
+ self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))

- uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
- uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
- hClient=self.root, hVaSpace=vaspace)
+ uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
+ uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)

- for dev in self.devices:
- uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))
+ for dev in cast(List[NVDevice], self.devices):
+ try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
+ except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e

  if NVDevice.signals_page is None:
  NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
- NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
+ NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
  else: self._gpu_map(NVDevice.signals_page)

  channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
  channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

- gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
+ gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")

  ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
  ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

- self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
+ self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, enable_debug=True)
  self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)

- en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
- rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
-
- self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
+ rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

- self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
- self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
+ self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
+ self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
  self.cmdq_wptr: int = 0 # in bytes

- self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
- self.kernargs_ptr: int = self.kernargs_page.base
+ self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
+ 'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
+ self.arch: str = f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"

- self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
+ compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
+ super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
+ functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)

- super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
- functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])
+ self._setup_gpfifos()

- self._cmdq_setup_compute_gpfifo()
- self._cmdq_setup_dma_gpfifo()
-
- NVDevice.devices.append(self)
+ def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
+ notifier = self._gpu_system_alloc(48 << 20)
+ params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+ gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+ hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+ gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
+ comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
+ rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)

- @classmethod
- def _read_signal(self, sig): return sig[0]
+ if enable_debug:
+ self.debug_compute_obj, self.debug_channel = comp, gpfifo
+ debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
+ self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.device, debugger_params).hObjectNew

- @classmethod
- def _read_timestamp(self, sig): return sig[1]
+ ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
+ assert ws_token_params.workSubmitToken != -1

- @classmethod
- def _set_signal(self, sig, value): sig[0] = value
+ channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
+ uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+ hChannel=gpfifo, base=channel_base, length=0x4000000)

- @classmethod
- def _get_signal(self, value=0, **kwargs) -> memoryview:
- self._set_signal(sig := self.signals_pool.pop(), value)
- return sig
+ return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+ controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))
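_new_gpu_fifo carves both the GPFIFO ring and its USERD control block out of the shared gpfifo_area, with USERD placed immediately after the ring (userdOffset = entries*8 + offset). A quick sanity check of that layout with the values the constructor passes in (0x10000 entries, offsets 0 and 0x100000, a 0x200000-byte area):

entries = 0x10000                                    # entry count used by __init__ above
for off in (0x0, 0x100000):                          # compute fifo, then dma fifo
  userd_off = off + entries * 8                      # the ring is entries*8 = 0x80000 bytes; USERD starts right after it
  assert userd_off < off + 0x100000 <= 0x200000      # each queue's USERD starts inside its half of the gpfifo_area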

- @classmethod
- def _wait_signal(self, signal, value=0, timeout=10000):
- start_time = time.time() * 1000
- while time.time() * 1000 - start_time < timeout:
- if signal[0] >= value: return
- raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+ def _query_gpu_info(self, *reqs):
+ nvrs = [getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(nv_gpu,'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(),None)) for r in reqs]
+ infos = (nv_gpu.NV2080_CTRL_GR_INFO*len(nvrs))(*[nv_gpu.NV2080_CTRL_GR_INFO(index=nvr) for nvr in nvrs])
+ rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=len(infos), grInfoList=ctypes.addressof(infos))
+ return [x.data for x in infos]
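_query_gpu_info resolves each requested name against two possible NV2080_CTRL_GR_INFO_INDEX_* spellings; the inner getattr supplies the LITTER_-prefixed fallback. A minimal standalone illustration of that lookup pattern (the namespace and numeric values below are dummies standing in for the real nv_gpu constants):

from types import SimpleNamespace
ns = SimpleNamespace(NV2080_CTRL_GR_INFO_INDEX_SM_VERSION=1,         # dummy values, only the lookup pattern matters
                     NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS=2)
def resolve(r): return getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_'+r.upper(), getattr(ns, 'NV2080_CTRL_GR_INFO_INDEX_LITTER_'+r.upper(), None))
assert resolve('sm_version') == 1    # found under the plain prefix
assert resolve('num_gpcs') == 2      # falls back to the LITTER_ prefix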

- def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
+ def _setup_gpfifos(self):
+ # Set windows addresses to not collide with other allocated buffers.
+ self.shared_mem_window, self.local_mem_window, self.slm_per_thread, self.shader_local_mem = 0xfe000000, 0xff000000, 0, None

- def synchronize(self):
- NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
- self.cmdq_wptr = 0
+ NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
+ .signal(self.timeline_signal, self.timeline_value).submit(self)

- if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
- if PROFILE: self._prof_process_events()
+ NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
+ .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+ .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

- def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
- notifier = self._gpu_system_alloc(48 << 20)
- params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
- gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
- hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
- gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
- rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
- rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+ self.timeline_value += 2
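The two setup submissions above share one timeline: the compute queue signals timeline_value, the copy queue waits on that value before signalling timeline_value + 1, and the counter is then bumped by two. A host-side wait on such a 64-bit timeline slot could look roughly like the _wait_signal helper this diff removes (a sketch, not the 0.10.0 signal API):

import time
def host_wait(signal_mem, value, timeout_ms=10000):     # signal_mem: one 16-byte slot from signals_pool, cast to "Q"
  deadline = time.monotonic() + timeout_ms / 1000
  while signal_mem[0] < value:                          # the GPU writes the released value into the first u64
    if time.monotonic() > deadline: raise RuntimeError(f"wait_result: {timeout_ms} ms TIMEOUT!")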

- ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
- rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
- assert ws_token_params.workSubmitToken != -1
+ def _ensure_has_local_memory(self, required):
+ if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

- channel_base = self._alloc_gpu_vaddr(0x4000000)
- uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
- hChannel=gpfifo, base=channel_base, length=0x4000000)
+ if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)

- return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
- controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))
+ self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
+ bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)

- def _cmdq_setup_compute_gpfifo(self):
- self.slm_per_thread = 0x900
- bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
- bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
- self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base
+ try: self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+ except MemoryError:
+ # If can't allocate a new size, reallocator the old buffer.
+ self.slm_per_thread = old_slm_per_thread
+ bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+ self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
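The sizing above scales per-thread shader local memory up to a warp, then to every warp on every SM of a TPC, and finally to the whole chip. A worked pass through that arithmetic with assumed hardware counts (48 warps/SM and 2 SMs/TPC, the same factors the removed hard-coded path used):

def round_up(x, a): return (x + a - 1) // a * a                  # behaves like the round_up helper the code imports
required, max_warps_per_sm, num_sm_per_tpc = 0x900, 48, 2        # example/assumed values
slm_per_thread = round_up(required, 32)                          # 0x900
per_warp = round_up(slm_per_thread * 32, 0x200)                  # 0x12000
bytes_per_tpc = round_up(per_warp * max_warps_per_sm * num_sm_per_tpc, 0x8000)   # 0x6c0000
# the final allocation is bytes_per_tpc * num_tpc_per_gpc * num_gpcs, rounded up to 0x20000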

- # Set windows addresses to not collide with other allocated buffers.
- self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000
-
- queue = HWComputeQueue()
- queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
- queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
- queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
- queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
- queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
- queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+ NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
+ .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+ .signal(self.timeline_signal, self.timeline_value).submit(self)
  self.timeline_value += 1
- self.synchronize()

- def _cmdq_setup_dma_gpfifo(self):
- queue = HWCopyQueue()
- queue.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
- queue.signal(self.timeline_signal, self.timeline_value).submit(self)
- self.timeline_value += 1
- self.synchronize()
+ def invalidate_caches(self):
+ rmctrl.fb_flush_gpu_cache(self.fd_ctl, self.root, self.subdevice,
+ flags=((nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_WRITE_BACK_YES << 2) | (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_INVALIDATE_YES << 3) |
+ (nv_gpu.NV2080_CTRL_FB_FLUSH_GPU_CACHE_FLAGS_FLUSH_MODE_FULL_CACHE << 4)))
+
+ def on_device_hang(self):
+ # Prepare fault report.
+ # TODO: Restore the GPU using NV83DE_CTRL_CMD_CLEAR_ALL_SM_ERROR_STATES if needed.
+
+ report = []
+ sm_errors = rmctrl.debug_read_all_sm_error_states(self.fd_ctl, self.root, self.debugger, hTargetChannel=self.debug_channel, numSMsToRead=100)
+
+ if sm_errors.mmuFault.valid:
+ mmu_info = rmctrl.debug_read_mmu_fault_info(self.fd_ctl, self.root, self.debugger)
+ for i in range(mmu_info.count):
+ pfinfo = mmu_info.mmuFaultInfoList[i]
+ report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
+ if DEBUG >= 5:
+ report += ["GPU mappings:\n"+"\n".join(f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings))]
+ else:
+ for i, e in enumerate(sm_errors.smErrorStateArray):
+ if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM {i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
+
+ raise RuntimeError("\n".join(report))