tinygrad 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
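
The largest addition is tinygrad/runtime/ops_nv.py, shown below: a userspace driver backend that submits work to NVIDIA GPUs directly through the /dev/nvidia* and /dev/nvidia-uvm ioctl interfaces, bypassing CUDA's userspace libraries. A minimal smoke test of the 0.9.1 wheel, as a sketch assuming an NVIDIA GPU and that the NV=1 environment variable selects this backend:

    # sketch: exercise the new NV backend; NV=1 is assumed to force device selection
    import os
    os.environ["NV"] = "1"   # set before tinygrad picks a default device
    from tinygrad import Tensor

    a, b = Tensor.rand(64, 64), Tensor.rand(64, 64)
    print((a @ b).numpy().shape)  # (64, 64)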
tinygrad/runtime/ops_nv.py (new file, +625 lines)
@@ -0,0 +1,625 @@
+ from __future__ import annotations
+ import os, ctypes, ctypes.util, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
+ from typing import Tuple, List, Any
+ from dataclasses import dataclass
+ from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+ from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
+ from tinygrad.renderer.cstyle import NVRenderer
+ from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
+ import tinygrad.runtime.autogen.cuda as cuda
+ import tinygrad.runtime.autogen.nv_gpu as nv_gpu
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
+
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
+ libc.mmap.restype = ctypes.c_void_p
+ libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+ libc.munmap.restype = ctypes.c_int
+
+ if MOCKGPU:=getenv("MOCKGPU"):
+   import extra.mockgpu.mockgpu # noqa: F401
+   libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
+   libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+
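+ # Hand-encodes a Linux _IOWR ioctl request: direction bits (3 = read|write) at the top, struct size in the middle, magic 'F', command nr.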
+ def nv_iowr(fd, nr, args):
+   ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
+   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
+
+ def rm_alloc(fd, clss, root, parent, params):
+   made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parent, hClass=clss,
+     pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+   nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
+   if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
+   return made
+
+ def rm_control(fd, cmd, client, obj, params):
+   made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
+     params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+   nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
+   if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
+   return made
+
+ def uvm_ioctl(cmd, sttyp, fd, **kwargs):
+   ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
+   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
+   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {made.rmStatus}: {nv_gpu.nv_status_codes.get(made.rmStatus, 'Unknown error')}")
+   return made
+
+ def make_uvm_type():
+   fxns = {name.replace("UVM_", "").lower():
+           functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
+           for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
+   return type("NVUVM", (object, ), fxns)
+ uvm = make_uvm_type()
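+ # Every UVM_* ioctl with a matching *_PARAMS struct becomes a method, e.g. UVM_INITIALIZE -> uvm.initialize(fd, **fields).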
+
+ def make_qmd_struct_type():
+   fields = []
+   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
+   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
+   bits = sorted(bits, key=lambda x: x[1][1])
+   for i,(name, data) in enumerate(bits):
+     if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
+     fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
+   return init_c_struct_t(tuple(fields))
+ qmd_struct_t = make_qmd_struct_type()
+ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
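+ # The QMD (launch descriptor) is 0x40 dwords; its bitfields are generated from the NVC6C0_QMDV03_00_* header constants.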
+
+ def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
+ def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
+ def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
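+ # Pushbuffer method header: typ=2 emits an incrementing method, typ=6 a non-incrementing one (used for inline data);
+ # size is the dword count, subc the subchannel, and mthd the class register offset (stored >> 2).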
+
+ class NVCompiler(Compiler):
+   def __init__(self, arch:str):
+     self.arch = arch
+     #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
+     cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
+     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
+     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
+     super().__init__(f"compile_nv_{self.arch}")
+   def compile(self, src:str) -> bytes:
+     cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
+     status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
+
+     if status != 0:
+       raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
+     return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
+
+ class HWQueue:
+   def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
+   def __del__(self):
+     if self.binded_device is not None:
+       self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
+       self.binded_device._gpu_free(self.hw_page)
+
+   def _mark_command_end(self):
+     self.cmd_offsets.append(len(self.q))
+     return self
+   def __len__(self): return len(self.cmd_offsets) - 1
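+   # cmd_offsets records where each logical command begins in self.q, so update_wait/update_signal can patch a command in place.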
+
+   def memory_barrier(self): return self._mark_command_end()
+
+   def wait(self, signal, value=0):
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
+                (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
+     return self._mark_command_end()
+
+   def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)
+
+   def signal(self, signal, value=0, timestamp=False):
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
+                (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
+     return self._mark_command_end()
+
+   def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
+   def update_wait(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
+     if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
+     return self
+
+   def bind(self, device: NVDevice):
+     self.binded_device = device
+     self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
+     hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
+     for i, value in enumerate(self.q): hw_view[i] = value
+
+     # From now on, the queue is on the device for faster submission.
+     self.q = hw_view # type: ignore
+
+   def _submit(self, dev, gpfifo:GPFifo):
+     if len(self.q) == 0: return
+
+     if dev == self.binded_device: cmdq_addr = self.hw_page.base
+     else:
+       if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
+         assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
+                gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
+         dev.cmdq_wptr = 0
+
+       dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
+       cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
+       dev.cmdq_wptr += len(self.q) * 4
+
+     gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
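+     # Ring the doorbell: writing the channel's work-submit token to the usermode MMIO region (offset 0x90) kicks off processing.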
+     dev.gpu_mmio[0x90 // 4] = gpfifo.token
+     gpfifo.put_value += 1
+
+ class HWComputeQueue(HWQueue):
+   def __init__(self):
+     super().__init__()
+     self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}
+
+   def copy_from_cpu(self, gpuaddr, data):
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
+     return self._mark_command_end()
+
+   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0):
+     ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+     self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+     self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+     self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+
+     qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
+     qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+     qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
+     qmd.constant_buffer_addr_upper_0 = kernargs >> 32
+     if signal is not None:
+       qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
+       qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
+       qmd.release0_payload_lower = signal_value & 0xffffffff
+       qmd.release0_payload_upper = signal_value >> 32
+       qmd.release0_enable = 1
+
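+     # If the previous command was also an exec, chain this QMD to it via the dependent_qmd0 fields instead of sending new PCAS methods.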
+     if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
+     else:
+       prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+       prev_qmd.dependent_qmd0_action = 1
+       prev_qmd.dependent_qmd0_prefetch = 1
+       prev_qmd.dependent_qmd0_enable = 1
+     return self._mark_command_end()
+
+   def update_exec(self, cmd_idx, global_size, local_size):
+     # Patch the exec cmd with new launch dims
+     self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+     self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
+
+   def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)
+
+ class HWCopyQueue(HWQueue):
+   def copy(self, dest, src, copy_size):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+     return self._mark_command_end()
+
+   def signal(self, signal, value=0):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
+     return self._mark_command_end()
+
+   def update_signal(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
+     if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
+     return self
+
+   def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)
+
+ SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
+ class NVProgram:
+   def __init__(self, device:NVDevice, name:str, lib:bytes):
+     self.device, self.name, self.lib = device, name, lib
+     if DEBUG >= 6:
+       try:
+         fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
+         with open(fn + ".cubin", "wb") as f: f.write(lib)
+         print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
+       except Exception as e: print("failed to disasm cubin", str(e))
+
+     self.rel_info, self.global_init, self.shmem_usage = None, None, 0
+     constant_buffers_data = {}
+
+     if MOCKGPU:
+       self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
+       constant_buffers_data[0] = memoryview(bytearray(0x190))
+     else:
+       _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
+       sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+       shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
+       for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
+         section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
+         if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
+         elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
+           self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+           self.registers_usage = sh_info >> 24
+         if match := re.match(r'\.nv\.constant(\d+)', section_name):
+           constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+         if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+         elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
+         elif section_name == ".nv.info":
+           section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+           for i in range(sh_size // 12):
+             if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
+               raise RuntimeError("kernel requires too much local memory")
+
+     # Register allocation granularity per warp is 256; warp allocation granularity is 4. Register file size is 65536.
+     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
+
+     # Load program and constant buffers (if any)
+     # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
+     self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
+                            round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
+     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
+
+     self.constbuffer_0 = [0] * 88
+     self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
+
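+     # Choose the smallest shared-memory carveout (32/64/100 KB) that fits shmem_usage; the QMD encodes it in 4 KB units, offset by one.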
+     smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
+     self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+                             invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
+                             cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
+                             shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
+                             max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
+                             barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
+                             program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
+                             program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
+                             constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
+
+     # NV kernargs layout: constant buffer 0 (0x160 bytes) followed by the kernel arguments; the QMD is appended after the kernargs.
+     self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
+     self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
+     self.kernargs_offset = 0x160
+
+     # constant buffer 0 is filled in for each launch, so there is no need to copy it from the elf (it's just zeroes)
+     if 0 in constant_buffers_data: constant_buffers_data.pop(0)
+
+     off = round_up(self.program.nbytes, 128)
+
+     if self.rel_info is not None:
+       assert self.global_init is not None
+       global_init_addr = self.lib_gpu.base + off
+       for rel_i in range(0, len(self.rel_info), 4):
+         if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
+         elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
+         else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
+
+     HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
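+     # Upload in chunks of at most 4095 dwords so each transfer fits in a single LOAD_INLINE_DATA method's count field.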
+     for st in range(0, len(self.program), 4095):
+       HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
+
+     if self.global_init is not None:
+       HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
+       off += round_up(self.global_init.nbytes, 128)
+       if 4 in constant_buffers_data: # >= 12.4
+         # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
+         assert constant_buffers_data[4].nbytes == 8
+         constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
+
+     for i,data in constant_buffers_data.items():
+       self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
+       self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
+       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
+       self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
+
+       HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
+       off += round_up(data.nbytes, 128)
+
+     HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.synchronize()
+
+   def __del__(self):
+     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
+     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
+       raise RuntimeError("Invalid global/local dims")
+
+     if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
+       self.device.kernargs_ptr = self.device.kernargs_page.base
+
+     # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
+     if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
+     kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
+
+     sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
+
+     queue = HWComputeQueue()
+     queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
+     if wait or PROFILE: queue.timestamp(sig_st)
+     queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
+     queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
+     if wait or PROFILE: queue.timestamp(sig_en)
+     queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.kernargs_ptr += self.kernargs_alloc_size
+
+     if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
+     if wait:
+       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
+       return (sig_en[1] - sig_st[1]) / 1e9
+
+ class NVAllocator(HCQCompatAllocator):
+   def __init__(self, device:NVDevice): super().__init__(device)
+
+   def _alloc(self, size:int, options:BufferOptions):
+     if options.host: return self.device._gpu_host_alloc(size)
+     return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
+
+   def _free(self, opaque, options:BufferOptions):
+     self.device.synchronize()
+     if options.host: self.device._gpu_host_free(opaque)
+     else: self.device._gpu_free(opaque)
+
+ @dataclass
+ class GPFifo:
+   ring: memoryview
+   controls: nv_gpu.AmpereAControlGPFifo
+   entries_count: int
+   token: int
+   put_value: int = 0
+
+ MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+ class NVDevice(HCQCompatCompiled):
+   root = None
+   fd_ctl: int = -1
+   fd_uvm: int = -1
+   gpus_info = None
+   signals_page:Any = None
+   signals_pool: List[Any] = []
+   uvm_vaddr: int = 0x1000000000
+   host_object_enumerator: int = 0x1000
+   devices: List[NVDevice] = []
+
+   def _new_gpu_fd(self):
+     fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+     nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+     return fd_dev
+
+   def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
+     fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+     made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
+       params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
+     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
+     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
+     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+
+   def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
+     size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
+     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
+       attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
+             ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
+       attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
+              ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
+       flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
+              nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
+     mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
+
+     if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
+     if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
+     return self._gpu_uvm_map(va_addr, size, mem_handle)
+
+   def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
+     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
+       attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
+       attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
+       flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
+              nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
+     mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
+
+     if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
+     if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
+
+     return self._gpu_uvm_map(va_addr, size, mem_handle)
+
+   def _gpu_host_alloc(self, size):
+     va_base = self._alloc_gpu_vaddr(sz:=round_up(size, 4 << 10))
+     libc.mmap(va_base, sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+     return self._map_to_gpu(va_base, sz)
+
+   def _gpu_free(self, mem):
+     made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
+     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
+     if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
+     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+
+   def _gpu_host_free(self, mem):
+     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+     libc.munmap(mem.base, mem.length)
+
+   def _map_to_gpu(self, va_base, size):
+     NVDevice.host_object_enumerator += 1
+     flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
+              (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
+     made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
+       hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1), fd=-1)
+     nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+     if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
+     return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)
+
+   def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
+     gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
+       nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
+
+     # NOTE: va_addr is set to make rawbufs compatible with AMD.
+     return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
+                                        gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size)
+
+   def _gpu_map(self, mem):
+     if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
+     mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
+     return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
+
+   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
+     NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
+     return res_va
+
+   def __init__(self, device:str=""):
+     if NVDevice.root is None:
+       NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+       NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+       fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+       NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
+       uvm.initialize(self.fd_uvm)
+       with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
+
+       NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
+       nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
+
+     # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     self.fd_dev = self._new_gpu_fd()
+
+     assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
+     gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+     rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
+     device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
+     self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A
+
+     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
+                                                    vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
+     self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+     self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
+     self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
+     self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
+
+     boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
+       (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
+     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)
+
+     vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
+       flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
+     vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
+
+     gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
+     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
+     self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
+
+     uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
+     uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
+                              hClient=self.root, hVaSpace=vaspace)
+
+     for dev in self.devices:
+       uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))
+
+     if NVDevice.signals_page is None:
+       NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
+       NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
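+       # Each 16-byte signal is viewed as two u64s: [0] holds the semaphore value, [1] a GPU-written timestamp.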
+     else: self._gpu_map(NVDevice.signals_page)
+
+     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
+     channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
+
+     gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
+
+     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
+     ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
+
+     self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
+     self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)
+
+     en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
+     rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
+
+     self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
+
+     self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
+     self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
+     self.cmdq_wptr: int = 0 # in bytes
+
+     self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
+     self.kernargs_ptr: int = self.kernargs_page.base
+
+     self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
+
+     super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
+                      functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])
+
+     self._cmdq_setup_compute_gpfifo()
+     self._cmdq_setup_dma_gpfifo()
+
+     NVDevice.devices.append(self)
+
+   @classmethod
+   def _read_signal(cls, sig): return sig[0]
+
+   @classmethod
+   def _read_timestamp(cls, sig): return sig[1]
+
+   @classmethod
+   def _set_signal(cls, sig, value): sig[0] = value
+
+   @classmethod
+   def _get_signal(cls, value=0, **kwargs) -> memoryview:
+     cls._set_signal(sig := cls.signals_pool.pop(), value)
+     return sig
+
+   @classmethod
+   def _wait_signal(cls, signal, value=0, timeout=10000):
+     start_time = time.time() * 1000
+     while time.time() * 1000 - start_time < timeout:
+       if signal[0] >= value: return
+     raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+
+   def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
+
+   def synchronize(self):
+     NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+     self.cmdq_wptr = 0
+
+     if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
+     if PROFILE: self._prof_process_events()
+
+   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
+     notifier = self._gpu_system_alloc(48 << 20)
+     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+       gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+       hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
+     rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
+     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+
+     ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
+     rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
+     assert ws_token_params.workSubmitToken != -1
+
+     channel_base = self._alloc_gpu_vaddr(0x4000000)
+     uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
+                          hChannel=gpfifo, base=channel_base, length=0x4000000)
+
+     return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+                   controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))
+
+   def _cmdq_setup_compute_gpfifo(self):
+     self.slm_per_thread = 0x900
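+     # Size the shader local memory backing store: per-warp bytes (32 threads/warp), then per-TPC, then for up to 64 TPCs.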
+     bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
+     bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
+     self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base
+
+     # Set windows addresses to not collide with other allocated buffers.
+     self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000
+
+     queue = HWComputeQueue()
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
+     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+     self.timeline_value += 1
+     self.synchronize()
+
+   def _cmdq_setup_dma_gpfifo(self):
+     queue = HWCopyQueue()
+     queue.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
+     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+     self.timeline_value += 1
+     self.synchronize()