tinygrad 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/kernel.py +230 -190
  3. tinygrad/codegen/linearizer.py +278 -384
  4. tinygrad/codegen/uops.py +415 -0
  5. tinygrad/device.py +132 -275
  6. tinygrad/dtype.py +53 -37
  7. tinygrad/engine/__init__.py +0 -0
  8. tinygrad/engine/graph.py +100 -0
  9. tinygrad/engine/jit.py +195 -0
  10. tinygrad/engine/realize.py +191 -0
  11. tinygrad/engine/schedule.py +362 -0
  12. tinygrad/engine/search.py +196 -0
  13. tinygrad/{mlops.py → function.py} +28 -14
  14. tinygrad/helpers.py +72 -43
  15. tinygrad/lazy.py +141 -240
  16. tinygrad/multi.py +169 -0
  17. tinygrad/nn/__init__.py +179 -8
  18. tinygrad/nn/datasets.py +7 -0
  19. tinygrad/nn/optim.py +106 -28
  20. tinygrad/nn/state.py +86 -17
  21. tinygrad/ops.py +70 -44
  22. tinygrad/renderer/__init__.py +61 -0
  23. tinygrad/renderer/assembly.py +276 -0
  24. tinygrad/renderer/cstyle.py +299 -206
  25. tinygrad/renderer/llvmir.py +118 -123
  26. tinygrad/runtime/autogen/amd_gpu.py +1900 -0
  27. tinygrad/runtime/autogen/comgr.py +865 -0
  28. tinygrad/runtime/autogen/cuda.py +5923 -0
  29. tinygrad/runtime/autogen/hip.py +5909 -0
  30. tinygrad/runtime/autogen/hsa.py +5761 -0
  31. tinygrad/runtime/autogen/kfd.py +812 -0
  32. tinygrad/runtime/autogen/nv_gpu.py +33328 -0
  33. tinygrad/runtime/autogen/opencl.py +1795 -0
  34. tinygrad/runtime/driver/hip_comgr.py +47 -0
  35. tinygrad/runtime/driver/hsa.py +143 -0
  36. tinygrad/runtime/graph/clang.py +38 -0
  37. tinygrad/runtime/graph/cuda.py +59 -54
  38. tinygrad/runtime/graph/hcq.py +143 -0
  39. tinygrad/runtime/graph/hsa.py +171 -0
  40. tinygrad/runtime/graph/metal.py +37 -41
  41. tinygrad/runtime/ops_amd.py +564 -0
  42. tinygrad/runtime/ops_clang.py +16 -14
  43. tinygrad/runtime/ops_cuda.py +130 -38
  44. tinygrad/runtime/ops_disk.py +45 -42
  45. tinygrad/runtime/ops_gpu.py +52 -50
  46. tinygrad/runtime/ops_hsa.py +278 -0
  47. tinygrad/runtime/ops_llvm.py +36 -56
  48. tinygrad/runtime/ops_metal.py +42 -24
  49. tinygrad/runtime/ops_npy.py +9 -0
  50. tinygrad/runtime/ops_nv.py +630 -0
  51. tinygrad/runtime/ops_python.py +204 -0
  52. tinygrad/shape/shapetracker.py +41 -105
  53. tinygrad/shape/symbolic.py +98 -95
  54. tinygrad/shape/view.py +137 -35
  55. tinygrad/tensor.py +2367 -442
  56. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
  57. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/METADATA +19 -9
  58. tinygrad-0.9.0.dist-info/RECORD +60 -0
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
  60. tinygrad/features/image.py +0 -93
  61. tinygrad/features/multi.py +0 -103
  62. tinygrad/features/search.py +0 -160
  63. tinygrad/graph.py +0 -106
  64. tinygrad/jit.py +0 -152
  65. tinygrad/realize.py +0 -50
  66. tinygrad/runtime/graph/hip.py +0 -24
  67. tinygrad/runtime/ops_cpu.py +0 -45
  68. tinygrad/runtime/ops_hip.py +0 -97
  69. tinygrad/runtime/ops_torch.py +0 -49
  70. tinygrad-0.8.0.dist-info/RECORD +0 -41
  71. {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
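The largest single addition is tinygrad/runtime/ops_nv.py (+630 lines), a backend that drives NVIDIA GPUs directly through the kernel driver's ioctl interface rather than through the CUDA userspace libraries; its full contents are reproduced below. As a minimal usage sketch (assuming the usual mapping of runtime/ops_*.py modules to device names, so this file registers as device "NV" — not something this diff alone confirms):

    from tinygrad import Tensor, Device

    Device.DEFAULT = "NV"  # assumes an NVIDIA GPU driven by the open kernel modules
    print((Tensor.ones(3, 3) + 1).numpy())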
tinygrad/runtime/ops_nv.py ADDED
@@ -0,0 +1,630 @@
+ from __future__ import annotations
+ import os, ctypes, ctypes.util, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
+ from typing import Tuple, List, Any, cast
+ from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
+ from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
+ from tinygrad.renderer.cstyle import NVRenderer
+ from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
+ import tinygrad.runtime.autogen.cuda as cuda
+ import tinygrad.runtime.autogen.nv_gpu as nv_gpu
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
+
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
+ libc.mmap.restype = ctypes.c_void_p
+ libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+ libc.munmap.restype = ctypes.c_int
+
+ if MOCKGPU:=getenv("MOCKGPU"):
+   import extra.mockgpu.mockgpu # noqa: F401
+   libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
+   libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
+
+ def nv_iowr(fd, nr, args):
+   ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
+   if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
+
+ def rm_alloc(fd, clss, root, parent, params):
+   made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parent, hClass=clss,
+     pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+   nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
+   if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}")
+   return made
+
+ def rm_control(fd, cmd, client, obj, params):
+   made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
+     params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
+   nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
+   if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
+   return made
+
+ def uvm_ioctl(cmd, sttyp, fd, **kwargs):
+   ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
+   if ret != 0: raise RuntimeError(f"uvm_ioctl returned {ret}")
+   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl struct returned {made.rmStatus}")
+   return made
+
+ def make_uvm_type():
+   fxns = {name.replace("UVM_", "").lower():
+           functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
+           for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
+   return type("NVUVM", (object, ), fxns)
+ uvm = make_uvm_type()
+
+ def make_qmd_struct_type():
+   fields = []
+   bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
+   bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
+   bits = sorted(bits, key=lambda x: x[1][1])
+   for i,(name, data) in enumerate(bits):
+     if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
+     fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
+   return init_c_struct_t(tuple(fields))
+ qmd_struct_t = make_qmd_struct_type()
+ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
+
+ def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
+ def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
+ def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
+
+ class NVCompiler(Compiler):
+   def __init__(self, arch:str):
+     self.arch = arch
+     #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
+     cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
+     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
+     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
+     super().__init__(f"compile_nv_{self.arch}")
+   def compile(self, src:str) -> bytes:
+     cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
+     status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
+
+     if status != 0:
+       raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
+     return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
+
+ class HWQueue:
+   def __init__(self): self.q, self.binded_device, self.next_cmd_index = [], None, 0
+   def __del__(self):
+     if self.binded_device is not None:
+       self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
+       self.binded_device._gpu_free(self.hw_page)
+
+   def ptr(self) -> int: return self.next_cmd_index
+
+   def wait(self, signal, value=0):
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
+                (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
+     self.next_cmd_index += 1
+     return self
+
+   def signal(self, signal, value=0, timestamp=False):
+     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
+                (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
+     self.next_cmd_index += 1
+     return self
+
+   def bind(self, device: NVDevice):
+     self.binded_device = device
+     self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
+     hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
+     for i, value in enumerate(self.q): hw_view[i] = value
+
+     # From now on, the queue is on the device for faster submission.
+     self.q = hw_view # type: ignore
+
+   def _submit(self, dev, gpu_ring, put_value, gpfifo_entries, gpfifo_token, gpu_ring_controls):
+     if dev == self.binded_device: cmdq_addr = self.hw_page.base
+     else:
+       dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
+       cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
+       dev.cmdq_wptr += len(self.q) * 4
+
+     gpu_ring[put_value % gpfifo_entries] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+     gpu_ring_controls.GPPut = (put_value + 1) % gpfifo_entries
+     dev.gpu_mmio[0x90 // 4] = gpfifo_token
+     return put_value + 1
+
+ class HWComputeQueue(HWQueue):
+   def __init__(self):
+     super().__init__()
+     self.ptr_to_qmd = {}
+
+   def copy_from_cpu(self, gpuaddr, data):
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + [x for x in data]
+     self.next_cmd_index += 1
+     return self
+
+   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
+     prg.qmd.cta_raster_width, prg.qmd.cta_raster_height, prg.qmd.cta_raster_depth = global_size
+     prg.qmd.cta_thread_dimension0, prg.qmd.cta_thread_dimension1, prg.qmd.cta_thread_dimension2 = local_size
+     prg.qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
+     prg.qmd.constant_buffer_addr_upper_0 = kernargs >> 32
+     if signal is not None:
+       prg.qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
+       prg.qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
+       prg.qmd.release0_payload_lower = signal_value & 0xffffffff
+       prg.qmd.release0_payload_upper = signal_value >> 32
+       prg.qmd.release0_enable = 1
+     else: prg.qmd.release0_enable = 0
+
+     ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
+     self.ptr_to_qmd[self.ptr()] = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+
+     if chain_exec_ptr is None:
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
+       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
+     else:
+       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
+       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
+       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
+       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
+     self.next_cmd_index += 1
+     return self
+
+   def update_exec(self, cmd_ptr, global_size, local_size):
+     # Patch the exec cmd with new launch dims
+     qmd = self.ptr_to_qmd[cmd_ptr]
+     qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
+     qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+
+   def submit(self, dev:NVDevice):
+     if len(self.q) == 0: return
+     dev.compute_put_value = self._submit(dev, dev.compute_gpu_ring, dev.compute_put_value, dev.compute_gpfifo_entries,
+                                          dev.compute_gpfifo_token, dev.compute_gpu_ring_controls)
+
+ class HWCopyQueue(HWQueue):
+   def copy(self, dest, src, copy_size):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+     self.next_cmd_index += 1
+     return self
+
+   def submit(self, dev:NVDevice):
+     if len(self.q) == 0: return
+     dev.dma_put_value = self._submit(dev, dev.dma_gpu_ring, dev.dma_put_value, dev.dma_gpfifo_entries,
+                                      dev.dma_gpfifo_token, dev.dma_gpu_ring_controls)
+
+ SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
+ class NVProgram:
+   def __init__(self, device:NVDevice, name:str, lib:bytes):
+     self.device, self.name, self.lib = device, name, lib
+     if DEBUG >= 6:
+       try:
+         fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
+         with open(fn + ".cubin", "wb") as f: f.write(lib)
+         print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
+       except Exception as e: print("failed to disasm cubin", str(e))
+
+     self.global_init, self.shmem_usage = None, 0
+     constant_buffers_data = {}
+
+     if MOCKGPU:
+       self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
+       constant_buffers_data[0] = memoryview(bytearray(0x190))
+     else:
+       _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
+       sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
+       shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
+       for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
+         section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
+         if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
+         elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
+           self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+           self.registers_usage = sh_info >> 24
+         if match := re.match(r'\.nv\.constant(\d+)', section_name):
+           constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+         if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+         elif section_name == ".nv.info":
+           section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+           for i in range(sh_size // 12):
+             if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
+               raise RuntimeError("too high local memory")
+
+     # Register allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
+     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
+
+     # Load program and constant buffers (if any)
+     self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
+                            sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
+     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
+     for st in range(0, len(self.program), 4095):
+       HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
+
+     self.constbuffer_0 = [0] * 88
+     self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
+
+     smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
+     self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+                             invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
+                             cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
+                             shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
+                             max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
+                             barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
+                             program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
+                             program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
+                             constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
+
+     # NV's kernargs is a constbuffer (size 0x160); the arguments to the kernel follow it, and the QMD is appended at the end.
+     self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
+     self.kernargs_segment_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
+     self.kernargs_offset = 0x160
+
+     # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
+     if 0 in constant_buffers_data: constant_buffers_data.pop(0)
+
+     off = round_up(self.program.nbytes, 128)
+     if self.global_init is not None:
+       # Constbuffer 4 contains a pointer to nv.global.init; load the section and set up the pointer.
+       assert 4 in constant_buffers_data and constant_buffers_data[4].nbytes == 8
+       HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
+       constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
+       off += round_up(self.global_init.nbytes, 128)
+
+     for i,data in constant_buffers_data.items():
+       self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
+       self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
+       self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
+       self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
+
+       HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
+       off += round_up(data.nbytes, 128)
+
+     HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.synchronize()
+
+   def __del__(self):
+     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
+     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
+       raise RuntimeError("Invalid global/local dims")
+
+     if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_segment_size):
+       self.device.kernargs_ptr = self.device.kernargs_page.base
+
+     # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
+     if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
+     kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + [val for val in vals]
+
+     queue = HWComputeQueue()
+     queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
+     if wait: queue.signal(self.device.time_event_st, timestamp=True)
+     queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
+     queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
+     if wait: queue.signal(self.device.time_event_en, timestamp=True)
+     queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+     self.device.timeline_value += 1
+     self.device.kernargs_ptr += self.kernargs_segment_size
+
+     if wait:
+       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
+       return (self.device.time_event_en[1] - self.device.time_event_st[1]) / 1e9
+
+ class NVAllocator(LRUAllocator):
+   def __init__(self, device:NVDevice):
+     self.device = device
+     self.b = [self.device._gpu_host_alloc(2 << 20) for _ in range(16)]
+     self.b_timeline = [0] * len(self.b)
+     self.b_next = 0
+     super().__init__()
+
+   def _alloc(self, size:int, options:BufferOptions):
+     if options.host: return self.device._gpu_host_alloc(size)
+     else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
+
+   def _free(self, gpumem, options:BufferOptions):
+     NVDevice.synchronize_system()
+     if options.host: self.device._gpu_host_free(gpumem)
+     else: self.device._gpu_free(gpumem)
+
+   def copyin(self, dest, src: memoryview):
+     for i in range(0, src.nbytes, self.b[0].length):
+       self.b_next = (self.b_next + 1) % len(self.b)
+       NVDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
+       ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].length, src.nbytes-i))
+       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                    .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
+                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+       self.b_timeline[self.b_next] = self.device.timeline_value
+       self.device.timeline_value += 1
+
+   def copyout(self, dest:memoryview, src):
+     NVDevice.synchronize_system()
+     for i in range(0, dest.nbytes, self.b[0].length):
+       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                    .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].length, dest.nbytes-i)) \
+                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+       NVDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
+       self.device.timeline_value += 1
+
+       ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
+
+   def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
+     src_dev._gpu_map(dest)
+     HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
+                  .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
+                  .copy(dest.va_addr, src.va_addr, sz) \
+                  .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
+     HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
+     src_dev.timeline_value += 1
+
+ MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
+ class NVDevice(Compiled):
+   root = None
+   fd_ctl: int = -1
+   fd_uvm: int = -1
+   gpus_info = None
+   signals_page:Any = None
+   signals_pool: List[Any] = []
+   uvm_vaddr: int = 0x1000000000
+   host_object_enumerator: int = 0x1000
+   devices: List[NVDevice] = []
+
+   def _new_gpu_fd(self):
+     fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+     nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+     return fd_dev
+
+   def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
+     fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+     made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
+       params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
+     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
+     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
+     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
+
+   def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
+     size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10))) # TODO: need hugepage option, any speedup?
+     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
+       attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
+             ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
+       attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
+              ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
+       flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
+              nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
+     mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
+
+     if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
+     if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
+     return self._gpu_uvm_map(va_addr, size, mem_handle)
+
+   def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
+     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
+       attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
+       attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
+       flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
+              nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
+     mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
+
+     if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
+     if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
+
+     return self._gpu_uvm_map(va_addr, size, mem_handle)
+
+   def _gpu_host_alloc(self, size):
+     va_base = self._alloc_gpu_vaddr(sz:=round_up(size, 4 << 10))
+     libc.mmap(va_base, sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
+     return self._map_to_gpu(va_base, sz)
+
+   def _gpu_free(self, mem):
+     made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
+     nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
+     if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
+     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+
+   def _gpu_host_free(self, mem):
+     uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
+     libc.munmap(mem.base, mem.length)
+
+   def _map_to_gpu(self, va_base, size):
+     NVDevice.host_object_enumerator += 1
+     flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
+              (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
+     made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
+       hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1), fd=-1)
+     nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+     if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
+     return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)
+
+   def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
+     gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
+       nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
+
+     # NOTE: va_addr is set to make rawbufs compatible with AMD.
+     return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
+                                        gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base)
+
+   def _gpu_map(self, mem):
+     if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
+     mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
+     return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
+
+   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
+     NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
+     return res_va
+
+   def __init__(self, device:str=""):
+     if NVDevice.root is None:
+       NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+       NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+       fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+       NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
+       uvm.initialize(self.fd_uvm)
+       try:
+         uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)
+       except RuntimeError:
+         pass # this error is okay, CUDA hits it too
+
+       NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
+       nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
+
+     # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     self.fd_dev = self._new_gpu_fd()
+
+     assert NVDevice.gpus_info[self.device_id].valid
+     gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+     rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
+     device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
+     self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A
+
+     device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
+                                                    vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
+     self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+     self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
+     self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
+     gpu_mmio_ptr = self._gpu_map_to_cpu(self.usermode, 0x10000, flags=2)
+     self.gpu_mmio = to_mv(gpu_mmio_ptr, 0x10000).cast("I")
+
+     boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
+       (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
+     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)
+
+     vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
+       flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
+     vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
+
+     gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
+     rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
+     self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])
+
+     uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
+     uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
+                              hClient=self.root, hVaSpace=vaspace)
+
+     for dev in self.devices:
+       uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))
+
+     if NVDevice.signals_page is None:
+       NVDevice.signals_page = self._gpu_system_alloc(0x10000, map_to_cpu=True)
+       NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
+     else: self._gpu_map(NVDevice.signals_page)
+
+     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
+     channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
+
+     gpfifo = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
+
+     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
+     ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
+
+     self.compute_gpfifo_entries: int = 0x10000
+     self.compute_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0, entries=self.compute_gpfifo_entries)
+     self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, self.compute_gpfifo_entries * 8).cast("Q")
+     self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + self.compute_gpfifo_entries * 8)
+     self.compute_put_value: int = 0
+
+     self.dma_gpfifo_entries: int = 0x10000
+     self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
+     self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, self.dma_gpfifo_entries * 8).cast("Q")
+     self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + self.dma_gpfifo_entries * 8)
+     self.dma_put_value: int = 0
+
+     en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
+     rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)
+
+     self.timeline_value: int = 1
+     self.timeline_signal, self._shadow_timeline_signal = NVDevice._get_signal(), NVDevice._get_signal()
+     self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()
+
+     self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
+     self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
+     self.cmdq_wptr: int = 0 # in bytes
+
+     self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
+     self.kernargs_ptr: int = self.kernargs_page.base
+
+     self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
+
+     from tinygrad.runtime.graph.hcq import HCQGraph
+     super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
+                      functools.partial(NVProgram, self), functools.partial(HCQGraph, NVDevice, HWComputeQueue, HWCopyQueue))
+
+     self._cmdq_setup_compute_gpfifo()
+     self._cmdq_setup_dma_gpfifo()
+
+     NVDevice.devices.append(self)
+
+   def synchronize(self):
+     NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+     self.cmdq_wptr = 0
+
+     if self.timeline_value > (1 << 63):
+       self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
+       self.timeline_signal[0], self.timeline_value = 0, 1
+       cast(NVAllocator, self.allocator).b_timeline = [0] * len(cast(NVAllocator, self.allocator).b)
+
+   @staticmethod
+   def synchronize_system():
+     for d in NVDevice.devices: d.synchronize()
+
+   @classmethod
+   def _set_signal(cls, sig, value): sig[0] = value
+
+   @classmethod
+   def _get_signal(cls, value=0) -> memoryview:
+     cls._set_signal(sig := cls.signals_pool.pop(), value)
+     return sig
+
+   @classmethod
+   def _wait_signal(cls, signal, value=0, timeout=10000):
+     start_time = time.time() * 1000
+     sem_value = signal[0]
+     while sem_value < value:
+       sem_value = signal[0]
+       if time.time() * 1000 - start_time > timeout: raise RuntimeError(f"wait_signal: {timeout} ms TIMEOUT!")
+
+   def _gpu_fifo_setup(self, gpfifo, ctxshare, channel_group, offset, entries=0x400):
+     notifier = self._gpu_system_alloc(48 << 20)
+     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo.hMemory,
+       gpFifoOffset=gpfifo.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+       hUserdMemory=(ctypes.c_uint32*8)(gpfifo.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
+     rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
+     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
+
+     ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
+     rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
+     assert ws_token_params.workSubmitToken != -1
+
+     channel_base = self._alloc_gpu_vaddr(0x4000000)
+     uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
+                          hChannel=gpfifo, base=channel_base, length=0x4000000)
+
+     return ws_token_params.workSubmitToken
+
+   def _cmdq_setup_compute_gpfifo(self):
+     self.slm_per_thread = 0x900
+     bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
+     bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
+     self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base
+
+     # Set window addresses so they don't collide with other allocated buffers.
+     self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000
+
+     queue = HWComputeQueue()
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
+     queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
+     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+     self.timeline_value += 1
+     self.synchronize()
+
+   def _cmdq_setup_dma_gpfifo(self):
+     queue = HWCopyQueue()
+     queue.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
+     queue.signal(self.timeline_signal, self.timeline_value).submit(self)
+     self.timeline_value += 1
+     self.synchronize()
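For orientation, the synchronization model above reduces to a monotonically increasing per-device timeline: every submission first waits on the previous timeline value, then signals the next one, and the host blocks by spinning on the CPU-mapped signal memory. A condensed sketch of that pattern, with hypothetical dev/queue stand-ins for the NVDevice and HWQueue objects above:

    import time

    def wait_signal(signal, value, timeout_ms=10000):
        # host-side wait: spin until the 64-bit signal slot reaches `value`,
        # mirroring NVDevice._wait_signal in the diff
        deadline = time.monotonic() + timeout_ms / 1000
        while signal[0] < value:
            if time.monotonic() > deadline: raise RuntimeError(f"wait_signal: {timeout_ms} ms TIMEOUT")

    # per-submission pattern, as used by NVProgram.__call__ and NVAllocator.copyin:
    #   queue.wait(dev.timeline_signal, dev.timeline_value - 1)    # order after prior work
    #   ...enqueue compute or copy commands...
    #   queue.signal(dev.timeline_signal, dev.timeline_value).submit(dev)
    #   dev.timeline_value += 1
    #   wait_signal(dev.timeline_signal, dev.timeline_value - 1)   # host sync when needed

This is also why NVDevice.synchronize can reset cmdq_wptr to 0: once the host has observed the latest timeline value, the GPU is no longer reading any queued command data, so the command ring can be reused from the start.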