tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. tinygrad/codegen/kernel.py +114 -172
  2. tinygrad/codegen/linearize.py +211 -81
  3. tinygrad/codegen/lowerer.py +30 -35
  4. tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
  5. tinygrad/codegen/transcendental.py +12 -13
  6. tinygrad/device.py +170 -47
  7. tinygrad/dtype.py +28 -26
  8. tinygrad/engine/jit.py +80 -63
  9. tinygrad/engine/memory.py +4 -5
  10. tinygrad/engine/multi.py +162 -0
  11. tinygrad/engine/realize.py +58 -107
  12. tinygrad/engine/schedule.py +381 -314
  13. tinygrad/engine/search.py +40 -44
  14. tinygrad/gradient.py +70 -0
  15. tinygrad/helpers.py +77 -58
  16. tinygrad/nn/__init__.py +30 -32
  17. tinygrad/nn/datasets.py +1 -2
  18. tinygrad/nn/optim.py +22 -26
  19. tinygrad/nn/state.py +89 -64
  20. tinygrad/ops.py +562 -446
  21. tinygrad/renderer/__init__.py +79 -36
  22. tinygrad/renderer/cstyle.py +70 -84
  23. tinygrad/renderer/llvmir.py +32 -20
  24. tinygrad/renderer/ptx.py +79 -99
  25. tinygrad/renderer/wgsl.py +87 -0
  26. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  27. tinygrad/runtime/autogen/comgr.py +2 -0
  28. tinygrad/runtime/autogen/kfd.py +4 -3
  29. tinygrad/runtime/autogen/kgsl.py +1 -1
  30. tinygrad/runtime/autogen/libpciaccess.py +2023 -0
  31. tinygrad/runtime/autogen/llvm.py +11379 -0
  32. tinygrad/runtime/autogen/vfio.py +891 -0
  33. tinygrad/runtime/graph/cuda.py +8 -9
  34. tinygrad/runtime/graph/hcq.py +84 -79
  35. tinygrad/runtime/graph/metal.py +19 -21
  36. tinygrad/runtime/ops_amd.py +488 -327
  37. tinygrad/runtime/ops_clang.py +15 -28
  38. tinygrad/runtime/ops_cloud.py +34 -34
  39. tinygrad/runtime/ops_cuda.py +30 -27
  40. tinygrad/runtime/ops_disk.py +62 -63
  41. tinygrad/runtime/ops_dsp.py +129 -38
  42. tinygrad/runtime/ops_gpu.py +30 -30
  43. tinygrad/runtime/ops_hip.py +29 -31
  44. tinygrad/runtime/ops_llvm.py +45 -40
  45. tinygrad/runtime/ops_metal.py +93 -73
  46. tinygrad/runtime/ops_npy.py +2 -2
  47. tinygrad/runtime/ops_nv.py +232 -270
  48. tinygrad/runtime/ops_python.py +51 -46
  49. tinygrad/runtime/ops_qcom.py +129 -157
  50. tinygrad/runtime/ops_webgpu.py +63 -0
  51. tinygrad/runtime/support/allocator.py +94 -0
  52. tinygrad/runtime/support/am/__init__.py +0 -0
  53. tinygrad/runtime/support/am/amdev.py +384 -0
  54. tinygrad/runtime/support/am/ip.py +463 -0
  55. tinygrad/runtime/support/compiler_cuda.py +4 -2
  56. tinygrad/runtime/support/elf.py +26 -4
  57. tinygrad/runtime/support/hcq.py +254 -324
  58. tinygrad/runtime/support/llvm.py +32 -0
  59. tinygrad/shape/shapetracker.py +84 -53
  60. tinygrad/shape/view.py +103 -138
  61. tinygrad/spec.py +154 -0
  62. tinygrad/tensor.py +744 -496
  63. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
  64. tinygrad-0.10.1.dist-info/RECORD +86 -0
  65. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
  66. tinygrad/engine/lazy.py +0 -228
  67. tinygrad/function.py +0 -212
  68. tinygrad/multi.py +0 -177
  69. tinygrad/runtime/graph/clang.py +0 -39
  70. tinygrad-0.10.0.dist-info/RECORD +0 -77
  71. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
  72. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,27 @@
  from __future__ import annotations
- import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
+ import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
  assert sys.platform != 'win32'
- from typing import Tuple, List, Any, cast, Union, Dict, Type
+ from typing import Any, cast, Union, Type
  from dataclasses import dataclass
- from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
- from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
- from tinygrad.device import BufferOptions
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
+ from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
+ from tinygrad.ops import sint
+ from tinygrad.device import BufferSpec
  from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
  from tinygrad.renderer.ptx import PTXRenderer
  from tinygrad.renderer.cstyle import NVRenderer
  from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
- from tinygrad.runtime.autogen import nv_gpu, libc
+ from tinygrad.runtime.autogen import nv_gpu
  from tinygrad.runtime.support.elf import elf_loader
  if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
- if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

  def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"

  NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
  NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

- def nv_iowr(fd, nr, args):
- ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
+ def nv_iowr(fd:HWInterface, nr, args):
+ ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

  def rm_alloc(fd, clss, root, parant, params):
@@ -46,8 +46,8 @@ def make_rmctrl_type():
  getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
  rmctrl = make_rmctrl_type()

- def uvm_ioctl(cmd, sttyp, fd, **kwargs):
- ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
+ def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+ ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
  if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
  return made
@@ -58,7 +58,7 @@ def make_uvm_type():
  uvm = make_uvm_type()

  def make_qmd_struct_type():
- fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
+ fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
  bits = sorted(bits, key=lambda x: x[1][1])
@@ -71,167 +71,138 @@ def make_qmd_struct_type():
  qmd_struct_t = make_qmd_struct_type()
  assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

- def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-
  class NVSignal(HCQSignal):
- def __init__(self, value=0, is_timeline=False):
- self._signal = NVDevice.signals_pool.pop()
- self.signal_addr = mv_address(self._signal)
- super().__init__(value)
- def __del__(self): NVDevice.signals_pool.append(self._signal)
- def _get_value(self) -> int: return self._signal[0]
- def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
- def _set_value(self, new_value:int): self._signal[0] = new_value
-
- class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
+ def __init__(self, base_addr:int|None=None, **kwargs):
+ super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+
+ def __del__(self):
+ if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
+
+ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
+ def __init__(self):
+ self.active_qmd = None
+ super().__init__()
+
  def __del__(self):
- if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
+ if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+ def nvm(self, subchannel, mthd, *args, typ=2): self.q((typ << 28) | (len(args) << 16) | (subchannel << 13) | (mthd >> 2), *args)

- @hcq_command
  def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
- if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
- if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
- if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
- if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
- if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
- if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff]
-
- def _wait(self, signal, value=0):
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
- (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-
- def _update_wait(self, cmd_idx, signal=None, value=None):
- if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
- if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))
-
- def _timestamp(self, signal): return self._signal(signal, 0)
-
- def bind(self, device):
- self.binded_device = device
- self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+ if compute_class: self.nvm(1, nv_gpu.NVC6C0_SET_OBJECT, compute_class)
+ if copy_class: self.nvm(4, nv_gpu.NVC6C0_SET_OBJECT, copy_class)
+ if local_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, *data64(local_mem_window))
+ if shared_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, *data64(shared_mem_window))
+ if local_mem: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, *data64(local_mem))
+ if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
+ return self
+
+ def wait(self, signal:NVSignal, value:sint=0):
+ self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
+ self.active_qmd = None
+ return self
+
+ def timestamp(self, signal:NVSignal): return self.signal(signal, 0)
+
+ def bind(self, dev:NVDevice):
+ self.binded_device = dev
+ self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
  hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
- for i, value in enumerate(self.q): hw_view[i] = value
+ for i, value in enumerate(self._q): hw_view[i] = value

  # From now on, the queue is on the device for faster submission.
- self.q = hw_view # type: ignore
+ self._q = hw_view

- def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
+ def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
  if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
  else:
- if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
- assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
- gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
- dev.cmdq_wptr = 0
+ cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
+ cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
+ dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)

- dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
- cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
- dev.cmdq_wptr += len(self.q) * 4
-
- gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+ gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
  gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
  dev.gpu_mmio[0x90 // 4] = gpfifo.token
  gpfifo.put_value += 1

- class NVComputeQueue(NVCommandQueue, HWComputeQueue):
- def __init__(self):
- self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
- super().__init__()
+ class NVComputeQueue(NVCommandQueue):
+ def memory_barrier(self):
+ self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
+ self.active_qmd = None
+ return self

- def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+ def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+ self.bind_args_state(args_state)

- def _exec(self, prg, args_state, global_size, local_size):
  ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
  assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"

- self.cmd_idx_to_qmd[self._cur_cmd_idx()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
- self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
- self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+ qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update

- qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
- qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+ self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
+ self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
  qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

- if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is None:
- self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
- self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
+ if self.active_qmd is None:
+ self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
+ self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
  else:
- prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8
- prev_qmd.dependent_qmd0_action = 1
- prev_qmd.dependent_qmd0_prefetch = 1
- prev_qmd.dependent_qmd0_enable = 1
-
- def _update_exec(self, cmd_idx, global_size, local_size):
- # Patch the exec cmd with new launch dims
- if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
- if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
-
- def _signal(self, signal, value=0):
- if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
- for i in range(2):
- if getattr(prev_qmd, f'release{i}_enable') == 0:
- setattr(prev_qmd, f'release{i}_enable', 1)
- setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
- setattr(prev_qmd, f'release{i}_payload', value)
- self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
- self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
- return
-
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
- (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
- self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-
- def _update_signal(self, cmd_idx, signal=None, value=None):
- if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
- if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
- if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)
-
- def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
-
- class NVCopyQueue(NVCommandQueue, HWCopyQueue):
- def _copy(self, dest, src, copy_size):
- self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-
- def _update_copy(self, cmd_idx, dest=None, src=None):
- if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
- if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
-
- def _signal(self, signal, value=0):
- self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-
- def _update_signal(self, cmd_idx, signal=None, value=None):
- if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
- if value is not None: self._patch(cmd_idx, offset=3, data=[value])
+ self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+ self.active_qmd.dependent_qmd0_action = 1
+ self.active_qmd.dependent_qmd0_prefetch = 1
+ self.active_qmd.dependent_qmd0_enable = 1

- def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)
-
- class NVArgsState(HCQArgsState):
- def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
- super().__init__(ptr, prg, bufs, vals=vals)
+ self.active_qmd = qmd
+ return self

+ def signal(self, signal:NVSignal, value:sint=0):
+ if self.active_qmd is not None:
+ for i in range(2):
+ if getattr(self.active_qmd, f'release{i}_enable') == 0:
+ setattr(self.active_qmd, f'release{i}_enable', 1)
+ self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
+ self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
+ return self
+
+ self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
+ (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)) # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+ self.nvm(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 0x0)
+ self.active_qmd = None
+ return self
+
+ def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.compute_gpfifo)
+
+ class NVCopyQueue(NVCommandQueue):
+ def copy(self, dest:sint, src:sint, copy_size:int):
+ self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src), *data64(dest))
+ self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, copy_size)
+ self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+ return self
+
+ def signal(self, signal:NVSignal, value:sint=0):
+ self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
+ self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
+ return self
+
+ def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
+
+ class NVArgsState(CLikeArgsState):
+ def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
  if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
- kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
- to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
- self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
- self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
-
- def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
- def update_var(self, index:int, val:int): self.vals[index] = val
+ super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

  class NVProgram(HCQProgram):
- def __init__(self, device:NVDevice, name:str, lib:bytes):
- self.device, self.name, self.lib = device, name, lib
+ def __init__(self, dev:NVDevice, name:str, lib:bytes):
+ self.dev, self.name, self.lib = dev, name, lib

  if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
  else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

  # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
- self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
+ self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))

  self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
- self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
+ self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
  for sh in sections:
  if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
  if sh.name == f".text.{self.name}":
@@ -243,7 +214,7 @@ class NVProgram(HCQProgram):
  if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240

  # Ensure device has enough local memory to run the program
- self.device._ensure_has_local_memory(self.lcmem_usage)
+ self.dev._ensure_has_local_memory(self.lcmem_usage)

  # Apply relocs
  for apply_image_offset, rel_sym_offset, typ, _ in relocs:
@@ -256,15 +227,16 @@ class NVProgram(HCQProgram):
  ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

  self.constbuffer_0 = [0] * 88
- self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]
+ self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]

  smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
- self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+ self.qmd: ctypes.Structure = \
+ qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
  invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
  cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
  shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
  max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
- barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
+ barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
  program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)

  for i,(addr,sz) in self.constbufs.items():
@@ -273,32 +245,32 @@ class NVProgram(HCQProgram):
  self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
  self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

- # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
+ # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
  self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

  # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
- super().__init__(NVArgsState, self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
+ super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))

  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
+ if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))

- def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
- if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.device).slm_per_thread:
- raise RuntimeError("Too many resources requested for launch")
+ def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
+ if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
+ raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")
  if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
  raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
  return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

- class NVAllocator(HCQAllocator):
- def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
- if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
- return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")
+ class NVAllocator(HCQAllocator['NVDevice']):
+ def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+ if options.host: return self.dev._gpu_alloc(size, host=True, tag="user host memory")
+ return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")

- def _free(self, opaque, options:BufferOptions):
- self.device.synchronize()
- self.device._gpu_free(opaque)
+ def _free(self, opaque:HCQBuffer, options:BufferSpec):
+ self.dev.synchronize()
+ self.dev._gpu_free(opaque)

- def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+ def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)

  @dataclass
  class GPFifo:
@@ -309,119 +281,114 @@ class GPFifo:
  put_value: int = 0

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(HCQCompiled):
+ class NVDevice(HCQCompiled[NVSignal]):
  root = None
- fd_ctl: int = -1
- fd_uvm: int = -1
- gpus_info: Union[List, ctypes.Array] = []
+ fd_ctl: HWInterface
+ fd_uvm: HWInterface
+ gpus_info: Union[list, ctypes.Array] = []
  signals_page: Any = None
- signals_pool: List[Any] = []
- low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
- uvm_vaddr: int = 0x2000000000 # 0x2000000000+
+ signals_pool: list[int] = []
+
+ # TODO: Need a proper allocator for va addresses
+ # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+ # VA space is 48bits.
+ low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x1000000000, wrap=False)
+ uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=0x2000000000, wrap=False)
  host_object_enumerator: int = 0x1000

  def _new_gpu_fd(self):
- fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
- nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+ fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
+ nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
  return fd_dev

  def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
- fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
- made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
- params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
+ fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
+ params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
  if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
- res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
- os.close(fd_dev)
- return res
-
- def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
- size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
- alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
- attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
- ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
- attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
- ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
- flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
- nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
- mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
-
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
- if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
- return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
-
- def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
- alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
- attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
- attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
- flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
- nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
- mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
-
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
- if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
-
- return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
-
- def _gpu_host_alloc(self, size, tag=""):
- va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
- mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
- assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
-
- NVDevice.host_object_enumerator += 1
- flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
- (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
- made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
- hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
- nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
-
- if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
- return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
-
- def _gpu_free(self, mem):
- if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
- nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
+ return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
+
+ def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
+ # Uncached memory is "system". Use huge pages only for gpu memory.
+ page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
+ size = round_up(size, page_size)
+ va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
+
+ if host:
+ va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+
+ flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
+ | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
+
+ NVDevice.host_object_enumerator += 1
+ made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
+ hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
+ nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+
+ if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
+ mem_handle = made.params.hObjectNew
+ else:
+ attr = ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contiguous else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27) \
+ | (nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE if page_size > 0x1000 else 0) << 23 | ((nv_gpu.NVOS32_ATTR_LOCATION_PCI if uncached else 0) << 25)
+
+ attr2 = ((nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO if uncached else nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES) << 2) \
+ | ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB if page_size > 0x1000 else 0) << 20) | nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC
+
+ fl = nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED | nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE \
+ | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | (nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM if not uncached else 0)
+
+ alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
+ alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
+ type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
+ mem_handle = rm_alloc(self.fd_ctl, alloc_func, self.root, self.nvdevice, alloc_params).hObjectNew
+
+ if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)
+
+ return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
+
+ def _gpu_free(self, mem:HCQBuffer):
+ if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+ made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
+ nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
  if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")

- self._debug_mappings.pop((mem.va_addr, mem.size))
- uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
- if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+ self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
+ uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
+ if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)

- def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+ def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
  if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
  attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

- # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
+ # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
  self._debug_mappings[(va_base, size)] = tag
- return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
- gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
+ return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
+ hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
+ mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))

- def _gpu_map(self, mem):
- if self.gpu_uuid in mem.mapped_gpu_ids: return
- mem.mapped_gpu_ids.append(self.gpu_uuid)
- self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+ def _gpu_map(self, mem:HCQBuffer):
+ if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+ mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+ self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")

  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
- if force_low:
- NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
- assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
- else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
- return res_va
+ return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)

  def _setup_nvclasses(self):
  classlist = memoryview(bytearray(100 * 4)).cast('I')
- clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
+ clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
  self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
  self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)

  def __init__(self, device:str=""):
  if NVDevice.root is None:
- NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
- NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
- fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
  NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
  uvm.initialize(self.fd_uvm)
- with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
+ with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too

  nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
@@ -438,40 +405,40 @@ class NVDevice(HCQCompiled):

  device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
  vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
- self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
- self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
+ self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+ self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
  self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
  self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

  self._setup_nvclasses()
- self._debug_mappings: Dict[Tuple[int, int], str] = dict()
+ self._debug_mappings: dict[tuple[int, int], str] = dict()

  rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
  (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))

  vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
  flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
- vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
+ vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew

  raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
  self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))

  uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
- uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
+ uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)

- for dev in cast(List[NVDevice], self.devices):
+ for dev in cast(list[NVDevice], self.devices):
  try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
  except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e

  if NVDevice.signals_page is None:
- NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
- NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
+ NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
+ NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
  else: self._gpu_map(NVDevice.signals_page)

  channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
- channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
+ channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew

- gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")
+ gpfifo_area = self._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000, tag="gpfifo")

  ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
  ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
@@ -481,9 +448,9 @@ class NVDevice(HCQCompiled):

  rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

- self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
- self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
- self.cmdq_wptr: int = 0 # in bytes
+ self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+ self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
+ self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I")

  self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
  'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
@@ -496,10 +463,10 @@ class NVDevice(HCQCompiled):
  self._setup_gpfifos()

  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
- notifier = self._gpu_system_alloc(48 << 20)
- params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+ notifier = self._gpu_alloc(48 << 20, uncached=True)
+ params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
  gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
- hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+ hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
  gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
  comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
  rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
@@ -507,13 +474,13 @@ class NVDevice(HCQCompiled):
  if enable_debug:
  self.debug_compute_obj, self.debug_channel = comp, gpfifo
  debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
- self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.device, debugger_params).hObjectNew
+ self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew

  ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
  assert ws_token_params.workSubmitToken != -1

  channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
- uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+ uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
  hChannel=gpfifo, base=channel_base, length=0x4000000)

  return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
@@ -532,30 +499,25 @@ class NVDevice(HCQCompiled):
  NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
  .signal(self.timeline_signal, self.timeline_value).submit(self)

- NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
- .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
- .signal(self.timeline_signal, self.timeline_value + 1).submit(self)
+ cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
+ .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+ .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

  self.timeline_value += 2

  def _ensure_has_local_memory(self, required):
  if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

- if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)
-
  self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
  bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+ self.shader_local_mem, ok = self._realloc(self.shader_local_mem, round_up(bytes_per_tpc*self.num_tpc_per_gpc*self.num_gpcs, 0x20000))

- try: self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
- except MemoryError:
- # If can't allocate a new size, reallocator the old buffer.
- self.slm_per_thread = old_slm_per_thread
- bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
- self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+ # Realloc failed, restore the old value.
+ if not ok: self.slm_per_thread = old_slm_per_thread

- NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
- .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
- .signal(self.timeline_signal, self.timeline_value).submit(self)
+ cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
+ .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+ .signal(self.timeline_signal, self.timeline_value).submit(self)
  self.timeline_value += 1

  def invalidate_caches(self):