tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff covers the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (88)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,27 @@
  from __future__ import annotations
- import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
+ import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
  assert sys.platform != 'win32'
- from typing import Tuple, List, Any, cast, Union, Dict, Type
+ from typing import Any, cast, Union, Type
  from dataclasses import dataclass
- from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
- from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
- from tinygrad.device import BufferOptions
- from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
+ from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
+ from tinygrad.ops import sint
+ from tinygrad.device import BufferSpec, CPUProgram
+ from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX
  from tinygrad.renderer.ptx import PTXRenderer
  from tinygrad.renderer.cstyle import NVRenderer
  from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
- from tinygrad.runtime.autogen import nv_gpu, libc
+ from tinygrad.runtime.autogen import nv_gpu
  from tinygrad.runtime.support.elf import elf_loader
  if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
- if MOCKGPU:=getenv("MOCKGPU"): import extra.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import

  def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"

  NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
  NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}

- def nv_iowr(fd, nr, args):
- ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
+ def nv_iowr(fd:HWInterface, nr, args):
+ ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")

  def rm_alloc(fd, clss, root, parant, params):
@@ -46,8 +46,8 @@ def make_rmctrl_type():
  getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
  rmctrl = make_rmctrl_type()

- def uvm_ioctl(cmd, sttyp, fd, **kwargs):
- ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
+ def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
+ ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
  if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
  return made
@@ -58,7 +58,7 @@ def make_uvm_type():
  uvm = make_uvm_type()

  def make_qmd_struct_type():
- fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
+ fields: list[tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
  bits = sorted(bits, key=lambda x: x[1][1])
@@ -71,167 +71,141 @@ def make_qmd_struct_type():
  qmd_struct_t = make_qmd_struct_type()
  assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

- def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
-
  class NVSignal(HCQSignal):
- def __init__(self, value=0, is_timeline=False):
- self._signal = NVDevice.signals_pool.pop()
- self.signal_addr = mv_address(self._signal)
- super().__init__(value)
- def __del__(self): NVDevice.signals_pool.append(self._signal)
- def _get_value(self) -> int: return self._signal[0]
- def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
- def _set_value(self, new_value:int): self._signal[0] = new_value
-
- class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
+ def __init__(self, base_addr:int|None=None, **kwargs):
+ super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
+
+ def __del__(self):
+ if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
+
+ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
+ def __init__(self):
+ self.active_qmd = None
+ super().__init__()
+
  def __del__(self):
- if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
+ if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+ def nvm(self, subchannel, mthd, *args, typ=2): self.q((typ << 28) | (len(args) << 16) | (subchannel << 13) | (mthd >> 2), *args)

- @hcq_command
  def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
- if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class]
- if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class]
- if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)]
- if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)]
- if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)]
- if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff]
-
- def _wait(self, signal, value=0):
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
- (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-
- def _update_wait(self, cmd_idx, signal=None, value=None):
- if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
- if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))
-
- def _timestamp(self, signal): return self._signal(signal, 0)
-
- def bind(self, device):
- self.binded_device = device
- self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+ if compute_class: self.nvm(1, nv_gpu.NVC6C0_SET_OBJECT, compute_class)
+ if copy_class: self.nvm(4, nv_gpu.NVC6C0_SET_OBJECT, copy_class)
+ if local_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, *data64(local_mem_window))
+ if shared_mem_window: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, *data64(shared_mem_window))
+ if local_mem: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, *data64(local_mem))
+ if local_mem_tpc_bytes: self.nvm(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, *data64(local_mem_tpc_bytes), 0xff)
+ return self
+
+ def wait(self, signal:NVSignal, value:sint=0):
+ self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT
+ self.active_qmd = None
+ return self
+
+ def timestamp(self, signal:NVSignal): return self.signal(signal, 0)
+
+ def bind(self, dev:NVDevice):
+ self.binded_device = dev
+ self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
  hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
- for i, value in enumerate(self.q): hw_view[i] = value
+ for i, value in enumerate(self._q): hw_view[i] = value

  # From now on, the queue is on the device for faster submission.
- self.q = hw_view # type: ignore
+ self._q = hw_view

- def _submit_to_gpfifo(self, dev, gpfifo:GPFifo):
+ def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):
  if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
  else:
- if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
- assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
- gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
- dev.cmdq_wptr = 0
+ cmdq_addr = dev.cmdq_allocator.alloc(len(self._q) * 4)
+ cmdq_wptr = (cmdq_addr - dev.cmdq_page.va_addr) // 4
+ dev.cmdq[cmdq_wptr : cmdq_wptr + len(self._q)] = array.array('I', self._q)

- dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
- cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
- dev.cmdq_wptr += len(self.q) * 4
-
- gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+ gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
  gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
+
+ if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
  dev.gpu_mmio[0x90 // 4] = gpfifo.token
  gpfifo.put_value += 1

- class NVComputeQueue(NVCommandQueue, HWComputeQueue):
- def __init__(self):
- self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
- super().__init__()
+ class NVComputeQueue(NVCommandQueue):
+ def memory_barrier(self):
+ self.nvm(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, (1 << 12) | (1 << 4) | (1 << 0))
+ self.active_qmd = None
+ return self

- def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
+ def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
+ self.bind_args_state(args_state)

- def _exec(self, prg, args_state, global_size, local_size):
  ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
  assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"

- self.cmd_idx_to_qmd[self._cur_cmd_idx()] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
- self.cmd_idx_to_global_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
- self.cmd_idx_to_local_dims[self._cur_cmd_idx()] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')
+ qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update

- qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
- qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+ self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
+ self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
+ self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
  qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

- if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is None:
- self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
- self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
+ if self.active_qmd is None:
+ self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
+ self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
  else:
- prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8
- prev_qmd.dependent_qmd0_action = 1
- prev_qmd.dependent_qmd0_prefetch = 1
- prev_qmd.dependent_qmd0_enable = 1
-
- def _update_exec(self, cmd_idx, global_size, local_size):
- # Patch the exec cmd with new launch dims
- if global_size is not None: self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
- if local_size is not None: self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)
-
- def _signal(self, signal, value=0):
- if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is not None:
+ self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+ self.active_qmd.dependent_qmd0_action = 1
+ self.active_qmd.dependent_qmd0_prefetch = 1
+ self.active_qmd.dependent_qmd0_enable = 1
+
+ self.active_qmd = qmd
+ return self
+
+ def signal(self, signal:NVSignal, value:sint=0):
+ if self.active_qmd is not None:
  for i in range(2):
- if getattr(prev_qmd, f'release{i}_enable') == 0:
- setattr(prev_qmd, f'release{i}_enable', 1)
- setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
- setattr(prev_qmd, f'release{i}_payload', value)
- self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
- self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
- return
-
- self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
- (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
- self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-
- def _update_signal(self, cmd_idx, signal=None, value=None):
- if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
- if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
- if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)
-
- def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).compute_gpfifo)
-
- class NVCopyQueue(NVCommandQueue, HWCopyQueue):
- def _copy(self, dest, src, copy_size):
- self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-
- def _update_copy(self, cmd_idx, dest=None, src=None):
- if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest))
- if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))
-
- def _signal(self, signal, value=0):
- self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
- self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
-
- def _update_signal(self, cmd_idx, signal=None, value=None):
- if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
- if value is not None: self._patch(cmd_idx, offset=3, data=[value])
-
- def _submit(self, device): self._submit_to_gpfifo(device, cast(NVDevice, device).dma_gpfifo)
-
- class NVArgsState(HCQArgsState):
- def __init__(self, ptr:int, prg:NVProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
- super().__init__(ptr, prg, bufs, vals=vals)
-
- if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
- kernargs = [arg_half for arg in bufs for arg_half in data64_le(arg.va_addr)] + list(vals)
- to_mv(self.ptr, (len(prg.constbuffer_0) + len(kernargs)) * 4).cast('I')[:] = array.array('I', prg.constbuffer_0 + kernargs)
- self.bufs = to_mv(self.ptr + len(prg.constbuffer_0) * 4, len(bufs) * 8).cast('Q')
- self.vals = to_mv(self.ptr + len(prg.constbuffer_0) * 4 + len(bufs) * 8, len(vals) * 4).cast('I')
-
- def update_buffer(self, index:int, buf:HCQBuffer): self.bufs[index] = buf.va_addr
- def update_var(self, index:int, val:int): self.vals[index] = val
+ if getattr(self.active_qmd, f'release{i}_enable') == 0:
+ setattr(self.active_qmd, f'release{i}_enable', 1)
+ self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
+ self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
+ return self
+
+ self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
+ (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)) # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
+ self.nvm(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 0x0)
+ self.active_qmd = None
+ return self
+
+ def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.compute_gpfifo)
+
+ class NVCopyQueue(NVCommandQueue):
+ def copy(self, dest:sint, src:sint, copy_size:int):
+ self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src), *data64(dest))
+ self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, copy_size)
+ self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
+ return self
+
+ def signal(self, signal:NVSignal, value:sint=0):
+ self.nvm(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, *data64(signal.value_addr), value)
+ self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA, 0x14)
+ return self
+
+ def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
+
+ class NVArgsState(CLikeArgsState):
+ def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
+ if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
+ super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

  class NVProgram(HCQProgram):
- def __init__(self, device:NVDevice, name:str, lib:bytes):
- self.device, self.name, self.lib = device, name, lib
+ def __init__(self, dev:NVDevice, name:str, lib:bytes):
+ self.dev, self.name, self.lib = dev, name, lib

  if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
  else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)

  # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
- self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True))
+ self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))

  self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
- self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]]
+ self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
  for sh in sections:
  if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
  if sh.name == f".text.{self.name}":
@@ -243,7 +217,7 @@ class NVProgram(HCQProgram):
  if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240

  # Ensure device has enough local memory to run the program
- self.device._ensure_has_local_memory(self.lcmem_usage)
+ self.dev._ensure_has_local_memory(self.lcmem_usage)

  # Apply relocs
  for apply_image_offset, rel_sym_offset, typ, _ in relocs:
@@ -256,15 +230,16 @@ class NVProgram(HCQProgram):
  ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

  self.constbuffer_0 = [0] * 88
- self.constbuffer_0[6:12] = [*data64_le(self.device.shared_mem_window), *data64_le(self.device.local_mem_window), *data64_le(0xfffdc0)]
+ self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]

  smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
- self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
+ self.qmd: ctypes.Structure = \
+ qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
  invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
  cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
  shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
  max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, program_address=self.prog_addr, sass_version=0x89,
- barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
+ barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=self.prog_sz>>8,
  program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)

  for i,(addr,sz) in self.constbufs.items():
@@ -273,32 +248,32 @@ class NVProgram(HCQProgram):
  self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', sz)
  self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

- # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536.
+ # Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
  self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32

  # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
- super().__init__(NVArgsState, self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
+ super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))

  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferOptions(cpu_access=True))
+ if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))

- def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
- if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.device).slm_per_thread:
- raise RuntimeError("Too many resources requested for launch")
+ def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
+ if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
+ raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")
  if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
  raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
  return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

- class NVAllocator(HCQAllocator):
- def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
- if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
- return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")
+ class NVAllocator(HCQAllocator['NVDevice']):
+ def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+ if options.host: return self.dev._gpu_alloc(size, host=True, tag="user host memory")
+ return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})")

- def _free(self, opaque, options:BufferOptions):
- self.device.synchronize()
- self.device._gpu_free(opaque)
+ def _free(self, opaque:HCQBuffer, options:BufferSpec):
+ self.dev.synchronize()
+ self.dev._gpu_free(opaque)

- def map(self, buf:HCQBuffer): self.device._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+ def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)

  @dataclass
  class GPFifo:
@@ -309,119 +284,114 @@ class GPFifo:
  put_value: int = 0

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(HCQCompiled):
+ class NVDevice(HCQCompiled[NVSignal]):
  root = None
- fd_ctl: int = -1
- fd_uvm: int = -1
- gpus_info: Union[List, ctypes.Array] = []
+ fd_ctl: HWInterface
+ fd_uvm: HWInterface
+ gpus_info: Union[list, ctypes.Array] = []
  signals_page: Any = None
- signals_pool: List[Any] = []
- low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
- uvm_vaddr: int = 0x2000000000 # 0x2000000000+
+ signals_pool: list[int] = []
+
+ # TODO: Need a proper allocator for va addresses
+ # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
+ # VA space is 48bits.
+ low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x8000000000 if OSX else 0x1000000000, wrap=False)
+ uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
  host_object_enumerator: int = 0x1000

  def _new_gpu_fd(self):
- fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
- nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
+ fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
+ nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
  return fd_dev

  def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
- fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
- made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
- params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
+ fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
+ params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
  if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
- res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
- os.close(fd_dev)
- return res
-
- def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
- size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
- alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
- attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
- ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
- attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
- ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
- flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
- nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
- mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
-
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
- if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
- return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
-
- def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
- alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
- attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
- attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
- flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
- nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
- mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
-
- if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
- if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
-
- return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
-
- def _gpu_host_alloc(self, size, tag=""):
- va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
- mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
- assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
-
- NVDevice.host_object_enumerator += 1
- flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
- (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
- made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
- hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1)
- nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
-
- if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
- return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
-
- def _gpu_free(self, mem):
- if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
- nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
+ return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
+
+ def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
+ # Uncached memory is "system". Use huge pages only for gpu memory.
+ page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
+ size = round_up(size, page_size)
+ va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
+
+ if host:
+ va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
+
+ flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
+ | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
+
+ NVDevice.host_object_enumerator += 1
+ made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags,
+ hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_addr, limit=size-1), fd=-1)
+ nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
+
+ if made.params.status != 0: raise RuntimeError(f"host alloc returned {get_error_str(made.params.status)}")
+ mem_handle = made.params.hObjectNew
+ else:
+ attr = ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contiguous else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27) \
+ | (nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE if page_size > 0x1000 else 0) << 23 | ((nv_gpu.NVOS32_ATTR_LOCATION_PCI if uncached else 0) << 25)
+
+ attr2 = ((nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO if uncached else nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES) << 2) \
+ | ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB if page_size > 0x1000 else 0) << 20) | nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC
+
+ fl = nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED | nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE \
+ | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | (nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM if not uncached else 0)
+
+ alloc_func = nv_gpu.NV1_MEMORY_SYSTEM if uncached else nv_gpu.NV1_MEMORY_USER
+ alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=page_size, offset=0, limit=size-1, format=6, size=size,
+ type=nv_gpu.NVOS32_TYPE_NOTIFIER if uncached else nv_gpu.NVOS32_TYPE_IMAGE, attr=attr, attr2=attr2, flags=fl)
+ mem_handle = rm_alloc(self.fd_ctl, alloc_func, self.root, self.nvdevice, alloc_params).hObjectNew
+
+ if cpu_access: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=uncached)
+
+ return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
+
+ def _gpu_free(self, mem:HCQBuffer):
+ if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+ made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
+ nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
  if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")

- self._debug_mappings.pop((mem.va_addr, mem.size))
- uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
- if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+ self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
+ uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
+ if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)

- def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+ def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
  if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
  attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))

- # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
+ # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
  self._debug_mappings[(va_base, size)] = tag
- return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
- gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
+ return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
+ hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
+ mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))

- def _gpu_map(self, mem):
- if self.gpu_uuid in mem.mapped_gpu_ids: return
- mem.mapped_gpu_ids.append(self.gpu_uuid)
- self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+ def _gpu_map(self, mem:HCQBuffer):
+ if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+ mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+ self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")

  def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
- if force_low:
- NVDevice.low_uvm_vaddr = (res_va:=round_up(NVDevice.low_uvm_vaddr, alignment)) + size
- assert NVDevice.low_uvm_vaddr < 0x2000000000, "Exceed low vm addresses"
- else: NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
- return res_va
+ return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)

  def _setup_nvclasses(self):
  classlist = memoryview(bytearray(100 * 4)).cast('I')
- clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist))
+ clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
  self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
  self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)

  def __init__(self, device:str=""):
  if NVDevice.root is None:
- NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
- NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
- fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
+ NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
+ self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
  NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
  uvm.initialize(self.fd_uvm)
- with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
+ with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too

  nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
@@ -438,40 +408,40 @@ class NVDevice(HCQCompiled):

  device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
  vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
- self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
- self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
+ self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
+ self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
  self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
  self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

  self._setup_nvclasses()
- self._debug_mappings: Dict[Tuple[int, int], str] = dict()
+ self._debug_mappings: dict[tuple[int, int], str] = dict()

  rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
  (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))

  vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
  flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
- vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew
+ vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew

  raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
  self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))

  uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
- uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
+ uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)

- for dev in cast(List[NVDevice], self.devices):
+ for dev in cast(list[NVDevice], self.devices):
  try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
  except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e

  if NVDevice.signals_page is None:
- NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
- NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
+ NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
+ NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
  else: self._gpu_map(NVDevice.signals_page)

  channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
- channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
+ channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew

- gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")
+ gpfifo_area = self._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000, tag="gpfifo")

  ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
  ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
@@ -481,9 +451,9 @@ class NVDevice(HCQCompiled):

  rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

- self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
- self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
- self.cmdq_wptr: int = 0 # in bytes
+ self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+ self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
+ self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I")

  self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
  'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
@@ -496,10 +466,10 @@ class NVDevice(HCQCompiled):
  self._setup_gpfifos()

  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
- notifier = self._gpu_system_alloc(48 << 20)
- params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+ notifier = self._gpu_alloc(48 << 20, uncached=True)
+ params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
  gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
- hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+ hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
  gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
  comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
  rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
@@ -507,13 +477,13 @@ class NVDevice(HCQCompiled):
  if enable_debug:
  self.debug_compute_obj, self.debug_channel = comp, gpfifo
  debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj)
- self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.device, debugger_params).hObjectNew
+ self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew

  ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
  assert ws_token_params.workSubmitToken != -1

  channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
- uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
+ uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
  hChannel=gpfifo, base=channel_base, length=0x4000000)

  return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
@@ -532,30 +502,25 @@ class NVDevice(HCQCompiled):
  NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
  .signal(self.timeline_signal, self.timeline_value).submit(self)

- NVCopyQueue().wait(self.timeline_signal, self.timeline_value) \
- .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
- .signal(self.timeline_signal, self.timeline_value + 1).submit(self)
+ cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
+ .setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
+ .signal(self.timeline_signal, self.timeline_value + 1).submit(self)

  self.timeline_value += 2

  def _ensure_has_local_memory(self, required):
  if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return

- if self.shader_local_mem is not None: self.allocator.free(self.shader_local_mem, self.shader_local_mem.size)
-
  self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread
  bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
+ self.shader_local_mem, ok = self._realloc(self.shader_local_mem, round_up(bytes_per_tpc*self.num_tpc_per_gpc*self.num_gpcs, 0x20000))

- try: self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
- except MemoryError:
- # If can't allocate a new size, reallocator the old buffer.
- self.slm_per_thread = old_slm_per_thread
- bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)
- self.shader_local_mem = self.allocator.alloc(round_up(bytes_per_tpc * self.num_tpc_per_gpc * self.num_gpcs, 0x20000))
+ # Realloc failed, restore the old value.
+ if not ok: self.slm_per_thread = old_slm_per_thread

- NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
- .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
- .signal(self.timeline_signal, self.timeline_value).submit(self)
+ cast(NVComputeQueue, NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1)) \
+ .setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
+ .signal(self.timeline_signal, self.timeline_value).submit(self)
  self.timeline_value += 1

  def invalidate_caches(self):