tinygrad 0.9.0-py3-none-any.whl → 0.9.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. tinygrad/codegen/__init__.py +0 -0
  2. tinygrad/codegen/kernel.py +78 -90
  3. tinygrad/codegen/linearizer.py +237 -169
  4. tinygrad/codegen/uops.py +278 -242
  5. tinygrad/device.py +147 -10
  6. tinygrad/dtype.py +7 -7
  7. tinygrad/engine/graph.py +16 -16
  8. tinygrad/engine/jit.py +39 -36
  9. tinygrad/engine/realize.py +6 -5
  10. tinygrad/engine/schedule.py +15 -7
  11. tinygrad/engine/search.py +6 -3
  12. tinygrad/function.py +17 -23
  13. tinygrad/helpers.py +77 -8
  14. tinygrad/lazy.py +26 -26
  15. tinygrad/multi.py +13 -9
  16. tinygrad/nn/__init__.py +1 -1
  17. tinygrad/nn/datasets.py +2 -1
  18. tinygrad/nn/state.py +3 -4
  19. tinygrad/ops.py +49 -16
  20. tinygrad/renderer/__init__.py +8 -4
  21. tinygrad/renderer/assembly.py +93 -100
  22. tinygrad/renderer/cstyle.py +47 -42
  23. tinygrad/renderer/llvmir.py +30 -30
  24. tinygrad/runtime/__init__.py +0 -0
  25. tinygrad/runtime/autogen/amd_gpu.py +11504 -1
  26. tinygrad/runtime/autogen/comgr.py +36 -10
  27. tinygrad/runtime/autogen/hsa.py +146 -14
  28. tinygrad/runtime/autogen/io_uring.py +1486 -0
  29. tinygrad/runtime/autogen/nv_gpu.py +269 -0
  30. tinygrad/runtime/driver/__init__.py +0 -0
  31. tinygrad/runtime/driver/hip_comgr.py +20 -11
  32. tinygrad/runtime/graph/__init__.py +0 -0
  33. tinygrad/runtime/graph/clang.py +3 -2
  34. tinygrad/runtime/graph/cuda.py +2 -2
  35. tinygrad/runtime/graph/hcq.py +122 -78
  36. tinygrad/runtime/ops_amd.py +302 -316
  37. tinygrad/runtime/ops_cuda.py +3 -3
  38. tinygrad/runtime/ops_disk.py +70 -5
  39. tinygrad/runtime/ops_gpu.py +2 -2
  40. tinygrad/runtime/ops_metal.py +5 -6
  41. tinygrad/runtime/ops_npy.py +1 -1
  42. tinygrad/runtime/ops_nv.py +161 -166
  43. tinygrad/runtime/ops_python.py +20 -16
  44. tinygrad/shape/__init__.py +0 -0
  45. tinygrad/shape/shapetracker.py +5 -2
  46. tinygrad/shape/symbolic.py +1 -3
  47. tinygrad/shape/view.py +34 -19
  48. tinygrad/tensor.py +219 -135
  49. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
  50. tinygrad-0.9.1.dist-info/RECORD +63 -0
  51. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  52. tinygrad/runtime/driver/hsa.py +0 -143
  53. tinygrad/runtime/graph/hsa.py +0 -171
  54. tinygrad/runtime/ops_hsa.py +0 -278
  55. tinygrad-0.9.0.dist-info/RECORD +0 -60
  56. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
  57. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
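
The hunks reproduced below cover tinygrad/runtime/ops_nv.py (+161 -166), the userspace NVIDIA backend; the other files' diffs are not shown here.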
@@ -1,8 +1,9 @@
  from __future__ import annotations
- import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
- from typing import Tuple, List, Any, cast
- from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
- from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
+ import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
+ from typing import Tuple, List, Any
+ from dataclasses import dataclass
+ from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+ from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
  from tinygrad.renderer.cstyle import NVRenderer
  from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
  import tinygrad.runtime.autogen.cuda as cuda
@@ -28,20 +29,20 @@ def rm_alloc(fd, clss, root, parant, params):
    made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
                                    pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
    nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-   if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}")
+   if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
    return made

  def rm_control(fd, cmd, client, obj, params):
    made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
                                    params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
    nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-   if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
+   if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
    return made

  def uvm_ioctl(cmd, sttyp, fd, **kwargs):
    ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
-   if ret != 0: raise RuntimeError(f"uvm_ioctl returned {ret}")
-   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl struct returned {made.rmStatus}")
+   if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
+   if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {made.rmStatus}: {nv_gpu.nv_status_codes.get(made.rmStatus, 'Unknown error')}")
    return made

  def make_uvm_type():
@@ -84,25 +85,36 @@ class NVCompiler(Compiler):
      return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)

  class HWQueue:
-   def __init__(self): self.q, self.binded_device, self.next_cmd_index = [], None, 0
+   def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
    def __del__(self):
      if self.binded_device is not None:
        self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
        self.binded_device._gpu_free(self.hw_page)

-   def ptr(self) -> int: return self.next_cmd_index
+   def _mark_command_end(self):
+     self.cmd_offsets.append(len(self.q))
+     return self
+   def __len__(self): return len(self.cmd_offsets) - 1
+
+   def memory_barrier(self): return self._mark_command_end()

    def wait(self, signal, value=0):
      self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                 (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-     self.next_cmd_index += 1
-     return self
+     return self._mark_command_end()
+
+   def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)

    def signal(self, signal, value=0, timestamp=False):
      self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                 (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
      self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-     self.next_cmd_index += 1
+     return self._mark_command_end()
+
+   def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
+   def update_wait(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
+     if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
      return self

    def bind(self, device: NVDevice):
@@ -114,82 +126,90 @@ class HWQueue:
      # From now on, the queue is on the device for faster submission.
      self.q = hw_view # type: ignore

-   def _submit(self, dev, gpu_ring, put_value, gpfifo_entries, gpfifo_token, gpu_ring_controls):
+   def _submit(self, dev, gpfifo:GPFifo):
+     if len(self.q) == 0: return
+
      if dev == self.binded_device: cmdq_addr = self.hw_page.base
      else:
+       if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
+         assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
+                gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
+         dev.cmdq_wptr = 0
+
        dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
        cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
        dev.cmdq_wptr += len(self.q) * 4

-     gpu_ring[put_value % gpfifo_entries] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
-     gpu_ring_controls.GPPut = (put_value + 1) % gpfifo_entries
-     dev.gpu_mmio[0x90 // 4] = gpfifo_token
-     return put_value + 1
+     gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
+     dev.gpu_mmio[0x90 // 4] = gpfifo.token
+     gpfifo.put_value += 1

  class HWComputeQueue(HWQueue):
    def __init__(self):
      super().__init__()
-     self.ptr_to_qmd = {}
+     self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}

    def copy_from_cpu(self, gpuaddr, data):
      self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
      self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
      self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
-     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + [x for x in data]
-     self.next_cmd_index += 1
-     return self
-
-   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
-     prg.qmd.cta_raster_width, prg.qmd.cta_raster_height, prg.qmd.cta_raster_depth = global_size
-     prg.qmd.cta_thread_dimension0, prg.qmd.cta_thread_dimension1, prg.qmd.cta_thread_dimension2 = local_size
-     prg.qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
-     prg.qmd.constant_buffer_addr_upper_0 = kernargs >> 32
-     if signal is not None:
-       prg.qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
-       prg.qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
-       prg.qmd.release0_payload_lower = signal_value & 0xffffffff
-       prg.qmd.release0_payload_upper = signal_value >> 32
-       prg.qmd.release0_enable = 1
-     else: prg.qmd.release0_enable = 0
+     self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
+     return self._mark_command_end()

+   def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0):
      ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
-     self.ptr_to_qmd[self.ptr()] = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+     self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+     self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+     self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')

-     if chain_exec_ptr is None:
+     qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
+     qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+     qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
+     qmd.constant_buffer_addr_upper_0 = kernargs >> 32
+     if signal is not None:
+       qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
+       qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
+       qmd.release0_payload_lower = signal_value & 0xffffffff
+       qmd.release0_payload_upper = signal_value >> 32
+       qmd.release0_enable = 1
+
+     if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
        self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
        self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
        self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
      else:
-       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_pointer = qmd_addr >> 8
-       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_action = 1
-       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_prefetch = 1
-       self.ptr_to_qmd[chain_exec_ptr].dependent_qmd0_enable = 1
-     self.next_cmd_index += 1
-     return self
+       prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+       prev_qmd.dependent_qmd0_action = 1
+       prev_qmd.dependent_qmd0_prefetch = 1
+       prev_qmd.dependent_qmd0_enable = 1
+     return self._mark_command_end()

-   def update_exec(self, cmd_ptr, global_size, local_size):
+   def update_exec(self, cmd_idx, global_size, local_size):
      # Patch the exec cmd with new launch dims
-     qmd = self.ptr_to_qmd[cmd_ptr]
-     qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
-     qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+     self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+     self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)

-   def submit(self, dev:NVDevice):
-     if len(self.q) == 0: return
-     dev.compute_put_value = self._submit(dev, dev.compute_gpu_ring, dev.compute_put_value, dev.compute_gpfifo_entries,
-                                          dev.compute_gpfifo_token, dev.compute_gpu_ring_controls)
+   def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)

  class HWCopyQueue(HWQueue):
    def copy(self, dest, src, copy_size):
      self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
      self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
      self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-     self.next_cmd_index += 1
+     return self._mark_command_end()
+
+   def signal(self, signal, value=0):
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
+     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
+     return self._mark_command_end()
+
+   def update_signal(self, cmd_idx, signal=None, value=None):
+     if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
+     if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
      return self

-   def submit(self, dev:NVDevice):
-     if len(self.q) == 0: return
-     dev.dma_put_value = self._submit(dev, dev.dma_gpu_ring, dev.dma_put_value, dev.dma_gpfifo_entries,
-                                      dev.dma_gpfifo_token, dev.dma_gpu_ring_controls)
+   def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)

  SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
  class NVProgram:
@@ -202,7 +222,7 @@ class NVProgram:
          print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
        except Exception as e: print("failed to disasm cubin", str(e))

-     self.global_init, self.shmem_usage = None, 0
+     self.rel_info, self.global_init, self.shmem_usage = None, None, 0
      constant_buffers_data = {}

      if MOCKGPU:
@@ -221,6 +241,7 @@ class NVProgram:
        if match := re.match(r'\.nv\.constant(\d+)', section_name):
          constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
        if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+       elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
        elif section_name == ".nv.info":
          section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
          for i in range(sh_size // 12):
@@ -231,11 +252,10 @@ class NVProgram:
      self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32

      # Load program and constant buffers (if any)
-     self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
-                            sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
+     # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
+     self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
+                            round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
      self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
-     for st in range(0, len(self.program), 4095):
-       HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)

      self.constbuffer_0 = [0] * 88
      self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
@@ -246,26 +266,40 @@ class NVProgram:
                              cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
                              shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
                              max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-                             barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
-                             program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
+                             barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
+                             program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
                              program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
                              constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)

      # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
      self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
-     self.kernargs_segment_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
+     self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
      self.kernargs_offset = 0x160

      # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
      if 0 in constant_buffers_data: constant_buffers_data.pop(0)

      off = round_up(self.program.nbytes, 128)
+
+     if self.rel_info is not None:
+       assert self.global_init is not None
+       global_init_addr = self.lib_gpu.base + off
+       for rel_i in range(0, len(self.rel_info), 4):
+         if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
+         elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
+         else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
+
+     HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
+     for st in range(0, len(self.program), 4095):
+       HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
+
      if self.global_init is not None:
-       # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
-       assert 4 in constant_buffers_data and constant_buffers_data[4].nbytes == 8
        HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
-       constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
        off += round_up(self.global_init.nbytes, 128)
+       if 4 in constant_buffers_data: # >= 12.4
+         # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
+         assert constant_buffers_data[4].nbytes == 8
+         constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')

      for i,data in constant_buffers_data.items():
        self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
@@ -288,77 +322,52 @@ class NVProgram:
      if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
        raise RuntimeError("Invalid global/local dims")

-     if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_segment_size):
+     if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
        self.device.kernargs_ptr = self.device.kernargs_page.base

      # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
      if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
-     kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + [val for val in vals]
+     kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
+
+     sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)

      queue = HWComputeQueue()
      queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-     if wait: queue.signal(self.device.time_event_st, timestamp=True)
+     if wait or PROFILE: queue.timestamp(sig_st)
      queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
      queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
-     if wait: queue.signal(self.device.time_event_en, timestamp=True)
+     if wait or PROFILE: queue.timestamp(sig_en)
      queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
      self.device.timeline_value += 1
-     self.device.kernargs_ptr += self.kernargs_segment_size
+     self.device.kernargs_ptr += self.kernargs_alloc_size

+     if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
      if wait:
        self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-       return (self.device.time_event_en[1] - self.device.time_event_st[1]) / 1e9
-
- class NVAllocator(LRUAllocator):
-   def __init__(self, device:NVDevice):
-     self.device = device
-     self.b = [self.device._gpu_host_alloc(2 << 20) for _ in range(16)]
-     self.b_timeline = [0] * len(self.b)
-     self.b_next = 0
-     super().__init__()
+       return (sig_en[1] - sig_st[1]) / 1e9
+
+ class NVAllocator(HCQCompatAllocator):
+   def __init__(self, device:NVDevice): super().__init__(device)

    def _alloc(self, size:int, options:BufferOptions):
      if options.host: return self.device._gpu_host_alloc(size)
-     else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
-
-   def _free(self, gpumem, options:BufferOptions):
-     NVDevice.synchronize_system()
-     if options.host: self.device._gpu_host_free(gpumem)
-     else: self.device._gpu_free(gpumem)
-
-   def copyin(self, dest, src: memoryview):
-     for i in range(0, src.nbytes, self.b[0].length):
-       self.b_next = (self.b_next + 1) % len(self.b)
-       NVDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
-       ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].length, src.nbytes-i))
-       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                    .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
-                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-       self.b_timeline[self.b_next] = self.device.timeline_value
-       self.device.timeline_value += 1
-
-   def copyout(self, dest:memoryview, src):
-     NVDevice.synchronize_system()
-     for i in range(0, dest.nbytes, self.b[0].length):
-       HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                    .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].length, dest.nbytes-i)) \
-                    .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-       NVDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
-       self.device.timeline_value += 1
-
-       ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-
-   def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
-     src_dev._gpu_map(dest)
-     HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
-                  .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
-                  .copy(dest.va_addr, src.va_addr, sz) \
-                  .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
-     HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
-     src_dev.timeline_value += 1
+     return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
+
+   def _free(self, opaque, options:BufferOptions):
+     self.device.synchronize()
+     if options.host: self.device._gpu_host_free(opaque)
+     else: self.device._gpu_free(opaque)
+
+ @dataclass
+ class GPFifo:
+   ring: memoryview
+   controls: nv_gpu.AmpereAControlGPFifo
+   entries_count: int
+   token: int
+   put_value: int = 0

  MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
- class NVDevice(Compiled):
+ class NVDevice(HCQCompatCompiled):
    root = None
    fd_ctl: int = -1
    fd_uvm: int = -1
@@ -383,7 +392,7 @@ class NVDevice(Compiled):
      return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)

    def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
-     size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
+     size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
      alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
                                                        attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
                                                              ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
@@ -442,7 +451,7 @@ class NVDevice(Compiled):

      # NOTE: va_addr is set to make rawbufs compatable with AMD.
      return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-                                        gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base)
+                                        gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size)

    def _gpu_map(self, mem):
      if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
@@ -460,10 +469,7 @@ class NVDevice(Compiled):
      fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
      NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
      uvm.initialize(self.fd_uvm)
-     try:
-       uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)
-     except RuntimeError:
-       pass # this error is okay, CUDA hits it too
+     with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too

      NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
      nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
@@ -472,7 +478,7 @@ class NVDevice(Compiled):
      self.device_id = int(device.split(":")[1]) if ":" in device else 0
      self.fd_dev = self._new_gpu_fd()

-     assert NVDevice.gpus_info[self.device_id].valid
+     assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
      gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
      rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
      device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
@@ -483,8 +489,7 @@ class NVDevice(Compiled):
      self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
      self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
      self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
-     gpu_mmio_ptr = self._gpu_map_to_cpu(self.usermode, 0x10000, flags=2)
-     self.gpu_mmio = to_mv(gpu_mmio_ptr, 0x10000).cast("I")
+     self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

      boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
                     (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
@@ -506,35 +511,24 @@ class NVDevice(Compiled):
        uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))

      if NVDevice.signals_page is None:
-       NVDevice.signals_page = self._gpu_system_alloc(0x10000, map_to_cpu=True)
+       NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
        NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
      else: self._gpu_map(NVDevice.signals_page)

      channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
      channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

-     gpfifo = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
+     gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)

      ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
      ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

-     self.compute_gpfifo_entries: int = 0x10000
-     self.compute_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0, entries=self.compute_gpfifo_entries)
-     self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, self.compute_gpfifo_entries * 8).cast("Q")
-     self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + self.compute_gpfifo_entries * 8)
-     self.compute_put_value: int = 0
-
-     self.dma_gpfifo_entries: int = 0x10000
-     self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
-     self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, self.dma_gpfifo_entries * 8).cast("Q")
-     self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + self.dma_gpfifo_entries * 8)
-     self.dma_put_value: int = 0
+     self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
+     self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)

      en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
      rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)

-     self.timeline_value: int = 1
-     self.timeline_signal, self._shadow_timeline_signal = NVDevice._get_signal(), NVDevice._get_signal()
      self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()

      self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
@@ -546,49 +540,49 @@ class NVDevice(Compiled):

      self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix

-     from tinygrad.runtime.graph.hcq import HCQGraph
      super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
-                      functools.partial(NVProgram, self), functools.partial(HCQGraph, NVDevice, HWComputeQueue, HWCopyQueue))
+                      functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])

      self._cmdq_setup_compute_gpfifo()
      self._cmdq_setup_dma_gpfifo()

      NVDevice.devices.append(self)

-   def synchronize(self):
-     NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
-     self.cmdq_wptr = 0
-
-     if self.timeline_value > (1 << 63):
-       self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
-       self.timeline_signal[0], self.timeline_value = 0, 1
-       cast(NVAllocator, self.allocator).b_timeline = [0] * len(cast(NVAllocator, self.allocator).b)
+   @classmethod
+   def _read_signal(self, sig): return sig[0]

-   @staticmethod
-   def synchronize_system():
-     for d in NVDevice.devices: d.synchronize()
+   @classmethod
+   def _read_timestamp(self, sig): return sig[1]

    @classmethod
    def _set_signal(self, sig, value): sig[0] = value

    @classmethod
-   def _get_signal(self, value=0) -> memoryview:
+   def _get_signal(self, value=0, **kwargs) -> memoryview:
      self._set_signal(sig := self.signals_pool.pop(), value)
      return sig

    @classmethod
    def _wait_signal(self, signal, value=0, timeout=10000):
      start_time = time.time() * 1000
-     sem_value = signal[0]
-     while sem_value < value:
-       sem_value = signal[0]
-       if time.time() * 1000 - start_time > timeout: raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+     while time.time() * 1000 - start_time < timeout:
+       if signal[0] >= value: return
+     raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+
+   def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
+
+   def synchronize(self):
+     NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+     self.cmdq_wptr = 0
+
+     if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
+     if PROFILE: self._prof_process_events()

-   def _gpu_fifo_setup(self, gpfifo, ctxshare, channel_group, offset, entries=0x400):
+   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
      notifier = self._gpu_system_alloc(48 << 20)
-     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo.hMemory,
-                                                            gpFifoOffset=gpfifo.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-                                                            hUserdMemory=(ctypes.c_uint32*8)(gpfifo.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+                                                            gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+                                                            hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
      gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
      rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
      rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
@@ -601,7 +595,8 @@ class NVDevice(Compiled):
      uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                           hChannel=gpfifo, base=channel_base, length=0x4000000)

-     return ws_token_params.workSubmitToken
+     return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+                   controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))

    def _cmdq_setup_compute_gpfifo(self):
      self.slm_per_thread = 0x900
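
Note on the queue changes above: HWQueue used to track only a running next_cmd_index; 0.9.1 records the starting word offset of every packed command in cmd_offsets, so a recorded queue can later be patched in place (update_wait / update_signal / update_exec) and resubmitted, which is what the HCQCompat graph runner relies on. A minimal sketch of the pattern follows; the 3-word packet format, the opcode 0xCAFE, and the class name are made up for illustration, not NV's real method encoding.

    class PatchableQueue:
      def __init__(self):
        self.q: list[int] = []   # flat list of 32-bit command words
        self.cmd_offsets = [0]   # word offset where each recorded command starts

      def _mark_command_end(self):
        self.cmd_offsets.append(len(self.q))
        return self

      def __len__(self): return len(self.cmd_offsets) - 1  # number of recorded commands

      def signal(self, addr:int, value:int):
        self.q += [0xCAFE, addr, value]  # hypothetical packet: [opcode, address, value]
        return self._mark_command_end()

      def update_signal(self, cmd_idx:int, addr=None, value=None):
        # Patch a recorded command in place instead of re-encoding the whole queue.
        off = self.cmd_offsets[cmd_idx]
        if addr is not None: self.q[off+1] = addr
        if value is not None: self.q[off+2] = value
        return self

    q = PatchableQueue().signal(0x1000, 1).signal(0x2000, 2)
    q.update_signal(0, value=42)  # len(q) == 2; q.q == [0xCAFE, 0x1000, 42, 0xCAFE, 0x2000, 2]

The real queues in this diff patch byte ranges with array.array('I', ...) slice assignments and additionally keep per-command memoryviews into the QMD (cmd_idx_to_global_dims / cmd_idx_to_local_dims), so launch dimensions can be rewritten without touching the packet stream at all.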