tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +78 -90
- tinygrad/codegen/linearizer.py +237 -169
- tinygrad/codegen/uops.py +278 -242
- tinygrad/device.py +147 -10
- tinygrad/dtype.py +7 -7
- tinygrad/engine/graph.py +16 -16
- tinygrad/engine/jit.py +39 -36
- tinygrad/engine/realize.py +6 -5
- tinygrad/engine/schedule.py +15 -7
- tinygrad/engine/search.py +6 -3
- tinygrad/function.py +17 -23
- tinygrad/helpers.py +77 -8
- tinygrad/lazy.py +26 -26
- tinygrad/multi.py +13 -9
- tinygrad/nn/__init__.py +1 -1
- tinygrad/nn/datasets.py +2 -1
- tinygrad/nn/state.py +3 -4
- tinygrad/ops.py +49 -16
- tinygrad/renderer/__init__.py +8 -4
- tinygrad/renderer/assembly.py +93 -100
- tinygrad/renderer/cstyle.py +47 -42
- tinygrad/renderer/llvmir.py +30 -30
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +11504 -1
- tinygrad/runtime/autogen/comgr.py +36 -10
- tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +20 -11
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +3 -2
- tinygrad/runtime/graph/cuda.py +2 -2
- tinygrad/runtime/graph/hcq.py +122 -78
- tinygrad/runtime/ops_amd.py +302 -316
- tinygrad/runtime/ops_cuda.py +3 -3
- tinygrad/runtime/ops_disk.py +70 -5
- tinygrad/runtime/ops_gpu.py +2 -2
- tinygrad/runtime/ops_metal.py +5 -6
- tinygrad/runtime/ops_npy.py +1 -1
- tinygrad/runtime/ops_nv.py +161 -166
- tinygrad/runtime/ops_python.py +20 -16
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +5 -2
- tinygrad/shape/symbolic.py +1 -3
- tinygrad/shape/view.py +34 -19
- tinygrad/tensor.py +219 -135
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0.dist-info/RECORD +0 -60
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_nv.py
CHANGED
```diff
@@ -1,8 +1,9 @@
 from __future__ import annotations
-import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
-from typing import Tuple, List, Any
-from
-from tinygrad.
+import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
+from typing import Tuple, List, Any
+from dataclasses import dataclass
+from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
 import tinygrad.runtime.autogen.cuda as cuda
@@ -28,20 +29,20 @@ def rm_alloc(fd, clss, root, parant, params):
   made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
                                   pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
   nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
-  if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}")
+  if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
   return made

 def rm_control(fd, cmd, client, obj, params):
   made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
                                   params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
   nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
-  if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
+  if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}: {nv_gpu.nv_status_codes.get(made.status, 'Unknown error')}")
   return made

 def uvm_ioctl(cmd, sttyp, fd, **kwargs):
   ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
-  if ret != 0: raise RuntimeError(f"
-  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl
+  if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
+  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {made.rmStatus}: {nv_gpu.nv_status_codes.get(made.rmStatus, 'Unknown error')}")
   return made

 def make_uvm_type():
```
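The new error paths decode raw NVIDIA status codes through `nv_gpu.nv_status_codes` instead of raising a bare integer. Below is a minimal standalone sketch of the same decode-with-fallback pattern; `NV_STATUS_CODES`, its entries, and `check_status` are illustrative stand-ins, not the autogen table or the tinygrad API:

```python
# Sketch of the error-decoding pattern above: map a numeric driver status
# to a readable name, falling back to 'Unknown error' for unknown codes.
# NV_STATUS_CODES is a made-up stand-in for nv_gpu.nv_status_codes.
NV_STATUS_CODES = {0x00: "NV_OK", 0x57: "NV_ERR_INVALID_STATE"}

def check_status(call_name: str, status: int) -> None:
  if status != 0:
    raise RuntimeError(f"{call_name} returned {status}: {NV_STATUS_CODES.get(status, 'Unknown error')}")

check_status("rm_alloc", 0)            # status 0 passes silently
try: check_status("rm_control", 0x57)  # nonzero status raises with a decoded name
except RuntimeError as e: print(e)     # rm_control returned 87: NV_ERR_INVALID_STATE
```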
```diff
@@ -84,25 +85,36 @@ class NVCompiler(Compiler):
     return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)

 class HWQueue:
-  def __init__(self): self.q, self.binded_device, self.
+  def __init__(self): self.q, self.binded_device, self.cmd_offsets = [], None, [0]
   def __del__(self):
     if self.binded_device is not None:
       self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
       self.binded_device._gpu_free(self.hw_page)

-  def
+  def _mark_command_end(self):
+    self.cmd_offsets.append(len(self.q))
+    return self
+  def __len__(self): return len(self.cmd_offsets) - 1
+
+  def memory_barrier(self): return self._mark_command_end()

   def wait(self, signal, value=0):
     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
-    self.
-
+    return self._mark_command_end()
+
+  def timestamp(self, signal): return HWQueue.signal(self, signal, timestamp=True)

   def signal(self, signal, value=0, timestamp=False):
     self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
-    self.
+    return self._mark_command_end()
+
+  def update_signal(self, cmd_idx, signal=None, value=None): return self.update_wait(cmd_idx, signal, value) # the same offsets and commands
+  def update_wait(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64_le(mv_address(signal))])
+    if value is not None: self.q[(valoff:=self.cmd_offsets[cmd_idx]+3):valoff+2] = array.array('I', [*nvdata64_le(value)])
     return self

   def bind(self, device: NVDevice):
```
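The `cmd_offsets` list added here records where each command starts in the flat word list `self.q`, which is what lets `update_wait`/`update_signal` patch an already-recorded command in place (e.g. for the HCQ graph runner) instead of rebuilding the queue. A toy sketch of that bookkeeping; `ToyQueue` and its fake command encoding are illustrative, not the tinygrad API:

```python
# Standalone sketch of the cmd_offsets bookkeeping above: every command
# records its start offset so update_*() can patch operands in place.
class ToyQueue:
  def __init__(self): self.q, self.cmd_offsets = [], [0]

  def _mark_command_end(self):
    self.cmd_offsets.append(len(self.q))
    return self

  def __len__(self): return len(self.cmd_offsets) - 1  # number of recorded commands

  def signal(self, addr, value):
    self.q += [0xCAFE, addr, value]  # fake header word, then two operands
    return self._mark_command_end()

  def update_signal(self, cmd_idx, value):
    # the operand layout is fixed, so the patch is a constant offset from the command start
    self.q[self.cmd_offsets[cmd_idx] + 2] = value

q = ToyQueue().signal(0x1000, 1).signal(0x2000, 2)
q.update_signal(1, 42)  # rewrite the second command's payload in place
assert q.q[5] == 42 and len(q) == 2
```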
```diff
@@ -114,82 +126,90 @@ class HWQueue:
     # From now on, the queue is on the device for faster submission.
     self.q = hw_view # type: ignore

-  def _submit(self, dev,
+  def _submit(self, dev, gpfifo:GPFifo):
+    if len(self.q) == 0: return
+
     if dev == self.binded_device: cmdq_addr = self.hw_page.base
     else:
+      if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
+        assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
+               gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
+        dev.cmdq_wptr = 0
+
       dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
       cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
       dev.cmdq_wptr += len(self.q) * 4

-
-
-    dev.gpu_mmio[0x90 // 4] =
-
+    gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
+    gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
+    dev.gpu_mmio[0x90 // 4] = gpfifo.token
+    gpfifo.put_value += 1

 class HWComputeQueue(HWQueue):
   def __init__(self):
     super().__init__()
-    self.
+    self.cmd_idx_to_qmd, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}

   def copy_from_cpu(self, gpuaddr, data):
     self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
     self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
     self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
-    self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] +
-    self.
-    return self
-
-  def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
-    prg.qmd.cta_raster_width, prg.qmd.cta_raster_height, prg.qmd.cta_raster_depth = global_size
-    prg.qmd.cta_thread_dimension0, prg.qmd.cta_thread_dimension1, prg.qmd.cta_thread_dimension2 = local_size
-    prg.qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
-    prg.qmd.constant_buffer_addr_upper_0 = kernargs >> 32
-    if signal is not None:
-      prg.qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
-      prg.qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
-      prg.qmd.release0_payload_lower = signal_value & 0xffffffff
-      prg.qmd.release0_payload_upper = signal_value >> 32
-      prg.qmd.release0_enable = 1
-    else: prg.qmd.release0_enable = 0
+    self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + list(data)
+    return self._mark_command_end()

+  def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0):
     ctypes.memmove(qmd_addr:=(kernargs + round_up(prg.constbuf_0_size, 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
-    self.
+    self.cmd_idx_to_qmd[len(self)] = qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
+    self.cmd_idx_to_global_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, 12).cast('I')
+    self.cmd_idx_to_local_dims[len(self)] = to_mv(qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, 6).cast('H')

-
+    qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
+    qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+    qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
+    qmd.constant_buffer_addr_upper_0 = kernargs >> 32
+    if signal is not None:
+      qmd.release0_address_lower = ctypes.addressof(from_mv(signal)) & 0xffffffff
+      qmd.release0_address_upper = ctypes.addressof(from_mv(signal)) >> 32
+      qmd.release0_payload_lower = signal_value & 0xffffffff
+      qmd.release0_payload_upper = signal_value >> 32
+      qmd.release0_enable = 1
+
+    if (prev_qmd:=self.cmd_idx_to_qmd.get(len(self) - 1)) is None:
       self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
       self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
     else:
-
-
-
-
-      self.
-    return self
+      prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8
+      prev_qmd.dependent_qmd0_action = 1
+      prev_qmd.dependent_qmd0_prefetch = 1
+      prev_qmd.dependent_qmd0_enable = 1
+    return self._mark_command_end()

-  def update_exec(self,
+  def update_exec(self, cmd_idx, global_size, local_size):
     # Patch the exec cmd with new launch dims
-
-
-    qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
+    self.cmd_idx_to_global_dims[cmd_idx][:] = array.array('I', global_size)
+    self.cmd_idx_to_local_dims[cmd_idx][:] = array.array('H', local_size)

-  def submit(self, dev:NVDevice):
-    if len(self.q) == 0: return
-    dev.compute_put_value = self._submit(dev, dev.compute_gpu_ring, dev.compute_put_value, dev.compute_gpfifo_entries,
-                                         dev.compute_gpfifo_token, dev.compute_gpu_ring_controls)
+  def submit(self, dev:NVDevice): self._submit(dev, dev.compute_gpfifo)

 class HWCopyQueue(HWQueue):
   def copy(self, dest, src, copy_size):
     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
-    self.
+    return self._mark_command_end()
+
+  def signal(self, signal, value=0):
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
+    return self._mark_command_end()
+
+  def update_signal(self, cmd_idx, signal=None, value=None):
+    if signal is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(mv_address(signal))])
+    if value is not None: self.q[self.cmd_offsets[cmd_idx]+3] = value
     return self

-  def submit(self, dev:NVDevice):
-    if len(self.q) == 0: return
-    dev.dma_put_value = self._submit(dev, dev.dma_gpu_ring, dev.dma_put_value, dev.dma_gpfifo_entries,
-                                     dev.dma_gpfifo_token, dev.dma_gpu_ring_controls)
+  def submit(self, dev:NVDevice): self._submit(dev, dev.dma_gpfifo)

 SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
 class NVProgram:
```
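`_submit` now pushes work through a `GPFifo`: it packs the command buffer address and word count into one 64-bit ring entry, advances the put pointer, and rings the doorbell register. A standalone model of that ring arithmetic; the entry encoding is copied from the line added above, while `ToyGPFifo`, `submit`, and the doorbell comment are illustrative:

```python
# Standalone model of the GPFifo submission in _submit() above.
class ToyGPFifo:
  def __init__(self, entries_count: int):
    self.entries_count, self.ring, self.put_value = entries_count, [0] * entries_count, 0

def submit(fifo: ToyGPFifo, cmdq_addr: int, n_words: int) -> None:
  # low bits carry the 4-byte-aligned address, high bits the length, bit 41 a flag
  fifo.ring[fifo.put_value % fifo.entries_count] = (cmdq_addr//4 << 2) | (n_words << 42) | (1 << 41)
  fifo.put_value += 1
  # real hardware is then notified with an MMIO doorbell write, as in:
  # dev.gpu_mmio[0x90 // 4] = gpfifo.token

fifo = ToyGPFifo(entries_count=4)
submit(fifo, cmdq_addr=0x2000, n_words=8)
assert fifo.ring[0] == (0x2000//4 << 2) | (8 << 42) | (1 << 41) and fifo.put_value == 1
```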
```diff
@@ -202,7 +222,7 @@ class NVProgram:
         print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
       except Exception as e: print("failed to disasm cubin", str(e))

-    self.global_init, self.shmem_usage = None, 0
+    self.rel_info, self.global_init, self.shmem_usage = None, None, 0
     constant_buffers_data = {}

     if MOCKGPU:
@@ -221,6 +241,7 @@ class NVProgram:
       if match := re.match(r'\.nv\.constant(\d+)', section_name):
         constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
       if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
+      elif section_name.startswith(".rel.text"): self.rel_info = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast('I')
       elif section_name == ".nv.info":
         section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
         for i in range(sh_size // 12):
@@ -231,11 +252,10 @@ class NVProgram:
     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32

     # Load program and constant buffers (if any)
-
-
+    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
+    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
+                           round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
-    for st in range(0, len(self.program), 4095):
-      HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)

     self.constbuffer_0 = [0] * 88
     self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
```
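The new `lib_sz` computation pads the program to 128 bytes, reserves at least 4KB after it (the prefetch-fault note above), and rounds the whole allocation to a page. A worked example with made-up sizes; `round_up` below mirrors the helper of the same name from `tinygrad.helpers`:

```python
# Worked example of the sizing arithmetic above: round_up pads to an
# alignment, and max(0x1000, ...) guarantees at least 4KB of slack after
# the program so instruction prefetch can't run off the allocation.
def round_up(x: int, a: int) -> int: return ((x + a - 1) // a) * a

program_nbytes, constbuf_nbytes, global_init_nbytes = 0x1234, 0x40, 0
lib_sz = round_up(round_up(program_nbytes, 128) +
                  max(0x1000, round_up(constbuf_nbytes, 128) + round_up(global_init_nbytes, 128)), 0x1000)
assert round_up(0x1234, 128) == 0x1280  # program padded to a 128B boundary
assert lib_sz == 0x3000                 # 0x1280 + 0x1000 slack, rounded up to a 4KB page
```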
```diff
@@ -246,26 +266,40 @@ class NVProgram:
       cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
       shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
       max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-      barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=
-      program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
+      barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
+      program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
       program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
       constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)

     # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
     self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
-    self.
+    self.kernargs_alloc_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
     self.kernargs_offset = 0x160

     # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
     if 0 in constant_buffers_data: constant_buffers_data.pop(0)

     off = round_up(self.program.nbytes, 128)
+
+    if self.rel_info is not None:
+      assert self.global_init is not None
+      global_init_addr = self.lib_gpu.base + off
+      for rel_i in range(0, len(self.rel_info), 4):
+        if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
+        elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
+        else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")
+
+    HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
+    for st in range(0, len(self.program), 4095):
+      HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
+
     if self.global_init is not None:
-      # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
-      assert 4 in constant_buffers_data and constant_buffers_data[4].nbytes == 8
       HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
-      constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
       off += round_up(self.global_init.nbytes, 128)
+      if 4 in constant_buffers_data: # >= 12.4
+        # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
+        assert constant_buffers_data[4].nbytes == 8
+        constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')

     for i,data in constant_buffers_data.items():
       self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
```
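The new relocation pass walks `.rel.text` entries four words at a time and splits the `.nv.global.init` address into high and low 32-bit instruction operands (types 0x39/0x38, per the comments above). A self-contained sketch of that fixup; the program and reloc tables below are fabricated for illustration:

```python
# Minimal sketch of the relocation fixup above: a 64-bit address is split
# into two 32-bit operand patches, one per relocation entry. The 0x38/0x39
# type values and the "+ 1 word" operand position follow the diff.
import array

program = array.array('I', [0] * 8)            # fake SASS, as 32-bit words
rel_info = array.array('I', [8, 0, 0x39, 0,    # patch hi 32 bits at byte offset 8
                             16, 0, 0x38, 0])  # patch lo 32 bits at byte offset 16
global_init_addr = 0x7f12_3456_7800

for rel_i in range(0, len(rel_info), 4):
  off_words = rel_info[rel_i] // 4 + 1         # operand lives one word past the offset
  if rel_info[rel_i+2] == 0x39: program[off_words] = global_init_addr >> 32          # hi half
  elif rel_info[rel_i+2] == 0x38: program[off_words] = global_init_addr & 0xffffffff # lo half

assert (program[3] << 32) | program[5] == global_init_addr  # reassembles to the full address
```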
```diff
@@ -288,77 +322,52 @@ class NVProgram:
     if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
       raise RuntimeError("Invalid global/local dims")

-    if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.
+    if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
       self.device.kernargs_ptr = self.device.kernargs_page.base

     # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
     if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
-    kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] +
+    kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
+
+    sig_st, sig_en = (self.device._get_signal(), self.device._get_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)

     queue = HWComputeQueue()
     queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
-    if wait: queue.
+    if wait or PROFILE: queue.timestamp(sig_st)
     queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
     queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
-    if wait: queue.
+    if wait or PROFILE: queue.timestamp(sig_en)
     queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
     self.device.timeline_value += 1
-    self.device.kernargs_ptr += self.
+    self.device.kernargs_ptr += self.kernargs_alloc_size

+    if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
     if wait:
       self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
-      return (
-
-class NVAllocator(
-  def __init__(self, device:NVDevice):
-    self.device = device
-    self.b = [self.device._gpu_host_alloc(2 << 20) for _ in range(16)]
-    self.b_timeline = [0] * len(self.b)
-    self.b_next = 0
-    super().__init__()
+      return (sig_en[1] - sig_st[1]) / 1e9
+
+class NVAllocator(HCQCompatAllocator):
+  def __init__(self, device:NVDevice): super().__init__(device)

   def _alloc(self, size:int, options:BufferOptions):
     if options.host: return self.device._gpu_host_alloc(size)
-
-
-  def _free(self,
-
-    if options.host: self.device._gpu_host_free(
-    else: self.device._gpu_free(
-
-
-
-
-
-
-
-
-        .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-      self.b_timeline[self.b_next] = self.device.timeline_value
-      self.device.timeline_value += 1
-
-  def copyout(self, dest:memoryview, src):
-    NVDevice.synchronize_system()
-    for i in range(0, dest.nbytes, self.b[0].length):
-      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                   .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].length, dest.nbytes-i)) \
-                   .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-      NVDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
-      self.device.timeline_value += 1
-
-      ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-
-  def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
-    src_dev._gpu_map(dest)
-    HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
-                 .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
-                 .copy(dest.va_addr, src.va_addr, sz) \
-                 .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
-    HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
-    src_dev.timeline_value += 1
+    return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
+
+  def _free(self, opaque, options:BufferOptions):
+    self.device.synchronize()
+    if options.host: self.device._gpu_host_free(opaque)
+    else: self.device._gpu_free(opaque)
+
+@dataclass
+class GPFifo:
+  ring: memoryview
+  controls: nv_gpu.AmpereAControlGPFifo
+  entries_count: int
+  token: int
+  put_value: int = 0

 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
-class NVDevice(
+class NVDevice(HCQCompatCompiled):
   root = None
   fd_ctl: int = -1
   fd_uvm: int = -1
```
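Signals in this file are 16-byte slots viewed as two u64s: index 0 holds the semaphore payload and index 1 the device-written timestamp, which is why the timed launch path returns `(sig_en[1] - sig_st[1]) / 1e9` seconds. A sketch of that layout and arithmetic; the values below are made up (on hardware the GPU writes both slots on semaphore release, and the unit appears to be nanoseconds given the division by 1e9):

```python
# Sketch of the two-slot signal layout implied above: [0] payload, [1] timestamp.
import ctypes

sig_st = memoryview((ctypes.c_uint64 * 2)()).cast('B').cast('Q')  # one 16-byte signal
sig_en = memoryview((ctypes.c_uint64 * 2)()).cast('B').cast('Q')
sig_st[0], sig_st[1] = 1, 1_500_000_000  # payload, timestamp
sig_en[0], sig_en[1] = 2, 1_500_123_456

elapsed_s = (sig_en[1] - sig_st[1]) / 1e9  # same math as the timed-launch return above
assert abs(elapsed_s - 123456e-9) < 1e-12
```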
```diff
@@ -383,7 +392,7 @@ class NVDevice(Compiled):
     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)

   def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
-    size = round_up(size, align:=((
+    size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
       attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
             ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
@@ -442,7 +451,7 @@ class NVDevice(Compiled):

     # NOTE: va_addr is set to make rawbufs compatable with AMD.
     return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-                                       gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base)
+                                       gpuAttributesCount=1, perGpuAttributes=gpu_attrs, va_addr=va_base, size=size)

   def _gpu_map(self, mem):
     if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
@@ -460,10 +469,7 @@ class NVDevice(Compiled):
       fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
       NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
       uvm.initialize(self.fd_uvm)
-
-        uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)
-      except RuntimeError:
-        pass # this error is okay, CUDA hits it too
+      with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too

       NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
       nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
@@ -472,7 +478,7 @@ class NVDevice(Compiled):
     self.device_id = int(device.split(":")[1]) if ":" in device else 0
     self.fd_dev = self._new_gpu_fd()

-    assert NVDevice.gpus_info[self.device_id].valid
+    assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
     gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
     rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
     device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
@@ -483,8 +489,7 @@ class NVDevice(Compiled):
     self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
     self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
     self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
-
-    self.gpu_mmio = to_mv(gpu_mmio_ptr, 0x10000).cast("I")
+    self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

     boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
       (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
@@ -506,35 +511,24 @@ class NVDevice(Compiled):
       uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))

     if NVDevice.signals_page is None:
-      NVDevice.signals_page = self._gpu_system_alloc(
+      NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
       NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
     else: self._gpu_map(NVDevice.signals_page)

     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
     channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

-
+    gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)

     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
     ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

-    self.
-    self.
-    self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, self.compute_gpfifo_entries * 8).cast("Q")
-    self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + self.compute_gpfifo_entries * 8)
-    self.compute_put_value: int = 0
-
-    self.dma_gpfifo_entries: int = 0x10000
-    self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
-    self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, self.dma_gpfifo_entries * 8).cast("Q")
-    self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + self.dma_gpfifo_entries * 8)
-    self.dma_put_value: int = 0
+    self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000)
+    self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000)

     en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
     rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)

-    self.timeline_value: int = 1
-    self.timeline_signal, self._shadow_timeline_signal = NVDevice._get_signal(), NVDevice._get_signal()
     self.time_event_st, self.time_event_en = NVDevice._get_signal(), NVDevice._get_signal()

     self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
```
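`_gpu_alloc` now picks the rounding granularity from the page size: 2MB for huge pages, 4KB otherwise. A tiny worked example of that choice, again with a local `round_up` mirroring the tinygrad helper:

```python
# Worked example of the allocation rounding above: sizes round to a 2MB
# boundary when huge pages are requested, else to a 4KB page.
def round_up(x: int, a: int) -> int: return ((x + a - 1) // a) * a

def alloc_size(size: int, huge_page: bool) -> int:
  return round_up(size, (2 << 20) if huge_page else (4 << 10))

assert alloc_size(5000, huge_page=False) == 8192     # two 4KB pages
assert alloc_size(5000, huge_page=True) == 2 << 20   # one 2MB page
```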
```diff
@@ -546,49 +540,49 @@ class NVDevice(Compiled):

     self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix

-    from tinygrad.runtime.graph.hcq import HCQGraph
     super().__init__(device, NVAllocator(self), NVRenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch),
-                     functools.partial(NVProgram, self),
+                     functools.partial(NVProgram, self), HWComputeQueue, HWCopyQueue, timeline_signals=[self._get_signal(), self._get_signal()])

     self._cmdq_setup_compute_gpfifo()
     self._cmdq_setup_dma_gpfifo()

     NVDevice.devices.append(self)

-
-
-    self.cmdq_wptr = 0
-
-    if self.timeline_value > (1 << 63):
-      self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
-      self.timeline_signal[0], self.timeline_value = 0, 1
-      cast(NVAllocator, self.allocator).b_timeline = [0] * len(cast(NVAllocator, self.allocator).b)
+  @classmethod
+  def _read_signal(self, sig): return sig[0]

-  @
-  def
-    for d in NVDevice.devices: d.synchronize()
+  @classmethod
+  def _read_timestamp(self, sig): return sig[1]

   @classmethod
   def _set_signal(self, sig, value): sig[0] = value

   @classmethod
-  def _get_signal(self, value=0) -> memoryview:
+  def _get_signal(self, value=0, **kwargs) -> memoryview:
     self._set_signal(sig := self.signals_pool.pop(), value)
     return sig

   @classmethod
   def _wait_signal(self, signal, value=0, timeout=10000):
     start_time = time.time() * 1000
-
-
-
-
+    while time.time() * 1000 - start_time < timeout:
+      if signal[0] >= value: return
+    raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
+
+  def _gpu2cpu_time(self, gpu_time, is_copy): return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e3
+
+  def synchronize(self):
+    NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
+    self.cmdq_wptr = 0
+
+    if self.timeline_value > (1 << 63): self._wrap_timeline_signal()
+    if PROFILE: self._prof_process_events()

-  def
+  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
     notifier = self._gpu_system_alloc(48 << 20)
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=
-                                                           gpFifoOffset=
-                                                           hUserdMemory=(ctypes.c_uint32*8)(
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+                                                           gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
+                                                           hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
```
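`_wait_signal` is a plain polling loop: spin until the signal payload reaches the target value or a timeout fires, while the new `synchronize` layers timeline wraparound and profile collection on top. A standalone sketch of the polling part, with a thread standing in for the GPU bumping the timeline:

```python
# Standalone sketch of the _wait_signal() polling loop above. The signal is
# modeled as a one-element list; a thread plays the GPU releasing it.
import threading, time

signal = [0]  # index 0 is the payload slot, as in the 16-byte signals above

def wait_signal(signal, value=0, timeout=10000):
  start_time = time.time() * 1000
  while time.time() * 1000 - start_time < timeout:
    if signal[0] >= value: return
  raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")

def fake_gpu():  # simulates the GPU writing the semaphore a bit later
  time.sleep(0.01)
  signal[0] = 5

threading.Thread(target=fake_gpu).start()
wait_signal(signal, value=5)  # returns once the "GPU" writes >= 5
print("signal reached", signal[0])
```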
```diff
@@ -601,7 +595,8 @@ class NVDevice(Compiled):
     uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                          hChannel=gpfifo, base=channel_base, length=0x4000000)

-    return ws_token_params.workSubmitToken
+    return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
+                  controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))

   def _cmdq_setup_compute_gpfifo(self):
     self.slm_per_thread = 0x900
```