tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -0
 - tinygrad/codegen/kernel.py +572 -83
 - tinygrad/codegen/linearizer.py +415 -395
 - tinygrad/codegen/uops.py +415 -0
 - tinygrad/device.py +183 -0
 - tinygrad/dtype.py +113 -0
 - tinygrad/engine/__init__.py +0 -0
 - tinygrad/engine/graph.py +100 -0
 - tinygrad/engine/jit.py +195 -0
 - tinygrad/engine/realize.py +191 -0
 - tinygrad/engine/schedule.py +362 -0
 - tinygrad/engine/search.py +196 -0
 - tinygrad/{mlops.py → function.py} +76 -55
 - tinygrad/helpers.py +196 -89
 - tinygrad/lazy.py +210 -371
 - tinygrad/multi.py +169 -0
 - tinygrad/nn/__init__.py +202 -22
 - tinygrad/nn/datasets.py +7 -0
 - tinygrad/nn/optim.py +112 -32
 - tinygrad/nn/state.py +136 -39
 - tinygrad/ops.py +119 -202
 - tinygrad/renderer/__init__.py +61 -0
 - tinygrad/renderer/assembly.py +276 -0
 - tinygrad/renderer/cstyle.py +353 -166
 - tinygrad/renderer/llvmir.py +150 -138
 - tinygrad/runtime/autogen/amd_gpu.py +1900 -0
 - tinygrad/runtime/autogen/comgr.py +865 -0
 - tinygrad/runtime/autogen/cuda.py +5923 -0
 - tinygrad/runtime/autogen/hip.py +5909 -0
 - tinygrad/runtime/autogen/hsa.py +5761 -0
 - tinygrad/runtime/autogen/kfd.py +812 -0
 - tinygrad/runtime/autogen/nv_gpu.py +33328 -0
 - tinygrad/runtime/autogen/opencl.py +1795 -0
 - tinygrad/runtime/driver/hip_comgr.py +47 -0
 - tinygrad/runtime/driver/hsa.py +143 -0
 - tinygrad/runtime/graph/clang.py +38 -0
 - tinygrad/runtime/graph/cuda.py +81 -0
 - tinygrad/runtime/graph/hcq.py +143 -0
 - tinygrad/runtime/graph/hsa.py +171 -0
 - tinygrad/runtime/graph/metal.py +75 -0
 - tinygrad/runtime/ops_amd.py +564 -0
 - tinygrad/runtime/ops_clang.py +24 -77
 - tinygrad/runtime/ops_cuda.py +175 -89
 - tinygrad/runtime/ops_disk.py +56 -33
 - tinygrad/runtime/ops_gpu.py +92 -95
 - tinygrad/runtime/ops_hsa.py +278 -0
 - tinygrad/runtime/ops_llvm.py +39 -60
 - tinygrad/runtime/ops_metal.py +92 -74
 - tinygrad/runtime/ops_npy.py +9 -0
 - tinygrad/runtime/ops_nv.py +630 -0
 - tinygrad/runtime/ops_python.py +204 -0
 - tinygrad/shape/shapetracker.py +86 -254
 - tinygrad/shape/symbolic.py +166 -141
 - tinygrad/shape/view.py +296 -0
 - tinygrad/tensor.py +2619 -448
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
 - tinygrad-0.9.0.dist-info/METADATA +227 -0
 - tinygrad-0.9.0.dist-info/RECORD +60 -0
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
 - tinygrad/codegen/assembly.py +0 -190
 - tinygrad/codegen/optimizer.py +0 -379
 - tinygrad/codegen/search.py +0 -72
 - tinygrad/graph.py +0 -83
 - tinygrad/jit.py +0 -57
 - tinygrad/nn/image.py +0 -100
 - tinygrad/renderer/assembly_arm64.py +0 -169
 - tinygrad/renderer/assembly_ptx.py +0 -98
 - tinygrad/renderer/wgsl.py +0 -53
 - tinygrad/runtime/lib.py +0 -113
 - tinygrad/runtime/ops_cpu.py +0 -51
 - tinygrad/runtime/ops_hip.py +0 -82
 - tinygrad/runtime/ops_shm.py +0 -29
 - tinygrad/runtime/ops_torch.py +0 -30
 - tinygrad/runtime/ops_webgpu.py +0 -45
 - tinygrad-0.7.0.dist-info/METADATA +0 -212
 - tinygrad-0.7.0.dist-info/RECORD +0 -40
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
 
| 
         @@ -0,0 +1,630 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from __future__ import annotations
         
     | 
| 
      
 2 
     | 
    
         
            +
            import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
         
     | 
| 
      
 3 
     | 
    
         
            +
            from typing import Tuple, List, Any, cast
         
     | 
| 
      
 4 
     | 
    
         
            +
            from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
         
     | 
| 
      
 5 
     | 
    
         
            +
            from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
         
     | 
| 
      
 6 
     | 
    
         
            +
            from tinygrad.renderer.cstyle import NVRenderer
         
     | 
| 
      
 7 
     | 
    
         
            +
            from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
         
     | 
| 
      
 8 
     | 
    
         
            +
            import tinygrad.runtime.autogen.cuda as cuda
         
     | 
| 
      
 9 
     | 
    
         
            +
            import tinygrad.runtime.autogen.nv_gpu as nv_gpu
         
     | 
| 
      
 10 
     | 
    
         
            +
            if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            # Handle to the C library, used for raw mmap/munmap calls with explicit ctypes prototypes.
            # `import ctypes` alone does not load the `ctypes.util` submodule, so import it explicitly here
            # instead of relying on some other module having imported it first.
            import ctypes.util
            libc = ctypes.CDLL(ctypes.util.find_library("c"))
            # mmap arguments: addr, length, prot, flags, fd, offset; the result is a raw address.
            libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
            libc.mmap.restype = ctypes.c_void_p
            # munmap arguments: addr, length; the result is an int status.
            libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
            libc.munmap.restype = ctypes.c_int
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            # Optional mock mode: when the MOCKGPU setting (read through getenv) is truthy, the libc
            # mmap/munmap entry points are replaced by the ones provided by extra.mockgpu.mockgpu.
            MOCKGPU = getenv("MOCKGPU")
            if MOCKGPU:
              import extra.mockgpu.mockgpu  # noqa: F401
              mock_module = extra.mockgpu.mockgpu
              libc.mmap = mock_module._mmap # type: ignore
              libc.munmap = mock_module._munmap # type: ignore
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            def nv_iowr(fd, nr, args):
              """Issue ioctl number `nr` (type character 'F') on `fd`, passing the ctypes object `args`.

              Raises RuntimeError when the ioctl call returns a non-zero value.
              """
              # Request word layout: value 3 in bits 30-31, sizeof(args) (13 bits) from bit 16,
              # the type character from bit 8, and the command number in the low byte.
              direction_bits = 3 << 30
              size_bits = (ctypes.sizeof(args) & 0x1FFF) << 16
              type_bits = (ord('F') & 0xFF) << 8
              request = direction_bits | size_bits | type_bits | (nr & 0xFF)
              ret = fcntl.ioctl(fd, request, args)
              if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
            def rm_alloc(fd, clss, root, parant, params):
              """Send an NV_ESC_RM_ALLOC request for class `clss` under `root`/`parant` and return the filled structure.

              `params` may be None; otherwise its address is passed as an untyped pointer.
              Raises RuntimeError when the returned structure reports a non-zero status.
              """
              if params is None: alloc_parms = None
              else: alloc_parms = ctypes.cast(ctypes.byref(params), ctypes.POINTER(None))
              request = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss, pAllocParms=alloc_parms) # type: ignore
              nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, request)
              if request.status != 0: raise RuntimeError(f"rm_alloc returned {request.status}")
              return request
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            def rm_control(fd, cmd, client, obj, params):
              """Send an NV_ESC_RM_CONTROL request `cmd` for `obj` owned by `client` and return the filled structure.

              `params` may be None, in which case a null pointer and a size of 0 are passed.
              Raises RuntimeError when the returned structure reports a non-zero status.
              """
              # The size must be guarded together with the pointer: ctypes.sizeof(None) raises TypeError,
              # which previously made the `params is None` case unusable.
              if params is None: params_ptr, params_size = None, 0
              else: params_ptr, params_size = ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)), ctypes.sizeof(params)
              made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=params_size, params=params_ptr) # type: ignore
              nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
              if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
              return made
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
            def uvm_ioctl(cmd, sttyp, fd, **kwargs):
              """Build a `sttyp(**kwargs)` structure, pass it to ioctl `cmd` on `fd`, and return it.

              Raises RuntimeError when either the ioctl return value or the structure's `rmStatus` is non-zero.
              """
              request = sttyp(**kwargs)
              rc = fcntl.ioctl(fd, cmd, request)
              if rc != 0: raise RuntimeError(f"uvm_ioctl returned {rc}")
              if request.rmStatus != 0: raise RuntimeError(f"uvm_ioctl struct returned {request.rmStatus}")
              return request
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
            def make_uvm_type():
              """Create the NVUVM class: one attribute per `UVM_*` name in nv_gpu that has a matching `*_PARAMS` entry.

              Each attribute is `uvm_ioctl` pre-bound with the command value and its params structure type, so it is
              called as `uvm.<name>(fd, **fields)`. The attribute name is the nv_gpu name with "UVM_" removed, lowercased.
              """
              fxns = {}
              for name, dt in nv_gpu.__dict__.items():
                if not name.startswith("UVM_"): continue
                if not nv_gpu.__dict__.get(name+"_PARAMS"): continue
                # functools.partial is not a descriptor, so it is not turned into a bound method on the class.
                fxns[name.replace("UVM_", "").lower()] = functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
              return type("NVUVM", (object, ), fxns)
            uvm = make_uvm_type()
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
            def make_qmd_struct_type():
              """Build a ctypes bitfield structure from the `NVC6C0_QMDV03_00*` entries of nv_gpu.

              Tuple entries are used as-is; callable entries are expanded for indices 0..7 with an `_<i>` name suffix.
              Each entry is treated as (high bit, low bit). Entries are ordered by low bit and any hole between two
              neighbours is filled with a `_reserved<i>` field.
              """
              prefix = "NVC6C0_QMDV03_00"
              bits = []
              for name, dt in nv_gpu.__dict__.items():
                if name.startswith(prefix) and isinstance(dt, tuple): bits.append((name, dt))
              for name, dt in nv_gpu.__dict__.items():
                for i in range(8):
                  if name.startswith(prefix) and callable(dt): bits.append((name+f"_{i}", dt(i)))
              bits.sort(key=lambda entry: entry[1][1])

              fields = []
              for i, (name, data) in enumerate(bits):
                if i > 0:
                  # Bits between the previous entry's high bit and this entry's low bit.
                  gap = data[1] - bits[i-1][1][0] - 1
                  if gap != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
                width = data[0] - data[1] + 1
                fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, width))
              return init_c_struct_t(tuple(fields))
            qmd_struct_t = make_qmd_struct_type()
            assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
            def nvmethod(subc, mthd, size, typ=2):
              """Pack a command header word: `typ` from bit 28, `size` from bit 16, `subc` from bit 13, `mthd >> 2` in the low bits."""
              header = typ << 28
              header |= size << 16
              header |= subc << 13
              header |= mthd >> 2
              return header
         
     | 
| 
      
 67 
     | 
    
         
            +
            def nvdata64(data):
              """Split `data` into a (upper, lower 32 bits) pair, upper part first."""
              upper = data >> 32
              lower = data & 0xFFFFFFFF
              return (upper, lower)
         
     | 
| 
      
 68 
     | 
    
         
            +
            def nvdata64_le(data):
              """Split `data` into a (lower 32 bits, upper) pair, lower part first."""
              lower = data & 0xFFFFFFFF
              upper = data >> 32
              return (lower, upper)
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
            class NVCompiler(Compiler):
              """Compiler that turns CUDA source into a CUBIN for `arch` using NVRTC."""
              def __init__(self, arch:str):
                """Record the target architecture and build the NVRTC option list.

                `--minimal` is appended only when the reported NVRTC version is at least 12.4.
                """
                self.arch = arch
                #NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
                cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
                self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
                if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
                super().__init__(f"compile_nv_{self.arch}")
              def compile(self, src:str) -> bytes:
                """Compile `src` and return the CUBIN bytes.

                Raises CompileError carrying the NVRTC program log when compilation fails.
                """
                cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
                try:
                  status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))

                  if status != 0:
                    raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
                  return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
                finally:
                  # Release the NVRTC program on every path; it was previously never destroyed.
                  # The result is deliberately not checked so a cleanup failure cannot mask a CompileError.
                  cuda.nvrtcDestroyProgram(ctypes.byref(prog))
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
            class HWQueue:
              """Accumulates 32-bit command words in `self.q` and submits them through a GPFIFO ring.

              `next_cmd_index` counts the commands appended so far; `ptr()` exposes it so callers can refer back to a
              command later. After `bind()`, `self.q` is a view over device-allocated memory instead of a Python list.
              """
              def __init__(self): self.q, self.binded_device, self.next_cmd_index = [], None, 0
              def __del__(self):
                if self.binded_device is not None:
                  self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
                  self.binded_device._gpu_free(self.hw_page)

              def ptr(self) -> int: return self.next_cmd_index

              def wait(self, signal, value=0):
                """Append a semaphore command on the address of `signal` with payload `value`; returns self for chaining."""
                self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                           (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT
                self.next_cmd_index += 1
                return self

              def signal(self, signal, value=0, timestamp=False):
                """Append a semaphore release of `value` to `signal`, followed by a non-stall interrupt; returns self."""
                self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal))), *nvdata64_le(value),
                           (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
                self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
                self.next_cmd_index += 1
                return self

              def bind(self, device: NVDevice):
                """Copy the queued words into memory allocated on `device` so later submissions need no copy."""
                # Allocate first and record the binding only once the allocation succeeded; otherwise a failed
                # allocation would leave binded_device set without hw_page and __del__ would hit a missing attribute.
                hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
                self.hw_page, self.binded_device = hw_page, device
                hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
                for i, value in enumerate(self.q): hw_view[i] = value

                # From now on, the queue is on the device for faster submission.
                self.q = hw_view # type: ignore

              def _submit(self, dev, gpu_ring, put_value, gpfifo_entries, gpfifo_token, gpu_ring_controls):
                """Publish this queue as one GPFIFO entry on `dev` and return the next put value.

                A queue bound to `dev` is submitted in place; otherwise its words are first copied into the device's
                command buffer at `dev.cmdq_wptr`, which is advanced by the number of bytes written.
                """
                if dev == self.binded_device: cmdq_addr = self.hw_page.base
                else:
                  dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
                  cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
                  dev.cmdq_wptr += len(self.q) * 4

                # Ring entry: command buffer address (4-byte aligned), word count from bit 42, and bit 41 set.
                gpu_ring[put_value % gpfifo_entries] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
                gpu_ring_controls.GPPut = (put_value + 1) % gpfifo_entries
                # The token is written last, after the ring entry and GPPut are in place.
                dev.gpu_mmio[0x90 // 4] = gpfifo_token
                return put_value + 1
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
      
 129 
     | 
    
         
            +
            class HWComputeQueue(HWQueue):
              """Command queue for the compute channel; remembers the QMD written for each exec command."""
              def __init__(self):
                super().__init__()
                # Maps the command index of an exec to the QMD structure living at its address.
                self.ptr_to_qmd = {}

              def copy_from_cpu(self, gpuaddr, data):
                """Append an inline-data upload of the 32-bit words in `data` to `gpuaddr`; returns self."""
                self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
                self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
                self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
                self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6), *data]
                self.next_cmd_index += 1
                return self

              def exec(self, prg, kernargs, global_size=(1,1,1), local_size=(1,1,1), signal=None, signal_value=0, chain_exec_ptr=None):
                """Queue a launch of `prg` with arguments at `kernargs`; returns self.

                The program's QMD is filled in and copied right after the constant buffer (rounded up to 256 bytes).
                With `chain_exec_ptr` set, no launch commands are appended; instead the QMD of that earlier exec is
                patched to point at this one as its dependent QMD.
                """
                qmd = prg.qmd
                qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth = global_size
                qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 = local_size
                qmd.constant_buffer_addr_lower_0 = kernargs & 0xffffffff
                qmd.constant_buffer_addr_upper_0 = kernargs >> 32
                if signal is None: qmd.release0_enable = 0
                else:
                  signal_addr = ctypes.addressof(from_mv(signal))
                  qmd.release0_address_lower = signal_addr & 0xffffffff
                  qmd.release0_address_upper = signal_addr >> 32
                  qmd.release0_payload_lower = signal_value & 0xffffffff
                  qmd.release0_payload_upper = signal_value >> 32
                  qmd.release0_enable = 1

                # The copy must happen after every field above is set: the memory at qmd_addr is what gets used.
                qmd_addr = kernargs + round_up(prg.constbuf_0_size, 1 << 8)
                ctypes.memmove(qmd_addr, ctypes.addressof(qmd), 0x40 * 4)
                self.ptr_to_qmd[self.ptr()] = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update

                if chain_exec_ptr is not None:
                  previous_qmd = self.ptr_to_qmd[chain_exec_ptr]
                  previous_qmd.dependent_qmd0_pointer = qmd_addr >> 8
                  previous_qmd.dependent_qmd0_action = 1
                  previous_qmd.dependent_qmd0_prefetch = 1
                  previous_qmd.dependent_qmd0_enable = 1
                else:
                  self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
                  self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8]
                  self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9]
                self.next_cmd_index += 1
                return self

              def update_exec(self, cmd_ptr, global_size, local_size):
                """Rewrite the launch dimensions stored in the QMD of the exec command at `cmd_ptr`."""
                target = self.ptr_to_qmd[cmd_ptr]
                target.cta_raster_width, target.cta_raster_height, target.cta_raster_depth = global_size
                target.cta_thread_dimension0, target.cta_thread_dimension1, target.cta_thread_dimension2 = local_size

              def submit(self, dev:NVDevice):
                """Submit the queue on the compute ring of `dev`; an empty queue is a no-op."""
                if len(self.q) != 0:
                  dev.compute_put_value = self._submit(dev, dev.compute_gpu_ring, dev.compute_put_value, dev.compute_gpfifo_entries,
                                                       dev.compute_gpfifo_token, dev.compute_gpu_ring_controls)
         
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
            class HWCopyQueue(HWQueue):
              """Command queue for the DMA (copy) channel."""
              def copy(self, dest, src, copy_size):
                """Append a copy of `copy_size` bytes from address `src` to address `dest`; returns self."""
                src_words, dest_words = nvdata64(src), nvdata64(dest)
                self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *src_words, *dest_words]
                self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
                self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
                self.next_cmd_index += 1
                return self

              def submit(self, dev:NVDevice):
                """Submit the queue on the DMA ring of `dev`; an empty queue is a no-op."""
                if len(self.q) != 0:
                  dev.dma_put_value = self._submit(dev, dev.dma_gpu_ring, dev.dma_put_value, dev.dma_gpfifo_entries,
                                                   dev.dma_gpfifo_token, dev.dma_gpu_ring_controls)
         
     | 
| 
      
 193 
     | 
    
         
            +
             
     | 
| 
      
 194 
     | 
    
         
            +
            # Section type and section flag values matched against the section headers of the compiled binary.
            SHT_PROGBITS = 0x1
            SHT_NOBITS = 0x8
            SHF_ALLOC = 0x2
            SHF_EXECINSTR = 0x4
         
     | 
| 
      
 195 
     | 
    
         
            +
            class NVProgram:
         
     | 
| 
      
 196 
     | 
    
         
            +
              def __init__(self, device:NVDevice, name:str, lib:bytes):
         
     | 
| 
      
 197 
     | 
    
         
            +
                self.device, self.name, self.lib = device, name, lib
         
     | 
| 
      
 198 
     | 
    
         
            +
                if DEBUG >= 6:
         
     | 
| 
      
 199 
     | 
    
         
            +
                  try:
         
     | 
| 
      
 200 
     | 
    
         
            +
                    fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
         
     | 
| 
      
 201 
     | 
    
         
            +
                    with open(fn + ".cubin", "wb") as f: f.write(lib)
         
     | 
| 
      
 202 
     | 
    
         
            +
                    print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
         
     | 
| 
      
 203 
     | 
    
         
            +
                  except Exception as e: print("failed to disasm cubin", str(e))
         
     | 
| 
      
 204 
     | 
    
         
            +
             
     | 
| 
      
 205 
     | 
    
         
            +
                self.global_init, self.shmem_usage = None, 0
         
     | 
| 
      
 206 
     | 
    
         
            +
                constant_buffers_data = {}
         
     | 
| 
      
 207 
     | 
    
         
            +
             
     | 
| 
      
 208 
     | 
    
         
            +
                if MOCKGPU:
         
     | 
| 
      
 209 
     | 
    
         
            +
                  self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10
         
     | 
| 
      
 210 
     | 
    
         
            +
                  constant_buffers_data[0] = memoryview(bytearray(0x190))
         
     | 
| 
      
 211 
     | 
    
         
            +
                else:
         
     | 
| 
      
 212 
     | 
    
         
            +
                  _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
         
     | 
| 
      
 213 
     | 
    
         
            +
                  sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
         
     | 
| 
      
 214 
     | 
    
         
            +
                  shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
         
     | 
| 
      
 215 
     | 
    
         
            +
                  for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
         
     | 
| 
      
 216 
     | 
    
         
            +
                    section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
         
     | 
| 
      
 217 
     | 
    
         
            +
                    if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
         
     | 
| 
      
 218 
     | 
    
         
            +
                    elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
         
     | 
| 
      
 219 
     | 
    
         
            +
                      self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
         
     | 
| 
      
 220 
     | 
    
         
            +
                      self.registers_usage = sh_info >> 24
         
     | 
| 
      
 221 
     | 
    
         
            +
                    if match := re.match(r'\.nv\.constant(\d+)', section_name):
         
     | 
| 
      
 222 
     | 
    
         
            +
                      constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
         
     | 
| 
      
 223 
     | 
    
         
            +
                    if section_name == ".nv.global.init": self.global_init = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
         
     | 
| 
      
 224 
     | 
    
         
            +
                    elif section_name == ".nv.info":
         
     | 
| 
      
 225 
     | 
    
         
            +
                      section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
         
     | 
| 
      
 226 
     | 
    
         
            +
                      for i in range(sh_size // 12):
         
     | 
| 
      
 227 
     | 
    
         
            +
                        if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
         
     | 
| 
      
 228 
     | 
    
         
            +
                          raise RuntimeError("too high local memory")
         
     | 
| 
      
 229 
     | 
    
         
            +
             
     | 
| 
      
 230 
     | 
    
         
            +
                # Register allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
         
     | 
| 
      
 231 
     | 
    
         
            +
                self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
         
     | 
| 
      
 232 
     | 
    
         
            +
             
     | 
| 
      
 233 
     | 
    
         
            +
                # Load program and constant buffers (if any)
         
     | 
| 
      
 234 
     | 
    
         
            +
                self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
         
     | 
| 
      
 235 
     | 
    
         
            +
                                       sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
         
     | 
| 
      
 236 
     | 
    
         
            +
                self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
         
     | 
| 
      
 237 
     | 
    
         
            +
                for st in range(0, len(self.program), 4095):
         
     | 
| 
      
 238 
     | 
    
         
            +
                  HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
         
     | 
| 
      
 239 
     | 
    
         
            +
             
     | 
| 
      
 240 
     | 
    
         
            +
                self.constbuffer_0 = [0] * 88
         
     | 
| 
      
 241 
     | 
    
         
            +
                self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
         
     | 
| 
      
 242 
     | 
    
         
            +
             
     | 
| 
      
 243 
     | 
    
         
            +
                smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
         
     | 
| 
      
 244 
     | 
    
         
            +
                self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
         
     | 
| 
      
 245 
     | 
    
         
            +
                                        invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
         
     | 
| 
      
 246 
     | 
    
         
            +
                                        cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
         
     | 
| 
      
 247 
     | 
    
         
            +
                                        shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
         
     | 
| 
      
 248 
     | 
    
         
            +
                                        max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
         
     | 
| 
      
 249 
     | 
    
         
            +
                                        barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
         
     | 
| 
      
 250 
     | 
    
         
            +
                                        program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
         
     | 
| 
      
 251 
     | 
    
         
            +
                                        program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
         
     | 
| 
      
 252 
     | 
    
         
            +
                                        constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
         
     | 
| 
      
 253 
     | 
    
         
            +
             
     | 
| 
      
 254 
     | 
    
         
            +
                # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
         
     | 
| 
      
 255 
     | 
    
         
            +
                self.constbuf_0_size = constant_buffers_data[0].nbytes if 0 in constant_buffers_data else 0
         
     | 
| 
      
 256 
     | 
    
         
            +
                self.kernargs_segment_size = round_up(self.constbuf_0_size, 1 << 8) + (8 << 8)
         
     | 
| 
      
 257 
     | 
    
         
            +
                self.kernargs_offset = 0x160
         
     | 
| 
      
 258 
     | 
    
         
            +
             
     | 
| 
      
 259 
     | 
    
         
            +
                # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
         
     | 
| 
      
 260 
     | 
    
         
            +
                if 0 in constant_buffers_data: constant_buffers_data.pop(0)
         
     | 
| 
      
 261 
     | 
    
         
            +
             
     | 
| 
      
 262 
     | 
    
         
            +
                off = round_up(self.program.nbytes, 128)
         
     | 
| 
      
 263 
     | 
    
         
            +
                if self.global_init is not None:
         
     | 
| 
      
 264 
     | 
    
         
            +
                  # Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
         
     | 
| 
      
 265 
     | 
    
         
            +
                  assert 4 in constant_buffers_data and constant_buffers_data[4].nbytes == 8
         
     | 
| 
      
 266 
     | 
    
         
            +
                  HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
         
     | 
| 
      
 267 
     | 
    
         
            +
                  constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')
         
     | 
| 
      
 268 
     | 
    
         
            +
                  off += round_up(self.global_init.nbytes, 128)
         
     | 
| 
      
 269 
     | 
    
         
            +
             
     | 
| 
      
 270 
     | 
    
         
            +
                for i,data in constant_buffers_data.items():
         
     | 
| 
      
 271 
     | 
    
         
            +
                  self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
         
     | 
| 
      
 272 
     | 
    
         
            +
                  self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
         
     | 
| 
      
 273 
     | 
    
         
            +
                  self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
         
     | 
| 
      
 274 
     | 
    
         
            +
                  self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
         
     | 
| 
      
 275 
     | 
    
         
            +
             
     | 
| 
      
 276 
     | 
    
         
            +
                  HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
         
     | 
| 
      
 277 
     | 
    
         
            +
                  off += round_up(data.nbytes, 128)
         
     | 
| 
      
 278 
     | 
    
         
            +
             
     | 
| 
      
 279 
     | 
    
         
            +
                HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
         
     | 
| 
      
 280 
     | 
    
         
            +
                self.device.timeline_value += 1
         
     | 
| 
      
 281 
     | 
    
         
            +
                self.device.synchronize()
         
     | 
| 
      
 282 
     | 
    
         
            +
             
     | 
| 
      
 283 
     | 
    
         
            +
              def __del__(self):
                """Return the GPU-side program image to the device allocator when this object is collected."""
                # __init__ can raise before lib_gpu is assigned, so a partially built object has nothing to free
                if not hasattr(self, 'lib_gpu'): return
                self.device.allocator.free(self.lib_gpu, self.lib_sz)
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
              def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
                """Queue one launch of this program on the device, optionally blocking to time it.

                args are buffers: each one's `.base` is expanded with nvdata64_le and placed after constbuffer 0 in
                the kernargs, followed by the plain values in vals. global_size and local_size are validated against
                fixed limits before anything is queued.

                Returns (time_event_en[1] - time_event_st[1]) / 1e9 when wait is true, otherwise None.
                Raises RuntimeError when the launch dimensions exceed the limits checked below.
                """
                threads_per_group = prod(local_size)
                if threads_per_group > 1024 or self.max_threads < threads_per_group:
                  raise RuntimeError("Too many resources requsted for launch")

                global_limits, local_limits = [2147483647, 65535, 65535], [1024, 1024, 64]
                if any(cur > mx for cur,mx in zip(global_size, global_limits)) or \
                   any(cur > mx for cur,mx in zip(local_size, local_limits)):
                  raise RuntimeError("Invalid global/local dims")

                dev = self.device

                # restart from the beginning of the kernargs page once another segment would no longer fit
                page = dev.kernargs_page
                if dev.kernargs_ptr >= (page.base + page.length - self.kernargs_segment_size):
                  dev.kernargs_ptr = page.base

                # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
                if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
                arg_words = [word for buf in args for word in nvdata64_le(buf.base)]
                payload = self.constbuffer_0 + (arg_words + list(vals))

                # order this launch after the previously signalled timeline value
                launch = HWComputeQueue()
                launch.wait(dev.timeline_signal, dev.timeline_value - 1)
                if wait: launch.signal(dev.time_event_st, timestamp=True)
                launch.copy_from_cpu(dev.kernargs_ptr, payload)
                launch.exec(self, dev.kernargs_ptr, global_size, local_size)
                if wait: launch.signal(dev.time_event_en, timestamp=True)
                launch.signal(dev.timeline_signal, dev.timeline_value).submit(dev)
                dev.timeline_value += 1
                dev.kernargs_ptr += self.kernargs_segment_size

                if not wait: return None
                dev._wait_signal(dev.timeline_signal, dev.timeline_value - 1)
                # presumably the [1] entries are nanosecond timestamps, making this seconds -- confirm against the signal layout
                return (dev.time_event_en[1] - dev.time_event_st[1]) / 1e9
         
     | 
| 
      
 311 
     | 
    
         
            +
             
     | 
| 
      
 312 
     | 
    
         
            +
            class NVAllocator(LRUAllocator):
              """Allocator for NVDevice memory that stages host copies through 16 host buffers of 2 MiB each."""

              def __init__(self, device:NVDevice):
                self.device = device
                # staging buffers for copyin/copyout, plus the timeline value each one was last signalled with
                self.b = [device._gpu_host_alloc(2 << 20) for _ in range(16)]
                self.b_timeline = [0 for _ in self.b]
                self.b_next = 0
                super().__init__()

              def _alloc(self, size:int, options:BufferOptions):
                """Allocate host memory when options.host is set, otherwise GPU memory (CPU-mapped if options.cpu_access)."""
                if not options.host: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
                return self.device._gpu_host_alloc(size)

              def _free(self, gpumem, options:BufferOptions):
                """Free gpumem with the routine matching how it was allocated, after NVDevice.synchronize_system()."""
                NVDevice.synchronize_system()
                release = self.device._gpu_host_free if options.host else self.device._gpu_free
                release(gpumem)

              def copyin(self, dest, src: memoryview):
                """Copy src into dest in staging-buffer-sized pieces, rotating through the staging buffers."""
                dev, total = self.device, src.nbytes
                for offset in range(0, total, self.b[0].length):
                  # move on to the next staging buffer and wait for the copy that last used it
                  slot = self.b_next = (self.b_next + 1) % len(self.b)
                  staging = self.b[slot]
                  NVDevice._wait_signal(dev.timeline_signal, self.b_timeline[slot])
                  chunk = min(staging.length, total - offset)
                  ctypes.memmove(staging.va_addr, from_mv(src[offset:]), chunk)
                  (HWCopyQueue().wait(dev.timeline_signal, dev.timeline_value - 1)
                                .copy(dest.va_addr + offset, staging.va_addr, chunk)
                                .signal(dev.timeline_signal, dev.timeline_value).submit(dev))
                  self.b_timeline[slot] = dev.timeline_value
                  dev.timeline_value += 1

              def copyout(self, dest:memoryview, src):
                """Copy src into dest piece by piece through the first staging buffer, waiting for each piece."""
                NVDevice.synchronize_system()
                dev, bounce = self.device, self.b[0]
                for offset in range(0, dest.nbytes, bounce.length):
                  chunk = min(bounce.length, dest.nbytes - offset)
                  (HWCopyQueue().wait(dev.timeline_signal, dev.timeline_value - 1)
                                .copy(bounce.va_addr, src.va_addr + offset, chunk)
                                .signal(dev.timeline_signal, dev.timeline_value).submit(dev))
                  # the staging buffer is reused for every piece, so wait before reading it back
                  NVDevice._wait_signal(dev.timeline_signal, dev.timeline_value)
                  dev.timeline_value += 1

                  ctypes.memmove(from_mv(dest[offset:]), bounce.va_addr, chunk)

              def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
                """Copy sz bytes from src to dest on src_dev's copy queue and make dest_dev's compute queue wait for it."""
                src_dev._gpu_map(dest)
                (HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1)
                              .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1)
                              .copy(dest.va_addr, src.va_addr, sz)
                              .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev))
                HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
                src_dev.timeline_value += 1
         
     | 
| 
      
 359 
     | 
    
         
            +
             
     | 
| 
      
 360 
     | 
    
         
            +
            # mapping flag constants defined locally; MAP_FIXED is OR'ed into the flags passed to libc.mmap in this file
            MAP_FIXED = 0x10
            MAP_NORESERVE = 0x400
         
     | 
| 
      
 361 
     | 
    
         
            +
            class NVDevice(Compiled):
         
     | 
| 
      
 362 
     | 
    
         
            +
              root = None
         
     | 
| 
      
 363 
     | 
    
         
            +
              fd_ctl: int = -1
         
     | 
| 
      
 364 
     | 
    
         
            +
              fd_uvm: int = -1
         
     | 
| 
      
 365 
     | 
    
         
            +
              gpus_info = None
         
     | 
| 
      
 366 
     | 
    
         
            +
              signals_page:Any = None
         
     | 
| 
      
 367 
     | 
    
         
            +
              signals_pool: List[Any] = []
         
     | 
| 
      
 368 
     | 
    
         
            +
              uvm_vaddr: int = 0x1000000000
         
     | 
| 
      
 369 
     | 
    
         
            +
              host_object_enumerator: int = 0x1000
         
     | 
| 
      
 370 
     | 
    
         
            +
              devices: List[NVDevice] = []
         
     | 
| 
      
 371 
     | 
    
         
            +
             
     | 
| 
      
 372 
     | 
    
         
            +
              def _new_gpu_fd(self):
                """Open /dev/nvidia{device_id} and register the new descriptor against self.fd_ctl.

                Returns the opened file descriptor; the caller owns it. If the registration call raises, the
                descriptor is closed before the exception propagates so it is not leaked.
                """
                fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
                try:
                  nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
                except BaseException:
                  # nobody else holds this descriptor yet, so it must be closed here
                  os.close(fd_dev)
                  raise
                return fd_dev
         
     | 
| 
      
 376 
     | 
    
         
            +
             
     | 
| 
      
 377 
     | 
    
         
            +
              def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
                """Map the memory object memory_handle into this process and return the result of libc.mmap.

                A new descriptor is opened for the mapping: /dev/nvidiactl when system is true, otherwise one from
                _new_gpu_fd. When target is given, the mapping is requested at that address with MAP_FIXED.

                Raises RuntimeError if NV_ESC_RM_MAP_MEMORY reports a non-zero status. On any exception the
                descriptor opened here is closed again so it is not leaked; on success it is left open.
                """
                fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
                try:
                  made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
                    params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
                  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
                  if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
                  map_flags = mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0)
                  return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, map_flags, fd_dev, 0)
                except BaseException:
                  # the descriptor was opened only for this mapping; without a mapping nothing else references it
                  os.close(fd_dev)
                  raise
         
     | 
| 
      
 384 
     | 
    
         
            +
             
     | 
| 
      
 385 
     | 
    
         
            +
              def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
                """Allocate an NV1_MEMORY_USER object, give it a virtual address and map it through _gpu_uvm_map.

                size is rounded up to the chosen alignment. When va_addr is None a range is taken from
                _alloc_gpu_vaddr; when map_to_cpu is set the memory is also mapped with _gpu_map_to_cpu and the
                address that call returns is the one used for the UVM mapping. Returns what _gpu_uvm_map returns.
                """
                # TODO: need hugepage option, any speedup?
                # NOTE(review): huge_page picks the 4 KiB alignment and the default picks 2 MiB, which looks inverted -- confirm intent
                align = (4 << 10) if huge_page else (2 << 20)
                size = round_up(size, align)

                physicality = nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS
                attr = ((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) | (physicality << 27)
                attr2 = (nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2)
                attr2 |= (nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0
                alloc_flags = (nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM |
                               nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT |
                               nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED)

                alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
                                                                  attr=attr, attr2=attr2, flags=alloc_flags)
                mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew

                if va_addr is None:
                  va_addr = self._alloc_gpu_vaddr(size, alignment=align)
                if map_to_cpu:
                  va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
                return self._gpu_uvm_map(va_addr, size, mem_handle)
         
     | 
| 
      
 399 
     | 
    
         
            +
             
     | 
| 
      
 400 
     | 
    
         
            +
              def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
                """Allocate an NV1_MEMORY_SYSTEM object of size bytes and map it through _gpu_uvm_map.

                When va_addr is None a range is taken from _alloc_gpu_vaddr; when map_to_cpu is set the memory is
                also mapped with _gpu_map_to_cpu(system=True) and the address it returns is used for the UVM
                mapping. Returns what _gpu_uvm_map returns.
                """
                attr = (nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25)
                attr2 = (nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2)
                alloc_flags = (nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
                               nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED)
                alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13, attr=attr, attr2=attr2, flags=alloc_flags,
                                                                  format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
                mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew

                if va_addr is None:
                  va_addr = self._alloc_gpu_vaddr(size)
                if map_to_cpu:
                  va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)

                return self._gpu_uvm_map(va_addr, size, mem_handle)
         
     | 
| 
      
 412 
     | 
    
         
            +
             
     | 
| 
      
 413 
     | 
    
         
            +
              def _gpu_host_alloc(self, size):
                """Create an anonymous shared host mapping of size (rounded up to 4 KiB) and register it via _map_to_gpu."""
                padded = round_up(size, 4 << 10)
                va_base = self._alloc_gpu_vaddr(padded)
                # place the host mapping exactly at the reserved address so _map_to_gpu sees the same range
                protection = mmap.PROT_READ | mmap.PROT_WRITE
                mapping_flags = MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS
                libc.mmap(va_base, padded, protection, mapping_flags, -1, 0)
                return self._map_to_gpu(va_base, padded)
         
     | 
| 
      
 417 
     | 
    
         
            +
             
     | 
| 
      
 418 
     | 
    
         
            +
              def _gpu_free(self, mem):
                """Free the memory object mem.hMemory with NV_ESC_RM_FREE, then release its range with uvm.free.

                Raises RuntimeError if the free request reports a non-zero status; uvm.free is not called in that case.
                """
                free_req = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
                nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, free_req)
                status = free_req.status
                if status != 0:
                  raise RuntimeError(f"_gpu_free returned {status}")
                uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
         
     | 
| 
      
 423 
     | 
    
         
            +
             
     | 
| 
      
 424 
     | 
    
         
            +
              def _gpu_host_free(self, mem):
                """Release mem's range with uvm.free, then unmap it from the process with libc.munmap."""
                base, length = mem.base, mem.length
                uvm.free(self.fd_uvm, base=base, length=length)
                libc.munmap(base, length)
         
     | 
| 
      
 427 
     | 
    
         
            +
             
     | 
| 
      
 428 
     | 
    
         
            +
              def _map_to_gpu(self, va_base, size):
                """Register the host range [va_base, va_base+size) as an OS-descriptor memory object and UVM-map it.

                A fresh handle is taken from the class-wide NVDevice.host_object_enumerator counter.
                Raises RuntimeError if NV_ESC_RM_ALLOC_MEMORY reports a non-zero status.
                Returns what _gpu_uvm_map returns.
                """
                NVDevice.host_object_enumerator += 1
                handle = NVDevice.host_object_enumerator
                flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12)
                flags |= nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30
                os_desc = nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags, hObjectNew=handle,
                                                   hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1)
                request = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=os_desc, fd=-1)
                nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, request)
                if request.params.status != 0:
                  raise RuntimeError(f"_map_to_gpu returned {request.params.status}")
                # use the handle as reported back in the request structure
                return self._gpu_uvm_map(va_base, size, request.params.hObjectNew)
         
     | 
| 
      
 437 
     | 
    
         
            +
             
     | 
| 
      
 438 
     | 
    
         
            +
              def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
                # Maps the RM memory object `mem_handle` at [va_base, va_base+size) for this GPU via UVM.
                # With create_range=True the external range is created first; callers mapping an already-created
                # range (see _gpu_map) pass False. Returns the result of uvm.map_external_allocation.
                if create_range:
                  uvm.create_external_range(self.fd_uvm, base=va_base, length=size)

                attr_t = nv_gpu.struct_c__SA_UvmGpuMappingAttributes
                this_gpu = attr_t(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType=1)
                per_gpu = (attr_t*256)(this_gpu)  # 256-slot array, only slot 0 is filled (gpuAttributesCount=1 below)

                # NOTE: va_addr is set to make rawbufs compatible with AMD.
                return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
                                                   hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=per_gpu, va_addr=va_base)
         
     | 
| 
      
 446 
     | 
    
         
            +
             
     | 
| 
      
 447 
     | 
    
         
            +
              def _gpu_map(self, mem):
         
     | 
| 
      
 448 
     | 
    
         
            +
                if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
         
     | 
| 
      
 449 
     | 
    
         
            +
                mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
         
     | 
| 
      
 450 
     | 
    
         
            +
                return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
         
     | 
| 
      
 451 
     | 
    
         
            +
             
     | 
| 
      
 452 
     | 
    
         
            +
              def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
                # Hands out addresses from the class-wide cursor NVDevice.uvm_vaddr: round the cursor up to `alignment`,
                # return that address, and leave the cursor just past `size` bytes.
                start = round_up(NVDevice.uvm_vaddr, alignment)
                NVDevice.uvm_vaddr = start + size
                return start
         
     | 
| 
      
 455 
     | 
    
         
            +
             
     | 
| 
      
 456 
     | 
    
         
            +
              def __init__(self, device:str=""):
                # Brings up one GPU: process-wide driver state (first instance only), RM device objects, UVM registration,
                # the shared signals page, the compute and DMA GPFIFO rings, command/kernarg buffers, then the base class.
                # `device` may carry an index after a colon ("<name>:<index>"); without a colon index 0 is used.
                if NVDevice.root is None:
                  # one-time setup shared by every NVDevice instance
                  open_mode = os.O_RDWR | os.O_CLOEXEC
                  NVDevice.fd_ctl = os.open("/dev/nvidiactl", open_mode)
                  NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", open_mode)
                  fd_uvm_2 = os.open("/dev/nvidia-uvm", open_mode)  # second handle, only handed to mm_initialize below
                  NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
                  uvm.initialize(self.fd_uvm)
                  try:
                    uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)
                  except RuntimeError:
                    pass  # this error is okay, CUDA hits it too

                  NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
                  nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)

                # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
                self.device_id = int(device.split(":")[1]) if ":" in device else 0
                self.fd_dev = self._new_gpu_fd()

                # look the card up in the table filled by NV_ESC_CARD_INFO
                card = NVDevice.gpus_info[self.device_id]
                assert card.valid
                id_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=card.gpu_id)
                rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, id_info)
                pci_device_id = card.pci_info.device_id  # PCI id, not to be confused with the index in self.device_id
                if pci_device_id in [0x2204, 0x2206]: self.compute_type = nv_gpu.AMPERE_COMPUTE_B
                else: self.compute_type = nv_gpu.ADA_COMPUTE_A

                # RM object chain: device -> subdevice -> usermode, then map 0x10000 bytes of the usermode object to the CPU
                device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=id_info.deviceInstance, hClientShare=self.root,
                                                               vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
                self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
                self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
                self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
                mmio_addr = self._gpu_map_to_cpu(self.usermode, 0x10000, flags=2)
                self.gpu_mmio = to_mv(mmio_addr, 0x10000).cast("I")

                # request the perf boost
                boost_flags = ((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) |
                               (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0))
                boost_params = nv_gpu.struct_NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff, flags=boost_flags)
                rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, self.root, self.subdevice, boost_params)

                # GPU virtual address space
                vaspace_flags = nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED
                vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000, flags=vaspace_flags)
                vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew

                # fetch the 16-byte binary GPU uuid and keep a private copy
                gid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
                rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gid_params)
                self.gpu_uuid = (ctypes.c_ubyte*16)(*(gid_params.data[i] for i in range(16)))

                # register this GPU and its address space with UVM
                uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
                uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
                                         hClient=self.root, hVaSpace=vaspace)

                # peer access with every device constructed before this one (self is appended to `devices` only at the end)
                for peer in self.devices:
                  uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid),
                                         gpuUuidB=nv_gpu.struct_nv_uuid(uuid=peer.gpu_uuid))

                # the signals page is shared by all devices: the first one allocates it and carves it into 16-byte slots,
                # later ones only map it
                if NVDevice.signals_page is None:
                  NVDevice.signals_page = self._gpu_system_alloc(0x10000, map_to_cpu=True)
                  NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
                else:
                  self._gpu_map(NVDevice.signals_page)

                channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
                channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

                # one 0x200000-byte buffer holds both rings: compute at offset 0, DMA at offset 0x100000
                gpfifo = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)

                ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
                ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

                # compute ring: entries are 8 bytes each, the control block sits right behind the entries
                self.compute_gpfifo_entries: int = 0x10000
                self.compute_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0, entries=self.compute_gpfifo_entries)
                compute_ring_bytes = self.compute_gpfifo_entries * 8
                self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, compute_ring_bytes).cast("Q")
                self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + compute_ring_bytes)
                self.compute_put_value: int = 0

                # DMA ring: same layout, shifted by 0x100000 inside the buffer
                self.dma_gpfifo_entries: int = 0x10000
                self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
                dma_ring_bytes = self.dma_gpfifo_entries * 8
                self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, dma_ring_bytes).cast("Q")
                self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + dma_ring_bytes)
                self.dma_put_value: int = 0

                # enable scheduling for the channel group
                en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
                rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)

                # four signals taken from the shared pool, in this order
                self.timeline_value: int = 1
                self.timeline_signal = NVDevice._get_signal()
                self._shadow_timeline_signal = NVDevice._get_signal()
                self.time_event_st = NVDevice._get_signal()
                self.time_event_en = NVDevice._get_signal()

                self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
                self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
                self.cmdq_wptr: int = 0 # in bytes

                self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
                self.kernargs_ptr: int = self.kernargs_page.base

                self.arch: str = "sm_35" if MOCKGPU else "sm_89" # TODO: fix

                from tinygrad.runtime.graph.hcq import HCQGraph
                allocator = NVAllocator(self)
                renderer = NVRenderer(self.arch)
                compiler = CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch)
                runtime = functools.partial(NVProgram, self)
                graph = functools.partial(HCQGraph, NVDevice, HWComputeQueue, HWCopyQueue)
                super().__init__(device, allocator, renderer, compiler, runtime, graph)

                self._cmdq_setup_compute_gpfifo()
                self._cmdq_setup_dma_gpfifo()

                NVDevice.devices.append(self)
         
     | 
| 
      
 557 
     | 
    
         
            +
             
     | 
| 
      
 558 
     | 
    
         
            +
              def synchronize(self):
                # Blocks until the timeline signal reaches timeline_value - 1, then rewinds the command-queue write pointer.
                NVDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
                self.cmdq_wptr = 0

                if self.timeline_value <= (1 << 63):
                  return

                # the counter went past 2**63: switch to the shadow signal and restart the timeline from 1
                fresh, stale = self._shadow_timeline_signal, self.timeline_signal
                self.timeline_signal, self._shadow_timeline_signal = fresh, stale
                self.timeline_signal[0] = 0
                self.timeline_value = 1
                # the allocator's per-buffer timeline values are zeroed along with the restart
                allocator = cast(NVAllocator, self.allocator)
                allocator.b_timeline = [0] * len(allocator.b)
         
     | 
| 
      
 566 
     | 
    
         
            +
             
     | 
| 
      
 567 
     | 
    
         
            +
              @staticmethod
         
     | 
| 
      
 568 
     | 
    
         
            +
              def synchronize_system():
                # Synchronizes every registered NVDevice, one after another, in registration order.
                for nv_dev in NVDevice.devices:
                  nv_dev.synchronize()
         
     | 
| 
      
 570 
     | 
    
         
            +
             
     | 
| 
      
 571 
     | 
    
         
            +
              @classmethod
         
     | 
| 
      
 572 
     | 
    
         
            +
              def _set_signal(self, sig, value): sig[0] = value
         
     | 
| 
      
 573 
     | 
    
         
            +
             
     | 
| 
      
 574 
     | 
    
         
            +
              @classmethod
         
     | 
| 
      
 575 
     | 
    
         
            +
              def _get_signal(self, value=0) -> memoryview:
         
     | 
| 
      
 576 
     | 
    
         
            +
                self._set_signal(sig := self.signals_pool.pop(), value)
         
     | 
| 
      
 577 
     | 
    
         
            +
                return sig
         
     | 
| 
      
 578 
     | 
    
         
            +
             
     | 
| 
      
 579 
     | 
    
         
            +
              @classmethod
         
     | 
| 
      
 580 
     | 
    
         
            +
              def _wait_signal(self, signal, value=0, timeout=10000):
         
     | 
| 
      
 581 
     | 
    
         
            +
                start_time = time.time() * 1000
         
     | 
| 
      
 582 
     | 
    
         
            +
                sem_value = signal[0]
         
     | 
| 
      
 583 
     | 
    
         
            +
                while sem_value < value:
         
     | 
| 
      
 584 
     | 
    
         
            +
                  sem_value = signal[0]
         
     | 
| 
      
 585 
     | 
    
         
            +
                  if time.time() * 1000 - start_time > timeout: raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
         
     | 
| 
      
 586 
     | 
    
         
            +
             
     | 
| 
      
 587 
     | 
    
         
            +
              def _gpu_fifo_setup(self, gpfifo, ctxshare, channel_group, offset, entries=0x400):
                # Creates one GPFIFO channel inside `channel_group` and returns its work-submit token.
                #   gpfifo:        allocation backing the ring (its .base and .hMemory are used)
                #   ctxshare:      context-share handle passed as hContextShare
                #   channel_group: parent handle of the new channel
                #   offset:        byte offset of this ring inside `gpfifo`
                #   entries:       number of ring entries; USERD is placed entries*8 bytes after `offset`
                # Raises RuntimeError if the driver did not fill in the token.
                notifier = self._gpu_system_alloc(48 << 20)
                params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo.hMemory,
                  gpFifoOffset=gpfifo.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
                  hUserdMemory=(ctypes.c_uint32*8)(gpfifo.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
                # channel handle kept in its own name instead of rebinding the `gpfifo` buffer argument
                channel = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
                # both engine objects are attached to the channel
                rm_alloc(self.fd_ctl, self.compute_type, self.root, channel, None)
                rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, channel, None)

                ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
                rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, channel, ws_token_params)
                token = ws_token_params.workSubmitToken
                # NOTE(review): workSubmitToken is presumably an unsigned 32-bit ctypes field; in that case the -1 sentinel
                # reads back as 0xffffffff and a plain `!= -1` comparison can never fail. Masking detects the untouched
                # sentinel for either signedness.
                if (token & 0xffffffff) == 0xffffffff: raise RuntimeError("_gpu_fifo_setup: work submit token was not returned")

                channel_base = self._alloc_gpu_vaddr(0x4000000)
                uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                                     hChannel=channel, base=channel_base, length=0x4000000)

                return token
         
     | 
| 
      
 605 
     | 
    
         
            +
             
     | 
| 
      
 606 
     | 
    
         
            +
              def _cmdq_setup_compute_gpfifo(self):
                # Sizes and allocates shader local memory, picks the shared/local window addresses, then submits a
                # one-off compute queue that programs them and waits for it to finish.
                self.slm_per_thread = 0x900
                bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
                bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
                local_mem_size = round_up(bytes_per_tpc * 64, 0x20000)
                self.shader_local_mem = self._gpu_alloc(local_mem_size, huge_page=True, contig=True).base

                # Set windows addresses to not collide with other allocated buffers.
                self.shared_mem_window = 0xfe000000
                self.local_mem_window = 0xff000000

                setup_words = [
                  nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type,
                  nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem),
                  nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40,
                  nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window),
                  nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window),
                ]
                compute_q = HWComputeQueue()
                compute_q.q += setup_words

                # signal the current timeline value on completion, bump it, and block until the GPU got there
                signalled = compute_q.signal(self.timeline_signal, self.timeline_value)
                signalled.submit(self)
                self.timeline_value += 1
                self.synchronize()
         
     | 
| 
      
 624 
     | 
    
         
            +
             
     | 
| 
      
 625 
     | 
    
         
            +
              def _cmdq_setup_dma_gpfifo(self):
                # Submits a one-off copy queue that binds AMPERE_DMA_COPY_B via SET_OBJECT, then waits for it to finish.
                copy_q = HWCopyQueue()
                copy_q.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]

                # signal the current timeline value on completion, bump it, and block until the GPU got there
                signalled = copy_q.signal(self.timeline_signal, self.timeline_value)
                signalled.submit(self)
                self.timeline_value += 1
                self.synchronize()
         
     |