tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -0
 - tinygrad/codegen/kernel.py +572 -83
 - tinygrad/codegen/linearizer.py +415 -395
 - tinygrad/codegen/uops.py +415 -0
 - tinygrad/device.py +183 -0
 - tinygrad/dtype.py +113 -0
 - tinygrad/engine/__init__.py +0 -0
 - tinygrad/engine/graph.py +100 -0
 - tinygrad/engine/jit.py +195 -0
 - tinygrad/engine/realize.py +191 -0
 - tinygrad/engine/schedule.py +362 -0
 - tinygrad/engine/search.py +196 -0
 - tinygrad/{mlops.py → function.py} +76 -55
 - tinygrad/helpers.py +196 -89
 - tinygrad/lazy.py +210 -371
 - tinygrad/multi.py +169 -0
 - tinygrad/nn/__init__.py +202 -22
 - tinygrad/nn/datasets.py +7 -0
 - tinygrad/nn/optim.py +112 -32
 - tinygrad/nn/state.py +136 -39
 - tinygrad/ops.py +119 -202
 - tinygrad/renderer/__init__.py +61 -0
 - tinygrad/renderer/assembly.py +276 -0
 - tinygrad/renderer/cstyle.py +353 -166
 - tinygrad/renderer/llvmir.py +150 -138
 - tinygrad/runtime/autogen/amd_gpu.py +1900 -0
 - tinygrad/runtime/autogen/comgr.py +865 -0
 - tinygrad/runtime/autogen/cuda.py +5923 -0
 - tinygrad/runtime/autogen/hip.py +5909 -0
 - tinygrad/runtime/autogen/hsa.py +5761 -0
 - tinygrad/runtime/autogen/kfd.py +812 -0
 - tinygrad/runtime/autogen/nv_gpu.py +33328 -0
 - tinygrad/runtime/autogen/opencl.py +1795 -0
 - tinygrad/runtime/driver/hip_comgr.py +47 -0
 - tinygrad/runtime/driver/hsa.py +143 -0
 - tinygrad/runtime/graph/clang.py +38 -0
 - tinygrad/runtime/graph/cuda.py +81 -0
 - tinygrad/runtime/graph/hcq.py +143 -0
 - tinygrad/runtime/graph/hsa.py +171 -0
 - tinygrad/runtime/graph/metal.py +75 -0
 - tinygrad/runtime/ops_amd.py +564 -0
 - tinygrad/runtime/ops_clang.py +24 -77
 - tinygrad/runtime/ops_cuda.py +175 -89
 - tinygrad/runtime/ops_disk.py +56 -33
 - tinygrad/runtime/ops_gpu.py +92 -95
 - tinygrad/runtime/ops_hsa.py +278 -0
 - tinygrad/runtime/ops_llvm.py +39 -60
 - tinygrad/runtime/ops_metal.py +92 -74
 - tinygrad/runtime/ops_npy.py +9 -0
 - tinygrad/runtime/ops_nv.py +630 -0
 - tinygrad/runtime/ops_python.py +204 -0
 - tinygrad/shape/shapetracker.py +86 -254
 - tinygrad/shape/symbolic.py +166 -141
 - tinygrad/shape/view.py +296 -0
 - tinygrad/tensor.py +2619 -448
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
 - tinygrad-0.9.0.dist-info/METADATA +227 -0
 - tinygrad-0.9.0.dist-info/RECORD +60 -0
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
 - tinygrad/codegen/assembly.py +0 -190
 - tinygrad/codegen/optimizer.py +0 -379
 - tinygrad/codegen/search.py +0 -72
 - tinygrad/graph.py +0 -83
 - tinygrad/jit.py +0 -57
 - tinygrad/nn/image.py +0 -100
 - tinygrad/renderer/assembly_arm64.py +0 -169
 - tinygrad/renderer/assembly_ptx.py +0 -98
 - tinygrad/renderer/wgsl.py +0 -53
 - tinygrad/runtime/lib.py +0 -113
 - tinygrad/runtime/ops_cpu.py +0 -51
 - tinygrad/runtime/ops_hip.py +0 -82
 - tinygrad/runtime/ops_shm.py +0 -29
 - tinygrad/runtime/ops_torch.py +0 -30
 - tinygrad/runtime/ops_webgpu.py +0 -45
 - tinygrad-0.7.0.dist-info/METADATA +0 -212
 - tinygrad-0.7.0.dist-info/RECORD +0 -40
 - {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
 
| 
         @@ -0,0 +1,75 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from typing import List, Any, Dict, cast, Optional
         
     | 
| 
      
 2 
     | 
    
         
            +
            import Metal
         
     | 
| 
      
 3 
     | 
    
         
            +
            from tinygrad.dtype import dtypes
         
     | 
| 
      
 4 
     | 
    
         
            +
            from tinygrad.helpers import dedup, unwrap2, GraphException
         
     | 
| 
      
 5 
     | 
    
         
            +
            from tinygrad.device import Buffer
         
     | 
| 
      
 6 
     | 
    
         
            +
            from tinygrad.engine.realize import ExecItem, CompiledRunner
         
     | 
| 
      
 7 
     | 
    
         
            +
            from tinygrad.engine.jit import GraphRunner
         
     | 
| 
      
 8 
     | 
    
         
            +
            from tinygrad.shape.symbolic import Variable
         
     | 
| 
      
 9 
     | 
    
         
            +
            from tinygrad.runtime.ops_metal import wait_check
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
class MetalGraph(GraphRunner):
  """GraphRunner that replays a whole jit cache through one Metal indirect
  command buffer (ICB), so a graph launch is a single encoder pass instead of
  one command per kernel."""

  def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
    super().__init__(jit_cache, input_rawbuffers, var_vals)
    # ICBs can only encode compiled kernels; anything else can't be batched
    if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException

    # create metal batch exec: one concurrent-dispatch compute command per cached kernel
    icb_descriptor = Metal.MTLIndirectCommandBufferDescriptor.new()
    icb_descriptor.setCommandTypes_(Metal.MTLIndirectCommandType(Metal.MTLIndirectCommandTypeConcurrentDispatch))
    icb_descriptor.setInheritBuffers_(False)      # every command binds its own buffers
    icb_descriptor.setInheritPipelineState_(False)  # every command sets its own pipeline
    icb_descriptor.setMaxKernelBufferBindCount_(31)
    self.icb = self.device.device.newIndirectCommandBufferWithDescriptor_maxCommandCount_options_(icb_descriptor, len(self.jit_cache),
                                                                                                  Metal.MTLResourceOptions(0))
    if self.icb is None: raise GraphException("create indirect command buffer failed, does your system support this?")

    # one int32 slot per symbolic variable, shared by all commands that read it
    if len(self.vars): self.int_buf = self.device.allocator.alloc(len(self.vars)*dtypes.int32.itemsize)
    all_resources = [self.int_buf] if len(self.vars) else []

    for j,ji in enumerate(self.jit_cache):
      prg: CompiledRunner = cast(CompiledRunner, ji.prg)
      descriptor = Metal.MTLComputePipelineDescriptor.new()
      descriptor.setComputeFunction_(prg.clprg.fxn)
      descriptor.setSupportIndirectCommandBuffers_(True)
      icb_command = self.icb.indirectComputeCommandAtIndex_(j)
      icb_command.setComputePipelineState_(unwrap2(
        self.device.device.newComputePipelineStateWithDescriptor_options_reflection_error_(descriptor, Metal.MTLPipelineOption(0), None, None)))
      # bind kernel buffers first, then the variable slots after them
      for i,b in enumerate(ji.bufs):
        if b is not None:
          icb_command.setKernelBuffer_offset_atIndex_(b._buf, 0, i)  # b._buf: underlying MTLBuffer — presumably set by the Metal allocator
          all_resources.append(b._buf)
      for i,v in enumerate(prg.p.vars): icb_command.setKernelBuffer_offset_atIndex_(self.int_buf, self.vars.index(v)*4, len(ji.bufs)+i)
      # launch dims are baked in now unless they depend on variables updated per call
      if j not in self.jc_idx_with_updatable_launch_dims:
        global_size, local_size = prg.p.launch_dims(var_vals)
        icb_command.concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
      icb_command.setBarrier()  # serialize commands; NOTE(review): this orders kernels within the ICB — confirm required for correctness

    self.all_resources = dedup(all_resources)
    self.command_buffer: Any = None  # last committed MTLCommandBuffer, reused to wait on the previous run
    if len(self.vars): self.int_buf_view = self.int_buf.contents().as_buffer(self.int_buf.length()).cast('i')

  def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
    """Replay the graph with new inputs/variables. Returns GPU time in seconds
    when wait=True, else None (execution left in flight)."""
    # if the previous replay of this exact graph is still in flight, wait for it
    if self.command_buffer is not None and self.command_buffer in self.device.mtl_buffers_in_flight: wait_check(self.command_buffer)
    all_resources = dedup(self.all_resources + [x._buf for x in input_rawbuffers])

    # patch the ICB in place: input buffer bindings, variable launch dims, variable values
    for (j,i),input_idx in self.input_replace.items():
      self.icb.indirectComputeCommandAtIndex_(j).setKernelBuffer_offset_atIndex_(input_rawbuffers[input_idx]._buf, 0, i)
    for j in self.jc_idx_with_updatable_launch_dims:
      global_size, local_size = cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals)
      self.icb.indirectComputeCommandAtIndex_(j).concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size),
                                                                                                       Metal.MTLSize(*local_size))
    for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var]

    # encode a single executeCommandsInBuffer over the whole ICB and submit it
    command_buffer = self.device.mtl_queue.commandBuffer()
    encoder = command_buffer.computeCommandEncoder()
    encoder.useResources_count_usage_(all_resources, len(all_resources), Metal.MTLResourceUsageRead | Metal.MTLResourceUsageWrite)
    encoder.executeCommandsInBuffer_withRange_(self.icb, Metal.MTLIndirectCommandBufferExecutionRangeMake(0, len(self.jit_cache)))
    encoder.endEncoding()
    command_buffer.commit()
    self.command_buffer = command_buffer

    if wait:
      wait_check(command_buffer)
      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
    self.device.mtl_buffers_in_flight.append(command_buffer)
    return None
         
     | 
| 
         @@ -0,0 +1,564 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from __future__ import annotations
         
     | 
| 
      
 2 
     | 
    
         
            +
            from typing import Tuple, List, Any, cast
         
     | 
| 
      
 3 
     | 
    
         
            +
            import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
         
     | 
| 
      
 4 
     | 
    
         
            +
            from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
         
     | 
| 
      
 5 
     | 
    
         
            +
            from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
         
     | 
| 
      
 6 
     | 
    
         
            +
            from tinygrad.renderer.cstyle import AMDRenderer
         
     | 
| 
      
 7 
     | 
    
         
            +
            from tinygrad.runtime.driver.hip_comgr import compile_hip
         
     | 
| 
      
 8 
     | 
    
         
            +
            from tinygrad.runtime.ops_hsa import HSACompiler
         
     | 
| 
      
 9 
     | 
    
         
            +
            import tinygrad.runtime.autogen.kfd as kfd
         
     | 
| 
      
 10 
     | 
    
         
            +
            import tinygrad.runtime.autogen.hsa as hsa
         
     | 
| 
      
 11 
     | 
    
         
            +
            import tinygrad.runtime.autogen.amd_gpu as amd_gpu
         
     | 
| 
      
 12 
     | 
    
         
            +
            if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
# raw libc mmap/munmap with explicit signatures; used to map GPU doorbells and
# device memory at fixed addresses (ctypes defaults would truncate pointers)
libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int

# MOCKGPU=1 swaps in userspace fakes of mmap/munmap so the driver path can be
# exercised without real hardware
if getenv("MOCKGPU"):
  import extra.mockgpu.mockgpu  # noqa: F401
  libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore
  libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            def is_usable_gpu(gpu_id):
         
     | 
| 
      
 26 
     | 
    
         
            +
              try:
         
     | 
| 
      
 27 
     | 
    
         
            +
                with gpu_id.open() as f:
         
     | 
| 
      
 28 
     | 
    
         
            +
                  return int(f.read()) != 0
         
     | 
| 
      
 29 
     | 
    
         
            +
              except OSError:
         
     | 
| 
      
 30 
     | 
    
         
            +
                return False
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
         
     | 
| 
      
 33 
     | 
    
         
            +
              made = made_struct or user_struct(**kwargs)
         
     | 
| 
      
 34 
     | 
    
         
            +
              ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
         
     | 
| 
      
 35 
     | 
    
         
            +
              if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
         
     | 
| 
      
 36 
     | 
    
         
            +
              return made
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            def ioctls_from_header():
         
     | 
| 
      
 39 
     | 
    
         
            +
              #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
         
     | 
| 
      
 40 
     | 
    
         
            +
              #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
         
     | 
| 
      
 41 
     | 
    
         
            +
              #matches = re.findall(pattern, hdr, re.MULTILINE)
         
     | 
| 
      
 42 
     | 
    
         
            +
              # get this from python instead
         
     | 
| 
      
 43 
     | 
    
         
            +
              hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
         
     | 
| 
      
 44 
     | 
    
         
            +
              pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
         
     | 
| 
      
 45 
     | 
    
         
            +
              matches = re.findall(pattern, hdrpy, re.MULTILINE)
         
     | 
| 
      
 46 
     | 
    
         
            +
              idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
         
     | 
| 
      
 47 
     | 
    
         
            +
              fxns = {name.replace("AMDKFD_IOC_", "").lower():
         
     | 
| 
      
 48 
     | 
    
         
            +
                      functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
         
     | 
| 
      
 49 
     | 
    
         
            +
                      for name, idir, nr, sname in matches}
         
     | 
| 
      
 50 
     | 
    
         
            +
              return type("KIO", (object, ), fxns)
         
     | 
| 
      
 51 
     | 
    
         
            +
            kio = ioctls_from_header()
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
def create_sdma_packets():
  """Flatten the autogen SDMA packet structs into plain ctypes structs.

  The autogenerated SDMA_PKT_*_TAG structs nest each dword in a *_UNION; this
  hoists the union's bitfields to the top level (renaming on collision) so
  packets can be built with simple keyword constructors like sdma_pkts.copy(...).
  """
  # TODO: clean up this, if we want to keep it
  structs = {}
  for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
    names = set()   # field names already used, to detect collisions across unions
    fields = []     # flattened (name, ctype[, bits]) tuples, in declaration order
    for pkt_fields in pkt._fields_:
      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
      else:
        # autogen unions wrap their bitfield struct in a member named '_0'
        assert pkt_fields[1]._fields_[0][0] == '_0'
        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
          fname = union_fields[0]
          # disambiguate duplicate names by prefixing with the union name
          if fname in names: fname = pkt_fields[0]+fname
          names.add(fname)
          # merge together 64-bit fields, otherwise just append them
          # (relies on the _31_0 half having been appended immediately before)
          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
          else: fields.append(tuple([fname, *union_fields[1:]]))
    new_name = name[16:-4].lower()  # strip "struct_SDMA_PKT_" prefix and "_TAG" suffix
    structs[new_name] = init_c_struct_t(tuple(fields))
    # the flattened struct must lay out exactly like the original packet
    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
  return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
         
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
            class AMDCompiler(Compiler):
         
     | 
| 
      
 77 
     | 
    
         
            +
              def __init__(self, arch:str):
         
     | 
| 
      
 78 
     | 
    
         
            +
                self.arch = arch
         
     | 
| 
      
 79 
     | 
    
         
            +
                super().__init__(f"compile_hip_{self.arch}")
         
     | 
| 
      
 80 
     | 
    
         
            +
              def compile(self, src:str) -> bytes:
         
     | 
| 
      
 81 
     | 
    
         
            +
                try: return compile_hip(src, self.arch)
         
     | 
| 
      
 82 
     | 
    
         
            +
                except RuntimeError as e: raise CompileError(e)
         
     | 
| 
      
 83 
     | 
    
         
            +
             
     | 
| 
      
 84 
     | 
    
         
            +
PAGE_SIZE = 0x1000
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset  # byte offset of the value field inside amd_signal_t

# compute register offsets are rebased onto the PACKET3_SET_SH_REG window so
# they can be written directly with SET_SH_REG packets
BASE_ADDR = 0x00001260
SUB = amd_gpu.PACKET3_SET_SH_REG_START - BASE_ADDR

regCOMPUTE_PGM_LO = 0x1bac - SUB
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
regCOMPUTE_START_X = 0x1ba4 - SUB
regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
regCOMPUTE_RESTART_X = 0x1bbb - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB

# HDP flush handshake registers (request/done), written via WAIT_REG_MEM
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107

# VGT_EVENT_TYPE in navi10_enum.h
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
CS_PARTIAL_FLUSH = 0x7

# WAIT_REG_MEM compare functions
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

# COMPUTE_DISPATCH_INITIATOR / resource-limit bits
COMPUTE_SHADER_EN = 1
FORCE_START_AT_000 = 1 << 2
CS_W32_EN = 1 << 15
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
            class HWPM4Queue:
         
     | 
| 
      
 117 
     | 
    
         
            +
              def __init__(self): self.q = []
         
     | 
| 
      
 118 
     | 
    
         
            +
              def ptr(self) -> int: return len(self.q)
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
  def hdp_flush(self):
    """Queue a PM4 WAIT_REG_MEM that flushes HDP and polls until it completes.

    NOTE(review): WAIT_REG_MEM_OPERATION(1) with the REQ/DONE register pair
    appears to be the write-then-wait HDP flush handshake — confirm against
    the PM4 packet documentation.
    """
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
      amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | \
      amd_gpu.WAIT_REG_MEM_ENGINE(0), regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
  def invalidate_cache(self):
    """Queue a PM4 ACQUIRE_MEM invalidating GPU caches over the full address range.

    Returns self so calls can be chained when building a queue.
    """
    # overkill?
    # full 64-bit range, all GCR invalidate bits set (instruction, vector,
    # constant, GL1 and GL2 caches)
    addr=0x0
    sz=(1 << 64)-1
    gli=1
    glv=1
    glk=1
    gl1=1
    gl2=1
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
                 sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
                 amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
                 amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
                 amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
    return self
         
     | 
| 
      
 140 
     | 
    
         
            +
             
     | 
| 
      
 141 
     | 
    
         
            +
  def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), signal=None, signal_value=0):
    """Enqueue a PM4 dispatch of kernel `prg`.

    kernargs: GPU VA of the packed kernel-argument struct.
    global_size/local_size: launch grid and workgroup dims.
    signal/signal_value: if a signal is given, a signal packet is appended after the dispatch.
    Returns self so calls can be chained.
    """
    self.hdp_flush()
    self.invalidate_cache()

    code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
    assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
    # scratch/private memory is not supported by this path
    assert code.workitem_private_segment_byte_size == 0
    assert code.max_scratch_backing_memory_byte_size == 0
    assert code.kernel_code_prefetch_byte_size == 0
    rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2

    # this is required: LDS size is programmed in 512-byte granules in a 9-bit field
    lds_size = ((prg.group_segment_size + 511) // 512) & 0x1FF
    assert lds_size <= 0x80 # larger numbers stall the GPU

    # COMPUTE_PGM_LO/HI take the entry address shifted right by 8 (256-byte units)
    prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
               (prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
    # enable all CUs on every shader engine
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
    # USER_DATA_0/1 carry the kernarg pointer to the kernel preamble
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]

    if signal is not None: self.signal(signal, signal_value)
    return self
         
     | 
| 
      
 173 
     | 
    
         
            +
             
     | 
| 
      
 174 
     | 
    
         
            +
  def update_exec(self, cmd_ptr, global_size, local_size):
    # Patch the exec cmd with new launch dims.
    # The dword offsets below assume the fixed packet layout emitted by exec()
    # (hdp_flush + invalidate_cache + the SET_SH_REG chain) starting at cmd_ptr:
    # +59..61 is the local-size payload of the regCOMPUTE_START_X packet,
    # +67 is the DISPATCH_DIRECT header, +68..70 its global-size payload.
    assert self.q[cmd_ptr + 67] == amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),"The pointer does not point to a packet of this type"
    self.q[cmd_ptr + 59 : cmd_ptr + 62] = local_size
    self.q[cmd_ptr + 68 : cmd_ptr + 71] = global_size
         
     | 
| 
      
 179 
     | 
    
         
            +
             
     | 
| 
      
 180 
     | 
    
         
            +
              def wait(self, signal:hsa.amd_signal_t, value=0):
         
     | 
| 
      
 181 
     | 
    
         
            +
                addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
         
     | 
| 
      
 182 
     | 
    
         
            +
                self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
         
     | 
| 
      
 183 
     | 
    
         
            +
                  amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
         
     | 
| 
      
 184 
     | 
    
         
            +
                  amd_gpu.WAIT_REG_MEM_ENGINE(0), addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
         
     | 
| 
      
 185 
     | 
    
         
            +
                return self
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
              def timestamp(self, addr):
         
     | 
| 
      
 188 
     | 
    
         
            +
                self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
         
     | 
| 
      
 189 
     | 
    
         
            +
                  # event_index__mec_release_mem__end_of_pipe = 5
         
     | 
| 
      
 190 
     | 
    
         
            +
                  amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5),
         
     | 
| 
      
 191 
     | 
    
         
            +
                  # * 3 - send 64bit GPU counter value
         
     | 
| 
      
 192 
     | 
    
         
            +
                  amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(3) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(0) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
         
     | 
| 
      
 193 
     | 
    
         
            +
                  addr&0xFFFFFFFF, addr>>32, 0, 0, 0]
         
     | 
| 
      
 194 
     | 
    
         
            +
                return self
         
     | 
| 
      
 195 
     | 
    
         
            +
             
     | 
| 
      
 196 
     | 
    
         
            +
              def signal(self, signal:hsa.amd_signal_t, value=0):
         
     | 
| 
      
 197 
     | 
    
         
            +
                # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
         
     | 
| 
      
 198 
     | 
    
         
            +
                addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
         
     | 
| 
      
 199 
     | 
    
         
            +
                self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
         
     | 
| 
      
 200 
     | 
    
         
            +
                    # event_index__mec_release_mem__end_of_pipe = 5
         
     | 
| 
      
 201 
     | 
    
         
            +
                    # event_index__mec_release_mem__shader_done = 6
         
     | 
| 
      
 202 
     | 
    
         
            +
                    amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
         
     | 
| 
      
 203 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
         
     | 
| 
      
 204 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
         
     | 
| 
      
 205 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
         
     | 
| 
      
 206 
     | 
    
         
            +
                    amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
         
     | 
| 
      
 207 
     | 
    
         
            +
                    addr&0xFFFFFFFF, addr>>32,
         
     | 
| 
      
 208 
     | 
    
         
            +
                    value&0xFFFFFFFF, value>>32, 0]
         
     | 
| 
      
 209 
     | 
    
         
            +
                if signal.event_mailbox_ptr != 0:
         
     | 
| 
      
 210 
     | 
    
         
            +
                  self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
         
     | 
| 
      
 211 
     | 
    
         
            +
                    # event_index__mec_release_mem__end_of_pipe = 5
         
     | 
| 
      
 212 
     | 
    
         
            +
                    # event_index__mec_release_mem__shader_done = 6
         
     | 
| 
      
 213 
     | 
    
         
            +
                    amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
         
     | 
| 
      
 214 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
         
     | 
| 
      
 215 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
         
     | 
| 
      
 216 
     | 
    
         
            +
                      amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
         
     | 
| 
      
 217 
     | 
    
         
            +
                    amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
         
     | 
| 
      
 218 
     | 
    
         
            +
                    signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
         
     | 
| 
      
 219 
     | 
    
         
            +
                    signal.event_id&0xFFFFFFFF, signal.event_id>>32,
         
     | 
| 
      
 220 
     | 
    
         
            +
                    signal.event_id]
         
     | 
| 
      
 221 
     | 
    
         
            +
                return self
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
      
 223 
     | 
    
         
            +
              def submit(self, device:AMDDevice):
         
     | 
| 
      
 224 
     | 
    
         
            +
                wptr = device.pm4_write_pointer[0]
         
     | 
| 
      
 225 
     | 
    
         
            +
                pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
         
     | 
| 
      
 226 
     | 
    
         
            +
                for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
         
     | 
| 
      
 227 
     | 
    
         
            +
                device.pm4_write_pointer[0] = wptr + len(self.q)
         
     | 
| 
      
 228 
     | 
    
         
            +
                device.pm4_doorbell[0] = wptr + len(self.q)
         
     | 
| 
      
 229 
     | 
    
         
            +
                return self
         
     | 
| 
      
 230 
     | 
    
         
            +
             
     | 
| 
      
 231 
     | 
    
         
            +
# prebuilt sdma packets
# flush the HDP (host data path) so host-side writes become visible to the GPU
sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
# GCR request: write back AND invalidate the GL2/GL1/GLV/GLK caches (full range)
sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                              GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
                              GCR_CONTROL_GL2_RANGE=0)
# GCR request: write back only (no invalidate), used after a copy completes
sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                              GCR_CONTROL_GL2_RANGE=0)

# max bytes a single SDMA linear-copy packet moves (4 MiB); bigger copies are split
SDMA_MAX_COPY_SIZE = 0x400000
         
     | 
| 
      
 240 
     | 
    
         
            +
class HWCopyQueue:
  """Builds a list of SDMA packets and submits them to an AMDDevice's SDMA ring."""
  def __init__(self): self.q = []

  def submit(self, device:AMDDevice):
    """Copy every queued packet struct into the SDMA ring and ring the doorbell.

    Raises RuntimeError if the pending bytes would overrun the ring. A packet
    is never split across the ring wrap point: if it doesn't fit in the tail,
    the tail is zero-filled (zeros are NOPs for the SDMA engine — TODO confirm)
    and the packet is written at the start of the ring instead.
    """
    read_ptr = device.sdma_read_pointer[0]
    if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
    for cmd in self.q:
      if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
        # not enough room before the wrap: pad the tail with zeros and wrap
        ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
        device.sdma_doorbell_value += fill
      ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
      device.sdma_doorbell_value += cmdsz
    device.sdma_write_pointer[0] = device.sdma_doorbell_value
    device.sdma_doorbell[0] = device.sdma_doorbell_value
    return self

  def timestamp(self, addr):
    """Queue a packet that writes the global GPU timestamp to `addr`. Returns self."""
    self.q.append(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr))
    return self

  def copy(self, dest, src, copy_size):
    """Queue a `copy_size`-byte copy from GPU VA `src` to `dest`, split into
    SDMA_MAX_COPY_SIZE chunks, bracketed by cache invalidate/write-back. Returns self."""
    self.q.append(sdma_flush_hdp_pkt)  # TODO: do I need this?
    self.q.append(sdma_cache_inv)
    copied = 0
    copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
    for _ in range(copies_commands):
      step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
      # count field is (bytes - 1)
      self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
                                          count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
      copied += step_copy_size
    self.q.append(sdma_cache_wb)
    return self

  def signal(self, signal:hsa.amd_signal_t, value=0):
    """Queue a fence that sets the signal's value to `value`; if the signal has
    an event mailbox, also write event_id there and raise a trap so a host
    waiter wakes up. Returns self."""
    self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
    if signal.event_mailbox_ptr != 0:
      self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
      self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
    return self

  def wait(self, signal:hsa.amd_signal_t, value=0):
    """Queue a poll that stalls the SDMA engine until the signal's value is
    >= `value` (GEQ compare, 32-bit mask). Returns self."""
    self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=WAIT_REG_MEM_FUNCTION_GEQ,
                                        addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
                                        value=value, mask=0xffffffff, interval=0x04, retry_count=0xfff))
    return self
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2  # ELF section header constants: section type "program data", flag "occupies memory at runtime"
         
     | 
| 
      
 287 
     | 
    
         
            +
class AMDProgram:
  """A compiled GPU kernel loaded into device VRAM and dispatched via HWPM4Queue."""
  def __init__(self, device:AMDDevice, name:str, lib:bytes):
    # TODO; this API needs the type signature of the function and global_size/local_size
    self.device, self.name, self.lib = device, name, lib

    if DEBUG >= 6:
      # disassemble the code object for debugging (drops the s_code_end padding)
      asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
      print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))

    # minimal ELF64 parse: header fields from offset 0x20, then the section header table.
    # unpacked section tuple: (sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, sh_info, sh_addralign)
    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]

    # allocate enough VRAM to cover the highest end address (sh_addr + sh_size) of any PROGBITS section
    lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
    self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
    lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)

    # copy each allocatable PROGBITS section to its link-time address within the allocation
    for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
      if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]

    entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
    self.handle = self.lib_gpu.va_addr + entry_point
    # the first three dwords at the entry point hold the segment sizes — presumably
    # emitted there by the compiler driver; TODO confirm against the compile path
    self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
    self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
    self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
    self.kernargs_offset = 0
    assert self.private_segment_size <= self.device.max_private_segment_size, \
      f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"

    # make sure the GPU doesn't execute stale cached code from this address range
    HWPM4Queue().invalidate_cache().submit(self.device)

  # NOTE: no programs are ever freed
  # (NOTE(review): __del__ below does free lib_gpu — the note above looks stale; confirm)
  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    """Launch the kernel. `args` are buffers (need .va_addr), `vals` are int args.
    If wait=True, blocks until completion and returns the elapsed time
    (end_ts - start_ts scaled by 1e8 — presumably a 100 MHz tick; confirm)."""
    # bump-allocate kernargs from the device arena, wrapping when exhausted
    if self.device.kernargs_ptr + self.kernargs_segment_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
      self.device.kernargs_ptr = self.device.kernargs.va_addr
    assert self.device.kernargs_ptr + self.kernargs_segment_size <= (self.device.kernargs.va_addr + self.device.kernargs.size), "kernargs overrun"
    if not hasattr(self, "args_struct_t"):
      # lazily build a ctypes struct: one pointer per buffer, one int per value
      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
        raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
    args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
    for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
    for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

    # build the dispatch: wait for the previous timeline tick, optionally bracket
    # with timestamps, exec, then signal the next timeline value
    q = HWPM4Queue()
    q.wait(self.device.timeline_signal, self.device.timeline_value - 1)
    if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
    q.exec(self, self.device.kernargs_ptr, global_size, local_size)
    if wait: q.timestamp(ctypes.addressof(self.device.timeline_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)
    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
    self.device.timeline_value += 1
    self.device.kernargs_ptr += self.kernargs_segment_size

    if wait:
      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
      return (self.device.timeline_signal.end_ts - self.device.timeline_signal.start_ts) / 1e8
         
     | 
| 
      
 346 
     | 
    
         
            +
             
     | 
| 
      
 347 
     | 
    
         
            +
            class AMDAllocator(LRUAllocator):
         
     | 
| 
      
 348 
     | 
    
         
            +
              def __init__(self, device:AMDDevice):
         
     | 
| 
      
 349 
     | 
    
         
            +
                self.device = device
         
     | 
| 
      
 350 
     | 
    
         
            +
                # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
         
     | 
| 
      
 351 
     | 
    
         
            +
                self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(16)]
         
     | 
| 
      
 352 
     | 
    
         
            +
                self.b_timeline = [0] * len(self.b)
         
     | 
| 
      
 353 
     | 
    
         
            +
                self.b_next = 0
         
     | 
| 
      
 354 
     | 
    
         
            +
                super().__init__()
         
     | 
| 
      
 355 
     | 
    
         
            +
             
     | 
| 
      
 356 
     | 
    
         
            +
              def _alloc(self, size:int, options:BufferOptions):
         
     | 
| 
      
 357 
     | 
    
         
            +
                try:
         
     | 
| 
      
 358 
     | 
    
         
            +
                  if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
         
     | 
| 
      
 359 
     | 
    
         
            +
                  else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
         
     | 
| 
      
 360 
     | 
    
         
            +
                except OSError as e:
         
     | 
| 
      
 361 
     | 
    
         
            +
                  if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
         
     | 
| 
      
 362 
     | 
    
         
            +
                  else: raise
         
     | 
| 
      
 363 
     | 
    
         
            +
             
     | 
| 
      
 364 
     | 
    
         
            +
              def _free(self, gpumem, options:BufferOptions): self.device._gpu_free(gpumem)
         
     | 
| 
      
 365 
     | 
    
         
            +
              #def as_buffer(self, src:Any) -> memoryview:
         
     | 
| 
      
 366 
     | 
    
         
            +
              #  self.device.synchronize()
         
     | 
| 
      
 367 
     | 
    
         
            +
              #  return to_mv(src.va_addr, src.size)
         
     | 
| 
      
 368 
     | 
    
         
            +
             
     | 
| 
      
 369 
     | 
    
         
            +
              #def copy_from_fd(self, dest, fd, offset, size):
         
     | 
| 
      
 370 
     | 
    
         
            +
              #  fo = io.FileIO(fd, "a+b", closefd=False)
         
     | 
| 
      
 371 
     | 
    
         
            +
              #  fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
         
     | 
| 
      
 372 
     | 
    
         
            +
              #  copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
         
     | 
| 
      
 373 
     | 
    
         
            +
              #  for i in range(0, size+minor_offset, self.b[0].size):
         
     | 
| 
      
 374 
     | 
    
         
            +
              #    local_size = min(self.b[0].size, total_copy_size-i)
         
     | 
| 
      
 375 
     | 
    
         
            +
              #    copy_size = min(local_size-minor_offset, size-copied_in)
         
     | 
| 
      
 376 
     | 
    
         
            +
              #    if copy_size == 0: break
         
     | 
| 
      
 377 
     | 
    
         
            +
             
     | 
| 
      
 378 
     | 
    
         
            +
              #    fo.readinto(to_mv(self.b[1].va_addr, local_size))
         
     | 
| 
      
 379 
     | 
    
         
            +
              #    if i != 0: self.device._wait_signal(self.device.signal_sdma)
         
     | 
| 
      
 380 
     | 
    
         
            +
              #    self.b = self.b[::-1]
         
     | 
| 
      
 381 
     | 
    
         
            +
              #    self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)
         
     | 
| 
      
 382 
     | 
    
         
            +
             
     | 
| 
      
 383 
     | 
    
         
            +
              #    copied_in += copy_size
         
     | 
| 
      
 384 
     | 
    
         
            +
              #    minor_offset = 0 # only on the first
         
     | 
| 
      
 385 
     | 
    
         
            +
              #  self.device._wait_signal(self.device.signal_sdma)
         
     | 
| 
      
 386 
     | 
    
         
            +
             
     | 
| 
      
 387 
     | 
    
         
            +
  def copyin(self, dest, src: memoryview):
    """Copy host memory `src` into device buffer `dest` via the pool of staging buffers self.b.

    Chunks of at most self.b[0].size bytes are memcpy'd into a rotating staging buffer, then
    an SDMA copy to the destination is queued. Per-buffer timeline values (self.b_timeline)
    let CPU fills overlap with in-flight GPU copies of other staging buffers.
    """
    for i in range(0, src.nbytes, self.b[0].size):
      # rotate to the next staging buffer and wait until its previous copy has completed
      self.b_next = (self.b_next + 1) % len(self.b)
      AMDDevice._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
      ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
      # queue: wait for prior timeline work, SDMA-copy the chunk, then bump the timeline signal
      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
                   .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
                   .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
      # remember which timeline value frees this staging buffer for reuse
      self.b_timeline[self.b_next] = self.device.timeline_value
      self.device.timeline_value += 1
         
     | 
| 
      
 397 
     | 
    
         
            +
             
     | 
| 
      
 398 
     | 
    
         
            +
  def copyout(self, dest:memoryview, src):
    """Copy device buffer `src` into host memory `dest` through staging buffer self.b[0].

    Synchronous: waits for all outstanding device work first, then for each chunk queues an
    SDMA copy into the staging buffer, blocks until it signals, and memcpy's to the host.
    """
    self.device.synchronize()
    for i in range(0, dest.nbytes, self.b[0].size):
      HWCopyQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
                   .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
                   .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
      # block until this chunk has landed in the staging buffer before reading it
      AMDDevice._wait_signal(self.device.timeline_signal, self.device.timeline_value)
      self.device.timeline_value += 1

      ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
         
     | 
| 
      
 408 
     | 
    
         
            +
             
     | 
| 
      
 409 
     | 
    
         
            +
  def transfer(self, dest, src, sz:int, src_dev:AMDDevice, dest_dev:AMDDevice):
    """Device-to-device copy of `sz` bytes from `src` (on src_dev) to `dest` (on dest_dev).

    `dest` is first mapped into src_dev's address space so src_dev's SDMA engine can write
    it directly. The copy waits on both devices' timelines, signals src_dev's timeline, and
    a PM4 wait is queued on dest_dev so its compute stream orders after the transfer.
    """
    src_dev._gpu_map(dest)
    HWCopyQueue().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
                 .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
                 .copy(dest.va_addr, src.va_addr, sz) \
                 .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
    # make the destination device's queue wait for the copy completion signal
    HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
    src_dev.timeline_value += 1
             
     | 
| 
      
 418 
     | 
    
         
            +
# Linux mmap flags not exposed by the mmap module (values per <sys/mman.h> on Linux/x86-64)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class AMDDevice(Compiled):
  """tinygrad device backend driving AMD GPUs directly through the KFD (/dev/kfd) interface."""
  kfd:int = -1           # shared /dev/kfd file descriptor, opened once for all AMDDevice instances
  event_page:Any = None  # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None          # GTT page backing the shared pool of HSA signals
  signals_pool:List[hsa.amd_signal_t] = []  # free signals carved out of signals_page
  gpus:List[pathlib.Path] = []     # sysfs topology nodes of usable GPUs, indexed by device_id
         
     | 
| 
      
 425 
     | 
    
         
            +
             
     | 
| 
      
 426 
     | 
    
         
            +
  def _gpu_map(self, mem):
    """Map an allocation `mem` into this GPU's address space (no-op if already mapped).

    The set of GPUs a buffer is mapped on is tracked on the mem object itself via the
    dynamically-attached `mapped_gpu_ids` attribute.
    """
    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
    # remap on the full id list every time; the kernel reports how many succeeded
    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
    stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
    assert stm.n_success == len(mem.mapped_gpu_ids)
         
     | 
| 
      
 432 
     | 
    
         
            +
             
     | 
| 
      
 433 
     | 
    
         
            +
  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
    """Allocate GPU-accessible memory via KFD and return the alloc_memory_of_gpu result.

    `flags` selects the heap (VRAM/GTT/USERPTR); writable+executable+no-substitute are
    always added. USERPTR allocations are backed by real anonymous host pages; other heaps
    first reserve address space only (PROT_NONE + MAP_NORESERVE), then mmap the DRM fd over
    it with MAP_FIXED so the GPU VA and CPU VA coincide.
    """
    flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
    if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      # host memory: the anonymous mapping itself is the buffer
      buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
    else:
      # device memory: reserve VA only; the real backing comes from the DRM fd below
      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
    assert addr != 0xffffffffffffffff  # MAP_FAILED
    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      # replace the reservation with the actual device mapping at the exact same address
      buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
      assert addr == buf == mem.va_addr
    if map_to_gpu: self._gpu_map(mem)
    return mem
         
     | 
| 
      
 448 
     | 
    
         
            +
             
     | 
| 
      
 449 
     | 
    
         
            +
  def _gpu_free(self, mem):
    """Free an allocation from _gpu_alloc: unmap from every GPU it was mapped on,
    unmap the CPU mapping, then release the KFD handle."""
    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
      stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
      assert stm.n_success == len(gpus)
    libc.munmap(mem.va_addr, mem.size)
    kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
         
     | 
| 
      
 456 
     | 
    
         
            +
             
     | 
| 
      
 457 
     | 
    
         
            +
  @classmethod
  # set a signal's payload directly (no event wakeup; readers poll or wait on its event)
  def _set_signal(self, sig, value): sig.value = value
         
     | 
| 
      
 459 
     | 
    
         
            +
             
     | 
| 
      
 460 
     | 
    
         
            +
  @classmethod
  def _get_signal(self, value=0, sync_event=None) -> hsa.amd_signal_t:
    """Take a signal from the shared pool, initialized to `value`.

    If a KFD `sync_event` is supplied, the signal's mailbox/event fields are wired to it so
    _wait_signal can sleep in the kernel; otherwise they are zeroed (poll-only signal).
    Note: signals are popped from the pool and never returned to it.
    """
    self._set_signal(ret := self.signals_pool.pop(), value)
    if sync_event is not None:
      # each event slot is 8 bytes inside the shared event page
      ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
      ret.event_id = sync_event.event_id
    else: ret.event_mailbox_ptr = ret.event_id = 0
    return ret
         
     | 
| 
      
 468 
     | 
    
         
            +
             
     | 
| 
      
 469 
     | 
    
         
            +
  @classmethod
  def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
    """Block until `signal.value` reaches at least `value`, or raise after `timeout` ms.

    Sleeps on the signal's KFD event in 100ms slices between value checks, so the signal
    must have been created with a sync_event (asserted below).
    """
    assert signal.event_id != 0, "can't wait on this signal"
    evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)

    start_time = time.time() * 1000
    while (time.time() * 1000 - start_time) < timeout:
      if signal.value >= value: return
      # kernel-side wait; returns on event fire or its own 100ms timeout, then we re-check
      kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=100)
    raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
         
     | 
| 
      
 479 
     | 
    
         
            +
             
     | 
| 
      
 480 
     | 
    
         
            +
  def __init__(self, device:str=""):
    """Bring up one AMD GPU: open /dev/kfd (once, shared), discover the device, allocate
    signal/event pages, kernarg and scratch memory, and create the SDMA and PM4 queues.

    `device` is a tinygrad device string like "AMD:1"; the part after ':' selects the GPU.
    """
    if AMDDevice.kfd == -1:
      # first device instance opens the shared KFD fd and enumerates usable GPUs from sysfs
      AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
      AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
    # e.g. gfx_target_version 110001 -> arch "gfx1101" (major decimal, minor/step hex)
    target = int(self.properties['gfx_target_version'])
    self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
    kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)

    if AMDDevice.event_page is None:
      # first device allocates the class-wide signal pool and event page
      AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
        AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
      sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
    else:
      # later devices just map the shared pages into their own address space
      self._gpu_map(AMDDevice.signals_page)
      self._gpu_map(AMDDevice.event_page)
      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)

    self.timeline_value: int = 1
    self.timeline_signal = AMDDevice._get_signal(sync_event=sync_event)
    # spare signal used by synchronize() when the timeline value wraps
    self._shadow_timeline_signal = AMDDevice._get_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))

    # kernarg arena: bump-allocated via kernargs_ptr, reset on synchronize()
    self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.kernargs_ptr = self.kernargs.va_addr

    # scratch setup
    max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
    max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
    self.max_private_segment_size = 4096
    wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
    self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

    # SDMA Queue (copies); gart page holds the read/write pointers the hardware updates
    self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.sdma_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)

    # doorbell page
    self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff)  # doorbell is two pages
    self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)

    # 64-bit views onto the SDMA queue pointers and its doorbell register
    self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
    self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
    self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")
    self.sdma_doorbell_value = 0

    # PM4 Queue (compute dispatch)
    self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.pm4_queue = kio.create_queue(AMDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
      # TODO: are these needed? (i know eop is)
      ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
      ctl_stack_size = 0xa000,
      write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)

    self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
    self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
    self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 8).cast("Q")

    # local import avoids a circular dependency between device and graph modules
    from tinygrad.runtime.graph.hcq import HCQGraph
    super().__init__(device, AMDAllocator(self), AMDRenderer(), HSACompiler(self.arch),
                     functools.partial(AMDProgram, self),
                     functools.partial(HCQGraph, AMDDevice, HWPM4Queue, HWCopyQueue))
         
     | 
| 
      
 555 
     | 
    
         
            +
             
     | 
| 
      
 556 
     | 
    
         
            +
  def synchronize(self):
    """Wait for all submitted work to complete (timeline reaches timeline_value - 1)."""
    AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)

    # reset kernargs
    self.kernargs_ptr = self.kernargs.va_addr
    if self.timeline_value > (1 << 31):
      # timeline about to overflow 32 bits: swap in the shadow signal and restart from 1,
      # clearing the allocator's per-staging-buffer timeline bookkeeping to match
      self.timeline_signal, self._shadow_timeline_signal = self._shadow_timeline_signal, self.timeline_signal
      self.timeline_signal.value, self.timeline_value = 0, 1
      cast(AMDAllocator, self.allocator).b_timeline = [0] * len(cast(AMDAllocator, self.allocator).b)
         
     |