tinygrad 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/devectorizer.py +247 -0
 - tinygrad/codegen/expander.py +121 -0
 - tinygrad/codegen/kernel.py +35 -37
 - tinygrad/codegen/linearize.py +19 -10
 - tinygrad/codegen/lowerer.py +31 -8
 - tinygrad/codegen/symbolic.py +476 -0
 - tinygrad/codegen/transcendental.py +10 -0
 - tinygrad/device.py +28 -11
 - tinygrad/dtype.py +12 -3
 - tinygrad/engine/jit.py +3 -2
 - tinygrad/engine/multi.py +0 -1
 - tinygrad/engine/realize.py +7 -4
 - tinygrad/engine/schedule.py +227 -255
 - tinygrad/engine/search.py +20 -27
 - tinygrad/gradient.py +3 -0
 - tinygrad/helpers.py +7 -4
 - tinygrad/nn/state.py +2 -2
 - tinygrad/ops.py +64 -329
 - tinygrad/renderer/__init__.py +19 -3
 - tinygrad/renderer/cstyle.py +39 -18
 - tinygrad/renderer/llvmir.py +55 -18
 - tinygrad/renderer/ptx.py +6 -2
 - tinygrad/renderer/wgsl.py +20 -12
 - tinygrad/runtime/autogen/libc.py +404 -71
 - tinygrad/runtime/autogen/{libpciaccess.py → pci.py} +25 -715
 - tinygrad/runtime/autogen/webgpu.py +6985 -0
 - tinygrad/runtime/graph/metal.py +28 -29
 - tinygrad/runtime/ops_amd.py +37 -34
 - tinygrad/runtime/{ops_clang.py → ops_cpu.py} +4 -2
 - tinygrad/runtime/ops_disk.py +1 -1
 - tinygrad/runtime/ops_dsp.py +59 -33
 - tinygrad/runtime/ops_llvm.py +14 -12
 - tinygrad/runtime/ops_metal.py +78 -62
 - tinygrad/runtime/ops_nv.py +9 -6
 - tinygrad/runtime/ops_python.py +5 -5
 - tinygrad/runtime/ops_webgpu.py +200 -38
 - tinygrad/runtime/support/am/amdev.py +23 -11
 - tinygrad/runtime/support/am/ip.py +10 -10
 - tinygrad/runtime/support/elf.py +2 -0
 - tinygrad/runtime/support/hcq.py +7 -5
 - tinygrad/runtime/support/llvm.py +8 -14
 - tinygrad/shape/shapetracker.py +3 -2
 - tinygrad/shape/view.py +2 -3
 - tinygrad/spec.py +21 -20
 - tinygrad/tensor.py +150 -90
 - tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
 - tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
 - tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
 - tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
 - tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
 - tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
 - tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
 - tinygrad/viz/index.html +544 -0
 - tinygrad/viz/perfetto.html +178 -0
 - tinygrad/viz/serve.py +205 -0
 - {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/METADATA +20 -8
 - tinygrad-0.10.2.dist-info/RECORD +99 -0
 - tinygrad/codegen/rewriter.py +0 -516
 - tinygrad-0.10.1.dist-info/RECORD +0 -86
 - {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
 - {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +0 -0
 - {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
 
    
        tinygrad/runtime/graph/metal.py
    CHANGED
    
    | 
         @@ -22,16 +22,16 @@ class MetalGraph(GraphRunner): 
     | 
|
| 
       22 
22 
     | 
    
         
             
                if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
24 
     | 
    
         
             
                # create metal batch exec
         
     | 
| 
       25 
     | 
    
         
            -
                icb_descriptor = msg(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor") 
     | 
| 
       26 
     | 
    
         
            -
                msg( 
     | 
| 
       27 
     | 
    
         
            -
                msg( 
     | 
| 
       28 
     | 
    
         
            -
                msg( 
     | 
| 
       29 
     | 
    
         
            -
                msg( 
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
                self.icb = msg( 
     | 
| 
       32 
     | 
    
         
            -
                  icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache 
     | 
| 
      
 25 
     | 
    
         
            +
                icb_descriptor = msg("new", objc_instance)(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor"))
         
     | 
| 
      
 26 
     | 
    
         
            +
                msg("setCommandTypes:")(icb_descriptor, MTLIndirectCommandType.MTLIndirectCommandTypeConcurrentDispatch)
         
     | 
| 
      
 27 
     | 
    
         
            +
                msg("setInheritBuffers:")(icb_descriptor, False)
         
     | 
| 
      
 28 
     | 
    
         
            +
                msg("setInheritPipelineState:")(icb_descriptor, False)
         
     | 
| 
      
 29 
     | 
    
         
            +
                msg("setMaxKernelBufferBindCount:")(icb_descriptor, 31)
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                self.icb = msg("newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", objc_instance)(self.dev.sysdevice,
         
     | 
| 
      
 32 
     | 
    
         
            +
                  icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache)
         
     | 
| 
       33 
33 
     | 
    
         
             
                if self.icb.value is None: raise GraphException("create indirect command buffer failed, does your system support this?")
         
     | 
| 
       34 
     | 
    
         
            -
                icb_label = bytes(msg(msg( 
     | 
| 
      
 34 
     | 
    
         
            +
                icb_label = bytes(msg("UTF8String", ctypes.c_char_p)(msg("description", objc_instance)(self.icb))).decode()
         
     | 
| 
       35 
35 
     | 
    
         
             
                self.needs_icb_fix = int("AGXG15XFamilyIndirectCommandBuffer" not in icb_label)    # not required on M3
         
     | 
| 
       36 
36 
     | 
    
         | 
| 
       37 
37 
     | 
    
         
             
                if len(self.vars): self.int_buf = self.dev.allocator.alloc(len(self.vars)*dtypes.int32.itemsize)
         
     | 
| 
         @@ -39,18 +39,18 @@ class MetalGraph(GraphRunner): 
     | 
|
| 
       39 
39 
     | 
    
         
             
                all_pipelines = []
         
     | 
| 
       40 
40 
     | 
    
         
             
                for j,ji in enumerate(jit_cache):
         
     | 
| 
       41 
41 
     | 
    
         
             
                  prg: CompiledRunner = cast(CompiledRunner, ji.prg)
         
     | 
| 
       42 
     | 
    
         
            -
                  icb_command = msg( 
     | 
| 
      
 42 
     | 
    
         
            +
                  icb_command = msg("indirectComputeCommandAtIndex:", objc_instance)(self.icb, j)
         
     | 
| 
       43 
43 
     | 
    
         
             
                  all_pipelines.append(prg._prg.pipeline_state)
         
     | 
| 
       44 
     | 
    
         
            -
                  msg( 
     | 
| 
      
 44 
     | 
    
         
            +
                  msg("setComputePipelineState:")(icb_command, prg._prg.pipeline_state)
         
     | 
| 
       45 
45 
     | 
    
         
             
                  for i,b in enumerate(ji.bufs):
         
     | 
| 
       46 
46 
     | 
    
         
             
                    if b is not None and b not in input_rawbuffers:
         
     | 
| 
       47 
     | 
    
         
            -
                      msg( 
     | 
| 
      
 47 
     | 
    
         
            +
                      msg("setKernelBuffer:offset:atIndex:")(icb_command, b._buf.buf, b._buf.offset, i)
         
     | 
| 
       48 
48 
     | 
    
         
             
                      all_resources.append(b._buf.buf)
         
     | 
| 
       49 
     | 
    
         
            -
                  for i,v in enumerate(prg.p.vars): msg( 
     | 
| 
      
 49 
     | 
    
         
            +
                  for i,v in enumerate(prg.p.vars): msg("setKernelBuffer:offset:atIndex:")(icb_command, self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i)
         
     | 
| 
       50 
50 
     | 
    
         | 
| 
       51 
51 
     | 
    
         
             
                  global_size, local_size = prg.p.launch_dims(var_vals)
         
     | 
| 
       52 
     | 
    
         
            -
                  msg( 
     | 
| 
       53 
     | 
    
         
            -
                  msg( 
     | 
| 
      
 52 
     | 
    
         
            +
                  msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(icb_command, to_struct(*global_size), to_struct(*local_size))
         
     | 
| 
      
 53 
     | 
    
         
            +
                  msg("setBarrier")(icb_command)
         
     | 
| 
       54 
54 
     | 
    
         | 
| 
       55 
55 
     | 
    
         
             
                self.all_resources = dedup(all_resources)
         
     | 
| 
       56 
56 
     | 
    
         
             
                self.all_pipelines = dedup(all_pipelines)
         
     | 
| 
         @@ -64,18 +64,17 @@ class MetalGraph(GraphRunner): 
     | 
|
| 
       64 
64 
     | 
    
         
             
                all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers])
         
     | 
| 
       65 
65 
     | 
    
         | 
| 
       66 
66 
     | 
    
         
             
                for (j,i),input_idx in self.input_replace.items():
         
     | 
| 
       67 
     | 
    
         
            -
                  computeCommand = msg( 
     | 
| 
       68 
     | 
    
         
            -
                  msg( 
     | 
| 
       69 
     | 
    
         
            -
                                                                                             input_rawbuffers[input_idx]._buf.offset, i)
         
     | 
| 
      
 67 
     | 
    
         
            +
                  computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
         
     | 
| 
      
 68 
     | 
    
         
            +
                  msg("setKernelBuffer:offset:atIndex:")(computeCommand, input_rawbuffers[input_idx]._buf.buf, input_rawbuffers[input_idx]._buf.offset, i)
         
     | 
| 
       70 
69 
     | 
    
         | 
| 
       71 
70 
     | 
    
         
             
                for j, global_dims, local_dims in self.updated_launch_dims(var_vals):
         
     | 
| 
       72 
     | 
    
         
            -
                  computeCommand = msg( 
     | 
| 
       73 
     | 
    
         
            -
                  msg( 
     | 
| 
      
 71 
     | 
    
         
            +
                  computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
         
     | 
| 
      
 72 
     | 
    
         
            +
                  msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(computeCommand, to_struct(*global_dims), to_struct(*local_dims))
         
     | 
| 
       74 
73 
     | 
    
         
             
                for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var]
         
     | 
| 
       75 
74 
     | 
    
         | 
| 
       76 
     | 
    
         
            -
                command_buffer = msg( 
     | 
| 
       77 
     | 
    
         
            -
                encoder = msg( 
     | 
| 
       78 
     | 
    
         
            -
                msg( 
     | 
| 
      
 75 
     | 
    
         
            +
                command_buffer = msg("commandBuffer", objc_instance)(self.dev.mtl_queue)
         
     | 
| 
      
 76 
     | 
    
         
            +
                encoder = msg("computeCommandEncoder", objc_instance)(command_buffer)
         
     | 
| 
      
 77 
     | 
    
         
            +
                msg("useResources:count:usage:")(encoder, (objc_id * len(all_resources))(*all_resources), len(all_resources),
         
     | 
| 
       79 
78 
     | 
    
         
             
                    MTLResourceUsage.MTLResourceUsageRead | MTLResourceUsage.MTLResourceUsageWrite)
         
     | 
| 
       80 
79 
     | 
    
         | 
| 
       81 
80 
     | 
    
         
             
                # NOTE: the pipelines likely need to be added to the used resources to fix the crash on M1/M2, but I haven't figured out how
         
     | 
| 
         @@ -85,13 +84,13 @@ class MetalGraph(GraphRunner): 
     | 
|
| 
       85 
84 
     | 
    
         
             
                # to repro the crash (which can also crash other running GPU apps), run with FIX_METAL_ICB=0
         
     | 
| 
       86 
85 
     | 
    
         
             
                if getenv("FIX_METAL_ICB", self.needs_icb_fix):
         
     | 
| 
       87 
86 
     | 
    
         
             
                  for ps in self.all_pipelines:
         
     | 
| 
       88 
     | 
    
         
            -
                    msg( 
     | 
| 
       89 
     | 
    
         
            -
                    msg( 
     | 
| 
      
 87 
     | 
    
         
            +
                    msg("setComputePipelineState:")(encoder, ps)
         
     | 
| 
      
 88 
     | 
    
         
            +
                    msg("dispatchThreadgroups:threadsPerThreadgroup:")(encoder, to_struct(0,0,0), to_struct(0,0,0))
         
     | 
| 
       90 
89 
     | 
    
         | 
| 
       91 
     | 
    
         
            -
                msg( 
     | 
| 
       92 
     | 
    
         
            -
                msg( 
     | 
| 
       93 
     | 
    
         
            -
                msg( 
     | 
| 
       94 
     | 
    
         
            -
                msg( 
     | 
| 
      
 90 
     | 
    
         
            +
                msg("executeCommandsInBuffer:withRange:")(encoder, self.icb, self.range)
         
     | 
| 
      
 91 
     | 
    
         
            +
                msg("endEncoding")(encoder)
         
     | 
| 
      
 92 
     | 
    
         
            +
                msg("setLabel:")(command_buffer, to_ns_str(f"batched {len(self.jit_cache)}"))
         
     | 
| 
      
 93 
     | 
    
         
            +
                msg("commit")(command_buffer)
         
     | 
| 
       95 
94 
     | 
    
         
             
                self.command_buffer = command_buffer
         
     | 
| 
       96 
95 
     | 
    
         | 
| 
       97 
96 
     | 
    
         
             
                self.dev.mtl_buffers_in_flight.append(command_buffer)
         
     | 
    
        tinygrad/runtime/ops_amd.py
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from __future__ import annotations
         
     | 
| 
       2 
2 
     | 
    
         
             
            from typing import Any, cast
         
     | 
| 
       3 
     | 
    
         
            -
            import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select 
     | 
| 
      
 3 
     | 
    
         
            +
            import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
         
     | 
| 
       4 
4 
     | 
    
         
             
            assert sys.platform != 'win32'
         
     | 
| 
       5 
5 
     | 
    
         
             
            from dataclasses import dataclass
         
     | 
| 
       6 
6 
     | 
    
         
             
            from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
         
     | 
| 
       7 
7 
     | 
    
         
             
            from tinygrad.ops import sint
         
     | 
| 
       8 
     | 
    
         
            -
            from tinygrad.device import BufferSpec
         
     | 
| 
      
 8 
     | 
    
         
            +
            from tinygrad.device import BufferSpec, CPUProgram
         
     | 
| 
       9 
9 
     | 
    
         
             
            from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
         
     | 
| 
       10 
10 
     | 
    
         
             
            from tinygrad.renderer.cstyle import AMDRenderer
         
     | 
| 
       11 
     | 
    
         
            -
            from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc,  
     | 
| 
      
 11 
     | 
    
         
            +
            from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
         
     | 
| 
       12 
12 
     | 
    
         
             
            from tinygrad.runtime.autogen.am import am
         
     | 
| 
       13 
13 
     | 
    
         
             
            from tinygrad.runtime.support.compiler_hip import AMDCompiler
         
     | 
| 
       14 
14 
     | 
    
         
             
            from tinygrad.runtime.support.elf import elf_loader
         
     | 
| 
         @@ -151,13 +151,11 @@ class AMDComputeQueue(HWQueue): 
     | 
|
| 
       151 
151 
     | 
    
         
             
                for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
         
     | 
| 
       152 
152 
     | 
    
         | 
| 
       153 
153 
     | 
    
         
             
                dev.compute_queue.put_value += len(cmds)
         
     | 
| 
       154 
     | 
    
         
            -
                dev.compute_queue. 
     | 
| 
       155 
     | 
    
         
            -
                dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
         
     | 
| 
      
 154 
     | 
    
         
            +
                dev.compute_queue.signal_doorbell()
         
     | 
| 
       156 
155 
     | 
    
         | 
| 
       157 
     | 
    
         
            -
            SDMA_MAX_COPY_SIZE = 0x400000
         
     | 
| 
       158 
156 
     | 
    
         
             
            class AMDCopyQueue(HWQueue):
         
     | 
| 
       159 
     | 
    
         
            -
              def __init__(self):
         
     | 
| 
       160 
     | 
    
         
            -
                self.internal_cmd_sizes = []
         
     | 
| 
      
 157 
     | 
    
         
            +
              def __init__(self, max_copy_size=0x40000000):
         
     | 
| 
      
 158 
     | 
    
         
            +
                self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
         
     | 
| 
       161 
159 
     | 
    
         
             
                super().__init__()
         
     | 
| 
       162 
160 
     | 
    
         | 
| 
       163 
161 
     | 
    
         
             
              def q(self, *arr):
         
     | 
| 
         @@ -165,10 +163,10 @@ class AMDCopyQueue(HWQueue): 
     | 
|
| 
       165 
163 
     | 
    
         
             
                self.internal_cmd_sizes.append(len(arr))
         
     | 
| 
       166 
164 
     | 
    
         | 
| 
       167 
165 
     | 
    
         
             
              def copy(self, dest:sint, src:sint, copy_size:int):
         
     | 
| 
       168 
     | 
    
         
            -
                copied, copy_commands = 0, (copy_size +  
     | 
| 
      
 166 
     | 
    
         
            +
                copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
         
     | 
| 
       169 
167 
     | 
    
         | 
| 
       170 
168 
     | 
    
         
             
                for _ in range(copy_commands):
         
     | 
| 
       171 
     | 
    
         
            -
                  step_copy_size = min(copy_size - copied,  
     | 
| 
      
 169 
     | 
    
         
            +
                  step_copy_size = min(copy_size - copied, self.max_copy_size)
         
     | 
| 
       172 
170 
     | 
    
         | 
| 
       173 
171 
     | 
    
         
             
                  self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
         
     | 
| 
       174 
172 
     | 
    
         
             
                    amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
         
     | 
| 
         @@ -237,8 +235,7 @@ class AMDCopyQueue(HWQueue): 
     | 
|
| 
       237 
235 
     | 
    
         
             
                  dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
         
     | 
| 
       238 
236 
     | 
    
         
             
                  dev.sdma_queue.put_value += rem_packet_cnt * 4
         
     | 
| 
       239 
237 
     | 
    
         | 
| 
       240 
     | 
    
         
            -
                dev.sdma_queue. 
     | 
| 
       241 
     | 
    
         
            -
                dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
         
     | 
| 
      
 238 
     | 
    
         
            +
                dev.sdma_queue.signal_doorbell()
         
     | 
| 
       242 
239 
     | 
    
         | 
| 
       243 
240 
     | 
    
         
             
            class AMDProgram(HCQProgram):
         
     | 
| 
       244 
241 
     | 
    
         
             
              def __init__(self, dev:AMDDevice, name:str, lib:bytes):
         
     | 
| 
         @@ -280,8 +277,6 @@ class AMDProgram(HCQProgram): 
     | 
|
| 
       280 
277 
     | 
    
         
             
                if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
         
     | 
| 
       281 
278 
     | 
    
         | 
| 
       282 
279 
     | 
    
         
             
            class AMDAllocator(HCQAllocator['AMDDevice']):
         
     | 
| 
       283 
     | 
    
         
            -
              def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
         
     | 
| 
       284 
     | 
    
         
            -
             
     | 
| 
       285 
280 
     | 
    
         
             
              def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
         
     | 
| 
       286 
281 
     | 
    
         
             
                return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
         
     | 
| 
       287 
282 
     | 
    
         | 
| 
         @@ -301,6 +296,13 @@ class AMDQueueDesc: 
     | 
|
| 
       301 
296 
     | 
    
         
             
              doorbell: memoryview
         
     | 
| 
       302 
297 
     | 
    
         
             
              put_value: int = 0
         
     | 
| 
       303 
298 
     | 
    
         | 
| 
      
 299 
     | 
    
         
            +
              def signal_doorbell(self):
         
     | 
| 
      
 300 
     | 
    
         
            +
                self.write_ptr[0] = self.put_value
         
     | 
| 
      
 301 
     | 
    
         
            +
             
     | 
| 
      
 302 
     | 
    
         
            +
                # Ensure all prior writes are visible to the GPU.
         
     | 
| 
      
 303 
     | 
    
         
            +
                if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
         
     | 
| 
      
 304 
     | 
    
         
            +
                self.doorbell[0] = self.put_value
         
     | 
| 
      
 305 
     | 
    
         
            +
             
     | 
| 
       304 
306 
     | 
    
         
             
            class KFDIface:
         
     | 
| 
       305 
307 
     | 
    
         
             
              kfd:HWInterface|None = None
         
     | 
| 
       306 
308 
     | 
    
         
             
              event_page:HCQBuffer|None = None
         
     | 
| 
         @@ -426,6 +428,7 @@ class KFDIface: 
     | 
|
| 
       426 
428 
     | 
    
         
             
            class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
         
     | 
| 
       427 
429 
     | 
    
         | 
| 
       428 
430 
     | 
    
         
             
            class PCIIface:
         
     | 
| 
      
 431 
     | 
    
         
            +
              supported_devs:list[int] = [0x744c, 0x7480]
         
     | 
| 
       429 
432 
     | 
    
         
             
              vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
         
     | 
| 
       430 
433 
     | 
    
         
             
              vfio_fd:HWInterface
         
     | 
| 
       431 
434 
     | 
    
         
             
              gpus:list[Any] = []
         
     | 
| 
         @@ -434,25 +437,23 @@ class PCIIface: 
     | 
|
| 
       434 
437 
     | 
    
         
             
                self.dev = dev
         
     | 
| 
       435 
438 
     | 
    
         | 
| 
       436 
439 
     | 
    
         
             
                if first_dev:=len(PCIIface.gpus) == 0:
         
     | 
| 
       437 
     | 
    
         
            -
                   
     | 
| 
       438 
     | 
    
         
            -
             
     | 
| 
       439 
     | 
    
         
            -
             
     | 
| 
       440 
     | 
    
         
            -
                    if  
     | 
| 
      
 440 
     | 
    
         
            +
                  for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
         
     | 
| 
      
 441 
     | 
    
         
            +
                    vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
         
     | 
| 
      
 442 
     | 
    
         
            +
                    device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
         
     | 
| 
      
 443 
     | 
    
         
            +
                    if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
         
     | 
| 
       441 
444 
     | 
    
         | 
| 
       442 
445 
     | 
    
         
             
                  # TODO: visible_devices should be handled layer above this?
         
     | 
| 
       443 
446 
     | 
    
         
             
                  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
         
     | 
| 
       444 
447 
     | 
    
         
             
                  PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
         
     | 
| 
       445 
448 
     | 
    
         | 
| 
       446 
     | 
    
         
            -
                self. 
     | 
| 
       447 
     | 
    
         
            -
                self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
         
     | 
| 
      
 449 
     | 
    
         
            +
                self.pcibus = PCIIface.gpus[dev_id]
         
     | 
| 
       448 
450 
     | 
    
         | 
| 
       449 
451 
     | 
    
         
             
                # Unbind the device from the kernel driver
         
     | 
| 
       450 
452 
     | 
    
         
             
                if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
         
     | 
| 
       451 
453 
     | 
    
         
             
                  HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
         
     | 
| 
       452 
     | 
    
         
            -
                  HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
         
     | 
| 
       453 
454 
     | 
    
         | 
| 
       454 
     | 
    
         
            -
                 
     | 
| 
       455 
     | 
    
         
            -
                 
     | 
| 
      
 455 
     | 
    
         
            +
                supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
         
     | 
| 
      
 456 
     | 
    
         
            +
                HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
         
     | 
| 
       456 
457 
     | 
    
         | 
| 
       457 
458 
     | 
    
         
             
                # Try to init vfio. Use it if success.
         
     | 
| 
       458 
459 
     | 
    
         
             
                if PCIIface.vfio:
         
     | 
| 
         @@ -485,16 +486,20 @@ class PCIIface: 
     | 
|
| 
       485 
486 
     | 
    
         
             
                  irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
         
     | 
| 
       486 
487 
     | 
    
         
             
                    argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
         
     | 
| 
       487 
488 
     | 
    
         
             
                  vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
         
     | 
| 
       488 
     | 
    
         
            -
                else:  
     | 
| 
      
 489 
     | 
    
         
            +
                else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
         
     | 
| 
       489 
490 
     | 
    
         | 
| 
       490 
491 
     | 
    
         
             
                self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
         
     | 
| 
       491 
     | 
    
         
            -
                self. 
     | 
| 
      
 492 
     | 
    
         
            +
                self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
         
     | 
| 
      
 493 
     | 
    
         
            +
                self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
         
     | 
| 
      
 494 
     | 
    
         
            +
             
     | 
| 
      
 495 
     | 
    
         
            +
                bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
         
     | 
| 
      
 496 
     | 
    
         
            +
                self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
         
     | 
| 
       492 
497 
     | 
    
         | 
| 
       493 
498 
     | 
    
         
             
                self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
         
     | 
| 
       494 
499 
     | 
    
         
             
                self.doorbell_cpu_addr = mv_address(dbell)
         
     | 
| 
       495 
500 
     | 
    
         | 
| 
       496 
     | 
    
         
            -
                 
     | 
| 
       497 
     | 
    
         
            -
                 
     | 
| 
      
 501 
     | 
    
         
            +
                pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
         
     | 
| 
      
 502 
     | 
    
         
            +
                self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
         
     | 
| 
       498 
503 
     | 
    
         | 
| 
       499 
504 
     | 
    
         
             
                array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
         
     | 
| 
       500 
505 
     | 
    
         
             
                simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
         
     | 
| 
         @@ -503,8 +508,9 @@ class PCIIface: 
     | 
|
| 
       503 
508 
     | 
    
         
             
                  'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
         
     | 
| 
       504 
509 
     | 
    
         | 
| 
       505 
510 
     | 
    
         
             
              def _map_pci_range(self, bar, off=0, addr=0, size=None):
         
     | 
| 
       506 
     | 
    
         
            -
                fd, sz = self.bar_fds[bar], size or self. 
     | 
| 
       507 
     | 
    
         
            -
                 
     | 
| 
      
 511 
     | 
    
         
            +
                fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
         
     | 
| 
      
 512 
     | 
    
         
            +
                libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
         
     | 
| 
      
 513 
     | 
    
         
            +
                return to_mv(loc, sz)
         
     | 
| 
       508 
514 
     | 
    
         | 
| 
       509 
515 
     | 
    
         
             
              def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
         
     | 
| 
       510 
516 
     | 
    
         
             
                if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
         
     | 
| 
         @@ -530,8 +536,7 @@ class PCIIface: 
     | 
|
| 
       530 
536 
     | 
    
         
             
                if self.dev in mem.meta.mapped_devs: return
         
     | 
| 
       531 
537 
     | 
    
         
             
                mem.meta.mapped_devs.append(self.dev)
         
     | 
| 
       532 
538 
     | 
    
         | 
| 
       533 
     | 
    
         
            -
                 
     | 
| 
       534 
     | 
    
         
            -
                paddrs = [(paddr if mem.meta.mapping.system else (paddr + owner_sys_base), size) for paddr, size in mem.meta.mapping.paddrs]
         
     | 
| 
      
 539 
     | 
    
         
            +
                paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
         
     | 
| 
       535 
540 
     | 
    
         
             
                self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
         
     | 
| 
       536 
541 
     | 
    
         | 
| 
       537 
542 
     | 
    
         
             
              def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
         
     | 
| 
         @@ -596,8 +601,6 @@ class AMDDevice(HCQCompiled): 
     | 
|
| 
       596 
601 
     | 
    
         
             
                self.max_private_segment_size = 0
         
     | 
| 
       597 
602 
     | 
    
         
             
                self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
         
     | 
| 
       598 
603 
     | 
    
         | 
| 
       599 
     | 
    
         
            -
                atexit.register(self.device_fini)
         
     | 
| 
       600 
     | 
    
         
            -
             
     | 
| 
       601 
604 
     | 
    
         
             
              def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
         
     | 
| 
       602 
605 
     | 
    
         
             
                ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
         
     | 
| 
       603 
606 
     | 
    
         
             
                gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
         
     | 
| 
         @@ -627,6 +630,6 @@ class AMDDevice(HCQCompiled): 
     | 
|
| 
       627 
630 
     | 
    
         | 
| 
       628 
631 
     | 
    
         
             
              def on_device_hang(self): self.dev_iface.on_device_hang()
         
     | 
| 
       629 
632 
     | 
    
         | 
| 
       630 
     | 
    
         
            -
              def  
     | 
| 
      
 633 
     | 
    
         
            +
              def finalize(self):
         
     | 
| 
       631 
634 
     | 
    
         
             
                self.synchronize()
         
     | 
| 
       632 
635 
     | 
    
         
             
                if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()
         
     | 
| 
         @@ -1,5 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            import platform, subprocess, sys
         
     | 
| 
       2 
     | 
    
         
            -
            from tinygrad.helpers import capstone_flatdump
         
     | 
| 
      
 2 
     | 
    
         
            +
            from tinygrad.helpers import capstone_flatdump, getenv
         
     | 
| 
       3 
3 
     | 
    
         
             
            from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
         
     | 
| 
       4 
4 
     | 
    
         
             
            from tinygrad.runtime.support.elf import jit_loader
         
     | 
| 
       5 
5 
     | 
    
         
             
            from tinygrad.renderer.cstyle import ClangRenderer
         
     | 
| 
         @@ -13,10 +13,12 @@ class ClangJITCompiler(Compiler): 
     | 
|
| 
       13 
13 
     | 
    
         
             
                target = 'x86_64' if sys.platform == 'win32' else platform.machine()
         
     | 
| 
       14 
14 
     | 
    
         
             
                args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
         
     | 
| 
       15 
15 
     | 
    
         
             
                arch_args = ['-ffixed-x18'] if target == 'arm64' else []
         
     | 
| 
       16 
     | 
    
         
            -
                obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
         
     | 
| 
      
 16 
     | 
    
         
            +
                obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
         
     | 
| 
       17 
17 
     | 
    
         
             
                return jit_loader(obj)
         
     | 
| 
       18 
18 
     | 
    
         | 
| 
       19 
19 
     | 
    
         
             
              def disassemble(self, lib:bytes): return capstone_flatdump(lib)
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
21 
     | 
    
         
             
            class ClangDevice(Compiled):
         
     | 
| 
       22 
22 
     | 
    
         
             
              def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            CPUDevice = ClangDevice
         
     | 
    
        tinygrad/runtime/ops_disk.py
    CHANGED
    
    | 
         @@ -67,7 +67,7 @@ class DiskBuffer: 
     | 
|
| 
       67 
67 
     | 
    
         
             
                self.device, self.size, self.offset = device, size, offset
         
     | 
| 
       68 
68 
     | 
    
         
             
              def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
         
     | 
| 
       69 
69 
     | 
    
         
             
              def _buf(self) -> memoryview:
         
     | 
| 
       70 
     | 
    
         
            -
                assert hasattr(self.device, "mem"), "DiskBuffer wasn't opened"
         
     | 
| 
      
 70 
     | 
    
         
            +
                assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
         
     | 
| 
       71 
71 
     | 
    
         
             
                return memoryview(self.device.mem)[self.offset:self.offset+self.size]
         
     | 
| 
       72 
72 
     | 
    
         | 
| 
       73 
73 
     | 
    
         
             
            MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
         
     | 
    
        tinygrad/runtime/ops_dsp.py
    CHANGED
    
    | 
         @@ -1,6 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from __future__ import annotations
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, time, struct
         
     | 
| 
      
 2 
     | 
    
         
            +
            import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
         
     | 
| 
       4 
3 
     | 
    
         
             
            assert sys.platform != 'win32'
         
     | 
| 
       5 
4 
     | 
    
         
             
            from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
         
     | 
| 
       6 
5 
     | 
    
         
             
            from tinygrad.dtype import dtypes, DType, PtrDType
         
     | 
| 
         @@ -10,25 +9,45 @@ from tinygrad.renderer.cstyle import ClangRenderer 
     | 
|
| 
       10 
9 
     | 
    
         
             
            from tinygrad.runtime.autogen import libc, qcom_dsp
         
     | 
| 
       11 
10 
     | 
    
         
             
            if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
         
     | 
| 
       12 
11 
     | 
    
         | 
| 
      
 12 
     | 
    
         
            +
            from tinygrad.ops import PatternMatcher, UPat
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            dsp_pm = PatternMatcher([
         
     | 
| 
      
 15 
     | 
    
         
            +
              (((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
         
     | 
| 
      
 16 
     | 
    
         
            +
               lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
         
     | 
| 
      
 17 
     | 
    
         
            +
                 arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
         
     | 
| 
      
 18 
     | 
    
         
            +
              (UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
         
     | 
| 
      
 19 
     | 
    
         
            +
                                  "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
         
     | 
| 
      
 20 
     | 
    
         
            +
            ])
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            dsp_pm_late = PatternMatcher([
         
     | 
| 
      
 23 
     | 
    
         
            +
              (UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
         
     | 
| 
      
 24 
     | 
    
         
            +
              (UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
         
     | 
| 
      
 25 
     | 
    
         
            +
              (UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
         
     | 
| 
      
 26 
     | 
    
         
            +
              (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
         
     | 
| 
      
 27 
     | 
    
         
            +
               lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
         
     | 
| 
      
 28 
     | 
    
         
            +
            ])
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
       13 
30 
     | 
    
         
             
            class DSPRenderer(ClangRenderer):
         
     | 
| 
       14 
31 
     | 
    
         
             
              device = "DSP"
         
     | 
| 
       15 
     | 
    
         
            -
              supports_float4 =  
     | 
| 
      
 32 
     | 
    
         
            +
              supports_float4 = True
         
     | 
| 
       16 
33 
     | 
    
         
             
              buffer_suffix = " restrict __attribute__((align_value(128)))"
         
     | 
| 
       17 
34 
     | 
    
         
             
              kernel_prefix = "__attribute__((noinline)) "
         
     | 
| 
      
 35 
     | 
    
         
            +
              pre_matcher = dsp_pm
         
     | 
| 
      
 36 
     | 
    
         
            +
              extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
         
     | 
| 
       18 
37 
     | 
    
         
             
              type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
         
     | 
| 
       19 
38 
     | 
    
         
             
              code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
         
     | 
| 
       20 
39 
     | 
    
         
             
                             Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
         
     | 
| 
       21 
40 
     | 
    
         
             
                             Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
         
     | 
| 
       22 
41 
     | 
    
         | 
| 
       23 
     | 
    
         
            -
              def render_kernel(self, function_name:str, kernel: 
     | 
| 
      
 42 
     | 
    
         
            +
              def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
         
     | 
| 
       24 
43 
     | 
    
         
             
                ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
         
     | 
| 
       25 
     | 
    
         
            -
                msrc = ['''struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency; 
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
      
 44 
     | 
    
         
            +
                msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
         
     | 
| 
      
 45 
     | 
    
         
            +
                  _Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
         
     | 
| 
      
 46 
     | 
    
         
            +
                  'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
         
     | 
| 
      
 47 
     | 
    
         
            +
                  'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
         
     | 
| 
      
 48 
     | 
    
         
            +
                  'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
         
     | 
| 
      
 49 
     | 
    
         
            +
                  'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
         
     | 
| 
      
 50 
     | 
    
         
            +
                  'HAP_power_set((void*)handle, (void*)&req);']
         
     | 
| 
       32 
51 
     | 
    
         
             
                msrc += ['if ((sc>>24) != 2) return 0;']
         
     | 
| 
       33 
52 
     | 
    
         
             
                msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
         
     | 
| 
       34 
53 
     | 
    
         
             
                msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
         
     | 
| 
         @@ -55,7 +74,7 @@ class DSPProgram: 
     | 
|
| 
       55 
74 
     | 
    
         
             
              def __init__(self, dev:DSPDevice, name:str, lib:bytes):
         
     | 
| 
       56 
75 
     | 
    
         
             
                self.dev, self.lib = dev, lib
         
     | 
| 
       57 
76 
     | 
    
         | 
| 
       58 
     | 
    
         
            -
              def __call__(self, *bufs, vals: 
     | 
| 
      
 77 
     | 
    
         
            +
              def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
         
     | 
| 
       59 
78 
     | 
    
         
             
                if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")
         
     | 
| 
       60 
79 
     | 
    
         | 
| 
       61 
80 
     | 
    
         
             
                pra, fds, attrs, _ = rpc_prep_args(ins=[var_vals_mv:=memoryview(bytearray((len(bufs)+len(vals))*4)), off_mv:=memoryview(bytearray(len(bufs)*4))],
         
     | 
| 
         @@ -66,7 +85,7 @@ class DSPProgram: 
     | 
|
| 
       66 
85 
     | 
    
         
             
                return timer[0] / 1e6
         
     | 
| 
       67 
86 
     | 
    
         | 
| 
       68 
87 
     | 
    
         
             
            class DSPBuffer:
         
     | 
| 
       69 
     | 
    
         
            -
              def __init__(self, va_addr:int, size:int, share_info 
     | 
| 
      
 88 
     | 
    
         
            +
              def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
         
     | 
| 
       70 
89 
     | 
    
         
             
                self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
         
     | 
| 
       71 
90 
     | 
    
         | 
| 
       72 
91 
     | 
    
         
             
            class DSPAllocator(Allocator):
         
     | 
| 
         @@ -81,9 +100,10 @@ class DSPAllocator(Allocator): 
     | 
|
| 
       81 
100 
     | 
    
         
             
                return DSPBuffer(va_addr, size, share_info, offset=0)
         
     | 
| 
       82 
101 
     | 
    
         | 
| 
       83 
102 
     | 
    
         
             
              def _free(self, opaque:DSPBuffer, options:BufferSpec):
         
     | 
| 
       84 
     | 
    
         
            -
                libc 
     | 
| 
       85 
     | 
    
         
            -
             
     | 
| 
       86 
     | 
    
         
            -
             
     | 
| 
      
 103 
     | 
    
         
            +
                if libc is not None and qcom_dsp is not None:
         
     | 
| 
      
 104 
     | 
    
         
            +
                  libc.munmap(opaque.va_addr, opaque.size)
         
     | 
| 
      
 105 
     | 
    
         
            +
                  os.close(opaque.share_info.fd)
         
     | 
| 
      
 106 
     | 
    
         
            +
                  qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
         
     | 
| 
       87 
107 
     | 
    
         | 
| 
       88 
108 
     | 
    
         
             
              def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
         
     | 
| 
       89 
109 
     | 
    
         
             
              def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
         
     | 
| 
         @@ -99,7 +119,7 @@ class ClangCompiler(Compiler): 
     | 
|
| 
       99 
119 
     | 
    
         
             
              def compile(self, src:str) -> bytes:
         
     | 
| 
       100 
120 
     | 
    
         
             
                # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
         
     | 
| 
       101 
121 
     | 
    
         
             
                with tempfile.NamedTemporaryFile(delete=True) as output_file:
         
     | 
| 
       102 
     | 
    
         
            -
                  subprocess.check_output(['clang', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
         
     | 
| 
      
 122 
     | 
    
         
            +
                  subprocess.check_output([getenv("CC", 'clang'), *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
         
     | 
| 
       103 
123 
     | 
    
         
             
                                           '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
         
     | 
| 
       104 
124 
     | 
    
         
             
                  return pathlib.Path(output_file.name).read_bytes()
         
     | 
| 
       105 
125 
     | 
    
         | 
| 
         @@ -228,25 +248,32 @@ class RPCListener(threading.Thread): 
     | 
|
| 
       228 
248 
     | 
    
         | 
| 
       229 
249 
     | 
    
         
             
            # ***** mock DSP *****
         
     | 
| 
       230 
250 
     | 
    
         | 
| 
      
 251 
     | 
    
         
            +
            mockdsp_boilerplate = '''/* DSP boilerplate */ static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
         
     | 
| 
      
 252 
     | 
    
         
            +
            long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = %7; trap0(#1); %0 = r0" : "=r" (retval)
         
     | 
| 
      
 253 
     | 
    
         
            +
              : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "r" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
         
     | 
| 
      
 254 
     | 
    
         
            +
            static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
         
     | 
| 
      
 255 
     | 
    
         
            +
            static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
         
     | 
| 
      
 256 
     | 
    
         
            +
            static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
         
     | 
| 
      
 257 
     | 
    
         
            +
            static unsigned int inscount(void) {{ unsigned int ret; __asm__ volatile(".word 0x6a15c000; %0 = R0" : "=r" (ret) : : "r0"); return ret; }}
         
     | 
| 
      
 258 
     | 
    
         
            +
            static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
         
     | 
| 
      
 259 
     | 
    
         
            +
            return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
         
     | 
| 
      
 260 
     | 
    
         
            +
             
     | 
| 
       231 
261 
     | 
    
         
             
            class MockDSPRenderer(DSPRenderer):
         
     | 
| 
       232 
     | 
    
         
            -
              def render_kernel(self, function_name:str, kernel: 
     | 
| 
      
 262 
     | 
    
         
            +
              def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
         
     | 
| 
       233 
263 
     | 
    
         
             
                ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
         
     | 
| 
       234 
264 
     | 
    
         
             
                # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
         
     | 
| 
       235 
     | 
    
         
            -
                 
     | 
| 
       236 
     | 
    
         
            -
             
     | 
| 
       237 
     | 
    
         
            -
                      : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "i" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
         
     | 
| 
       238 
     | 
    
         
            -
                  static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
         
     | 
| 
       239 
     | 
    
         
            -
                  static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
         
     | 
| 
       240 
     | 
    
         
            -
                  static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
         
     | 
| 
       241 
     | 
    
         
            -
                  static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
         
     | 
| 
       242 
     | 
    
         
            -
                    return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}''', 'void _start(void) {']
         
     | 
| 
      
 265 
     | 
    
         
            +
                # control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
         
     | 
| 
      
 266 
     | 
    
         
            +
                msrc = [mockdsp_boilerplate, 'void _start(void) {']
         
     | 
| 
       243 
267 
     | 
    
         
             
                for i,b in enumerate(bufs):
         
     | 
| 
       244 
268 
     | 
    
         
             
                  if isinstance(b[1][0], PtrDType):
         
     | 
| 
       245 
269 
     | 
    
         
             
                    sz = b[1][0].size*b[1][0].itemsize
         
     | 
| 
       246 
     | 
    
         
            -
                     
     | 
| 
      
 270 
     | 
    
         
            +
                    # for loop for big reads
         
     | 
| 
      
 271 
     | 
    
         
            +
                    msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); for(int rd = 0; rd < {sz}; rd += read(0, buf{i}+rd, {sz}-rd));")
         
     | 
| 
       247 
272 
     | 
    
         
             
                  else:
         
     | 
| 
       248 
273 
     | 
    
         
             
                    msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
         
     | 
| 
      
 274 
     | 
    
         
            +
                msrc.append("unsigned int st = inscount();")
         
     | 
| 
       249 
275 
     | 
    
         
             
                msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
         
     | 
| 
      
 276 
     | 
    
         
            +
                msrc.append("unsigned int et = inscount() - st; write(1, &et, sizeof(et));")
         
     | 
| 
       250 
277 
     | 
    
         
             
                for i,b in enumerate(bufs):
         
     | 
| 
       251 
278 
     | 
    
         
             
                  if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
         
     | 
| 
       252 
279 
     | 
    
         
             
                msrc.append('exit(0); }')
         
     | 
| 
         @@ -254,19 +281,18 @@ class MockDSPRenderer(DSPRenderer): 
     | 
|
| 
       254 
281 
     | 
    
         | 
| 
       255 
282 
     | 
    
         
             
            class MockDSPProgram:
         
     | 
| 
       256 
283 
     | 
    
         
             
              def __init__(self, name:str, lib:bytes): self.lib = lib
         
     | 
| 
       257 
     | 
    
         
            -
              def __call__(self, *bufs, vals: 
     | 
| 
      
 284 
     | 
    
         
            +
              def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
         
     | 
| 
       258 
285 
     | 
    
         
             
                with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
         
     | 
| 
       259 
286 
     | 
    
         
             
                  dsp_lib.write(self.lib)
         
     | 
| 
       260 
287 
     | 
    
         
             
                  dsp_lib.flush()
         
     | 
| 
       261 
288 
     | 
    
         
             
                  os.chmod(dsp_lib.name, 0o0777)
         
     | 
| 
       262 
289 
     | 
    
         
             
                  # NOTE: this timing includes a docker launch
         
     | 
| 
       263 
     | 
    
         
            -
                  start = time.perf_counter()
         
     | 
| 
       264 
290 
     | 
    
         
             
                  proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
         
     | 
| 
       265 
     | 
    
         
            -
                                        "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >=  
     | 
| 
      
 291 
     | 
    
         
            +
                                        "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
         
     | 
| 
       266 
292 
     | 
    
         
             
                                        input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
         
     | 
| 
       267 
     | 
    
         
            -
             
     | 
| 
       268 
     | 
    
         
            -
                offset = 0
         
     | 
| 
      
 293 
     | 
    
         
            +
                offset = 4
         
     | 
| 
       269 
294 
     | 
    
         
             
                for x in bufs:
         
     | 
| 
       270 
295 
     | 
    
         
             
                  x[:] = proc.stdout[offset:offset+len(x)]
         
     | 
| 
       271 
296 
     | 
    
         
             
                  offset += len(x)
         
     | 
| 
       272 
     | 
    
         
            -
                 
     | 
| 
      
 297 
     | 
    
         
            +
                assert offset == len(proc.stdout)
         
     | 
| 
      
 298 
     | 
    
         
            +
                return struct.unpack("I", proc.stdout[0:4])[0] / 1e9  # pretend it's 1 Ghz, but this is an inscount, not a time
         
     | 
    
        tinygrad/runtime/ops_llvm.py
    CHANGED
    
    | 
         @@ -1,6 +1,6 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import ctypes, platform 
     | 
| 
      
 1 
     | 
    
         
            +
            import ctypes, platform
         
     | 
| 
       2 
2 
     | 
    
         
             
            from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
         
     | 
| 
       3 
     | 
    
         
            -
            from tinygrad.helpers import OSX, getenv, capstone_flatdump
         
     | 
| 
      
 3 
     | 
    
         
            +
            from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
         
     | 
| 
       4 
4 
     | 
    
         
             
            from tinygrad.renderer.llvmir import LLVMRenderer
         
     | 
| 
       5 
5 
     | 
    
         
             
            import tinygrad.runtime.autogen.llvm as llvm
         
     | 
| 
       6 
6 
     | 
    
         
             
            from tinygrad.runtime.support.elf import jit_loader
         
     | 
| 
         @@ -12,17 +12,19 @@ def expect(x, err, ret=None): 
     | 
|
| 
       12 
12 
     | 
    
         
             
              return ret
         
     | 
| 
       13 
13 
     | 
    
         | 
| 
       14 
14 
     | 
    
         
             
            class LLVMCompiler(Compiler):
         
     | 
| 
       15 
     | 
    
         
            -
              def __init__(self, host_arch:str 
     | 
| 
       16 
     | 
    
         
            -
                for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
         
     | 
| 
      
 15 
     | 
    
         
            +
              def __init__(self, host_arch:str):
         
     | 
| 
      
 16 
     | 
    
         
            +
                for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
       18 
18 
     | 
    
         
             
                triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
         
     | 
| 
       19 
19 
     | 
    
         
             
                target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
         
     | 
| 
       20 
     | 
    
         
            -
                # +reserve-x18 here does the same thing as -ffixed-x18 in  
     | 
| 
       21 
     | 
    
         
            -
                 
     | 
| 
      
 20 
     | 
    
         
            +
                # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
         
     | 
| 
      
 21 
     | 
    
         
            +
                cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
         
     | 
| 
      
 22 
     | 
    
         
            +
                if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
         
     | 
| 
      
 23 
     | 
    
         
            +
                self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
         
     | 
| 
       22 
24 
     | 
    
         
             
                                                                   llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)
         
     | 
| 
       23 
25 
     | 
    
         | 
| 
       24 
26 
     | 
    
         
             
                self.pbo = llvm.LLVMCreatePassBuilderOptions()
         
     | 
| 
       25 
     | 
    
         
            -
                if opt:
         
     | 
| 
      
 27 
     | 
    
         
            +
                if (opt:=bool(getenv("LLVMOPT", "1"))):
         
     | 
| 
       26 
28 
     | 
    
         
             
                  self.passes = b'default<O2>'
         
     | 
| 
       27 
29 
     | 
    
         
             
                  llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
         
     | 
| 
       28 
30 
     | 
    
         
             
                  llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
         
     | 
| 
         @@ -33,18 +35,18 @@ class LLVMCompiler(Compiler): 
     | 
|
| 
       33 
35 
     | 
    
         | 
| 
       34 
36 
     | 
    
         
             
                super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
         
     | 
| 
       35 
37 
     | 
    
         | 
| 
       36 
     | 
    
         
            -
              def __del__(self):
         
     | 
| 
       37 
     | 
    
         
            -
                llvm.LLVMDisposePassBuilderOptions(self.pbo)
         
     | 
| 
      
 38 
     | 
    
         
            +
              def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
         
     | 
| 
       38 
39 
     | 
    
         | 
| 
       39 
40 
     | 
    
         
             
              def compile(self, src:str) -> bytes:
         
     | 
| 
       40 
41 
     | 
    
         
             
                src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
         
     | 
| 
       41 
42 
     | 
    
         
             
                mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
         
     | 
| 
       42 
43 
     | 
    
         
             
                expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
         
     | 
| 
       43 
44 
     | 
    
         
             
                expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
         
     | 
| 
      
 45 
     | 
    
         
            +
                if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
         
     | 
| 
       44 
46 
     | 
    
         
             
                obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
         
     | 
| 
       45 
47 
     | 
    
         
             
                                                                          ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
         
     | 
| 
       46 
     | 
    
         
            -
                obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
         
     | 
| 
       47 
48 
     | 
    
         
             
                llvm.LLVMDisposeModule(mod)
         
     | 
| 
      
 49 
     | 
    
         
            +
                obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
         
     | 
| 
       48 
50 
     | 
    
         
             
                llvm.LLVMDisposeMemoryBuffer(obj_buf)
         
     | 
| 
       49 
51 
     | 
    
         
             
                return jit_loader(obj)
         
     | 
| 
       50 
52 
     | 
    
         | 
| 
         @@ -52,5 +54,5 @@ class LLVMCompiler(Compiler): 
     | 
|
| 
       52 
54 
     | 
    
         | 
| 
       53 
55 
     | 
    
         
             
            class LLVMDevice(Compiled):
         
     | 
| 
       54 
56 
     | 
    
         
             
              def __init__(self, device:str):
         
     | 
| 
       55 
     | 
    
         
            -
                compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()] 
     | 
| 
       56 
     | 
    
         
            -
                super().__init__(device, MallocAllocator, LLVMRenderer( 
     | 
| 
      
 57 
     | 
    
         
            +
                compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
         
     | 
| 
      
 58 
     | 
    
         
            +
                super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)
         
     |