tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. tinygrad/__init__.py +1 -1
  2. tinygrad/apps/llm.py +206 -0
  3. tinygrad/codegen/__init__.py +116 -0
  4. tinygrad/codegen/devectorizer.py +315 -172
  5. tinygrad/codegen/expander.py +8 -16
  6. tinygrad/codegen/gpudims.py +89 -0
  7. tinygrad/codegen/linearize.py +205 -203
  8. tinygrad/codegen/lowerer.py +92 -139
  9. tinygrad/codegen/opt/__init__.py +38 -0
  10. tinygrad/codegen/opt/heuristic.py +125 -0
  11. tinygrad/codegen/opt/kernel.py +510 -0
  12. tinygrad/{engine → codegen/opt}/search.py +51 -35
  13. tinygrad/codegen/opt/swizzler.py +134 -0
  14. tinygrad/codegen/opt/tc.py +127 -0
  15. tinygrad/codegen/quantize.py +67 -0
  16. tinygrad/device.py +122 -132
  17. tinygrad/dtype.py +152 -35
  18. tinygrad/engine/jit.py +81 -54
  19. tinygrad/engine/memory.py +46 -27
  20. tinygrad/engine/realize.py +82 -41
  21. tinygrad/engine/schedule.py +70 -445
  22. tinygrad/frontend/__init__.py +0 -0
  23. tinygrad/frontend/onnx.py +1253 -0
  24. tinygrad/frontend/torch.py +5 -0
  25. tinygrad/gradient.py +19 -27
  26. tinygrad/helpers.py +95 -47
  27. tinygrad/nn/__init__.py +7 -8
  28. tinygrad/nn/optim.py +72 -41
  29. tinygrad/nn/state.py +37 -23
  30. tinygrad/renderer/__init__.py +40 -60
  31. tinygrad/renderer/cstyle.py +143 -128
  32. tinygrad/renderer/llvmir.py +113 -62
  33. tinygrad/renderer/ptx.py +50 -32
  34. tinygrad/renderer/wgsl.py +27 -23
  35. tinygrad/runtime/autogen/am/am.py +5861 -0
  36. tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  37. tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  38. tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  39. tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  40. tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  41. tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  42. tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  43. tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  44. tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
  45. tinygrad/runtime/autogen/comgr.py +35 -9
  46. tinygrad/runtime/autogen/comgr_3.py +906 -0
  47. tinygrad/runtime/autogen/cuda.py +2419 -494
  48. tinygrad/runtime/autogen/hsa.py +57 -16
  49. tinygrad/runtime/autogen/ib.py +7171 -0
  50. tinygrad/runtime/autogen/io_uring.py +917 -118
  51. tinygrad/runtime/autogen/kfd.py +748 -26
  52. tinygrad/runtime/autogen/libc.py +613 -218
  53. tinygrad/runtime/autogen/libusb.py +1643 -0
  54. tinygrad/runtime/autogen/nv/nv.py +8602 -0
  55. tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  56. tinygrad/runtime/autogen/opencl.py +2 -4
  57. tinygrad/runtime/autogen/sqtt.py +1789 -0
  58. tinygrad/runtime/autogen/vfio.py +3 -3
  59. tinygrad/runtime/autogen/webgpu.py +273 -264
  60. tinygrad/runtime/graph/cuda.py +3 -3
  61. tinygrad/runtime/graph/hcq.py +68 -29
  62. tinygrad/runtime/graph/metal.py +29 -13
  63. tinygrad/runtime/graph/remote.py +114 -0
  64. tinygrad/runtime/ops_amd.py +537 -320
  65. tinygrad/runtime/ops_cpu.py +108 -7
  66. tinygrad/runtime/ops_cuda.py +12 -14
  67. tinygrad/runtime/ops_disk.py +13 -10
  68. tinygrad/runtime/ops_dsp.py +47 -40
  69. tinygrad/runtime/ops_gpu.py +13 -11
  70. tinygrad/runtime/ops_hip.py +6 -9
  71. tinygrad/runtime/ops_llvm.py +35 -15
  72. tinygrad/runtime/ops_metal.py +29 -19
  73. tinygrad/runtime/ops_npy.py +5 -3
  74. tinygrad/runtime/ops_null.py +28 -0
  75. tinygrad/runtime/ops_nv.py +306 -234
  76. tinygrad/runtime/ops_python.py +62 -52
  77. tinygrad/runtime/ops_qcom.py +28 -39
  78. tinygrad/runtime/ops_remote.py +482 -0
  79. tinygrad/runtime/ops_webgpu.py +28 -28
  80. tinygrad/runtime/support/am/amdev.py +114 -249
  81. tinygrad/runtime/support/am/ip.py +211 -172
  82. tinygrad/runtime/support/amd.py +138 -0
  83. tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
  84. tinygrad/runtime/support/compiler_cuda.py +8 -11
  85. tinygrad/runtime/support/elf.py +2 -1
  86. tinygrad/runtime/support/hcq.py +184 -97
  87. tinygrad/runtime/support/ib.py +172 -0
  88. tinygrad/runtime/support/llvm.py +3 -4
  89. tinygrad/runtime/support/memory.py +251 -0
  90. tinygrad/runtime/support/nv/__init__.py +0 -0
  91. tinygrad/runtime/support/nv/ip.py +581 -0
  92. tinygrad/runtime/support/nv/nvdev.py +183 -0
  93. tinygrad/runtime/support/system.py +170 -0
  94. tinygrad/runtime/support/usb.py +268 -0
  95. tinygrad/runtime/support/webgpu.py +18 -0
  96. tinygrad/schedule/__init__.py +0 -0
  97. tinygrad/schedule/grouper.py +119 -0
  98. tinygrad/schedule/kernelize.py +368 -0
  99. tinygrad/schedule/multi.py +231 -0
  100. tinygrad/shape/shapetracker.py +40 -46
  101. tinygrad/shape/view.py +88 -52
  102. tinygrad/tensor.py +968 -542
  103. tinygrad/uop/__init__.py +117 -0
  104. tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
  105. tinygrad/uop/mathtraits.py +169 -0
  106. tinygrad/uop/ops.py +1021 -0
  107. tinygrad/uop/spec.py +228 -0
  108. tinygrad/{codegen → uop}/symbolic.py +239 -216
  109. tinygrad/uop/upat.py +163 -0
  110. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  111. tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  112. tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  113. tinygrad/viz/index.html +203 -403
  114. tinygrad/viz/js/index.js +718 -0
  115. tinygrad/viz/js/worker.js +29 -0
  116. tinygrad/viz/serve.py +224 -102
  117. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
  118. tinygrad-0.11.0.dist-info/RECORD +141 -0
  119. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
  120. tinygrad/codegen/kernel.py +0 -693
  121. tinygrad/engine/multi.py +0 -161
  122. tinygrad/ops.py +0 -1003
  123. tinygrad/runtime/ops_cloud.py +0 -220
  124. tinygrad/runtime/support/allocator.py +0 -94
  125. tinygrad/spec.py +0 -155
  126. tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  127. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  128. tinygrad/viz/perfetto.html +0 -178
  129. tinygrad-0.10.2.dist-info/RECORD +0 -99
  130. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
  131. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/support/nv/nvdev.py (new file)
@@ -0,0 +1,183 @@
+ from __future__ import annotations
+ import ctypes, time, functools, re, gzip, struct
+ from tinygrad.helpers import getenv, DEBUG, fetch, getbits, to_mv
+ from tinygrad.runtime.support.hcq import MMIOInterface
+ from tinygrad.runtime.support.memory import TLSFAllocator, MemoryManager
+ from tinygrad.runtime.support.nv.ip import NV_FLCN, NV_FLCN_COT, NV_GSP
+ from tinygrad.runtime.support.system import System, PCIDevImplBase
+
+ NV_DEBUG = getenv("NV_DEBUG", 0)
+
+ class NVReg:
+   def __init__(self, nvdev, base, off, fields=None): self.nvdev, self.base, self.off, self.fields = nvdev, base, off, fields
+
+   def __getitem__(self, idx:int): return NVReg(self.nvdev, self.base, self.off(idx), fields=self.fields)
+
+   def add_field(self, name:str, start:int, end:int): self.fields[name] = (start, end)
+   def with_base(self, base:int): return NVReg(self.nvdev, base + self.base, self.off, self.fields)
+
+   def read(self): return self.nvdev.rreg(self.base + self.off)
+   def read_bitfields(self) -> dict[str, int]: return self.decode(self.read())
+
+   def write(self, _ini_val:int=0, **kwargs): self.nvdev.wreg(self.base + self.off, _ini_val | self.encode(**kwargs))
+
+   def update(self, **kwargs): self.write(self.read() & ~self.mask(*kwargs.keys()), **kwargs)
+
+   def mask(self, *names):
+     return functools.reduce(int.__or__, ((((1 << (self.fields[nm][1]-self.fields[nm][0] + 1)) - 1) << self.fields[nm][0]) for nm in names), 0)
+
+   def encode(self, **kwargs) -> int: return functools.reduce(int.__or__, (value << self.fields[name][0] for name,value in kwargs.items()), 0)
+   def decode(self, val: int) -> dict: return {name:getbits(val, start, end) for name,(start,end) in self.fields.items()}
+
+ class NVPageTableEntry:
+   def __init__(self, nvdev, paddr, lv): self.nvdev, self.paddr, self.lv, self.entries = nvdev, paddr, lv, nvdev.vram.view(paddr, 0x1000, fmt='Q')
+
+   def _is_dual_pde(self) -> bool: return self.lv == self.nvdev.mm.level_cnt - 2
+
+   def set_entry(self, entry_id:int, paddr:int, table=False, uncached=False, system=False, snooped=False, frag=0, valid=True):
+     if not table:
+       x = self.nvdev.pte_t.encode(valid=valid, address_sys=paddr >> 12, aperture=2 if system else 0, kind=6,
+         **({'pcf': int(uncached)} if self.nvdev.mmu_ver == 3 else {'vol': uncached}))
+     else:
+       pde = self.nvdev.dual_pde_t if self._is_dual_pde() else self.nvdev.pde_t
+       small, sys = ("_small" if self._is_dual_pde() else ""), "" if self.nvdev.mmu_ver == 3 else "_sys"
+       x = pde.encode(is_pte=False, **{f'aperture{small}': 1 if valid else 0, f'address{small}{sys}': paddr >> 12},
+         **({f'pcf{small}': 0b10} if self.nvdev.mmu_ver == 3 else {'no_ats': 1}))
+
+     if self._is_dual_pde(): self.entries[2*entry_id], self.entries[2*entry_id+1] = x & 0xffffffffffffffff, x >> 64
+     else: self.entries[entry_id] = x
+
+   def entry(self, entry_id:int) -> int:
+     return (self.entries[2*entry_id+1]<<64) | self.entries[2*entry_id] if self._is_dual_pde() else self.entries[entry_id]
+
+   def read_fields(self, entry_id:int) -> dict:
+     if self.is_huge_page(entry_id): return self.nvdev.pte_t.decode(self.entry(entry_id))
+     return (self.nvdev.dual_pde_t if self._is_dual_pde() else self.nvdev.pde_t).decode(self.entry(entry_id))
+
+   def is_huge_page(self, entry_id) -> bool: return (self.entry(entry_id) & 1 == 1) if self.lv < self.nvdev.mm.level_cnt - 1 else True
+   def supports_huge_page(self, paddr:int): return self.lv >= self.nvdev.mm.level_cnt - 3 and paddr % self.nvdev.mm.pte_covers[self.lv] == 0
+
+   def valid(self, entry_id):
+     if self.is_huge_page(entry_id): return self.read_fields(entry_id)['valid']
+     return self.read_fields(entry_id)['aperture_small' if self._is_dual_pde() else 'aperture'] != 0
+
+   def address(self, entry_id:int) -> int:
+     small, sys = ("_small" if self._is_dual_pde() else ""), "_sys" if self.nvdev.mmu_ver == 2 or self.lv == self.nvdev.mm.level_cnt - 1 else ""
+     return self.read_fields(entry_id)[f'address{small}{sys}'] << 12
+
+ class NVMemoryManager(MemoryManager):
+   va_allocator = TLSFAllocator((1 << 44), base=0x1000000000) # global for all devices.
+
+   def on_range_mapped(self): self.dev.NV_VIRTUAL_FUNCTION_PRIV_MMU_INVALIDATE.write((1 << 0) | (1 << 1) | (1 << 6) | (1 << 31))
+
+ class NVDev(PCIDevImplBase):
+   def __init__(self, devfmt:str, mmio:MMIOInterface, vram:MMIOInterface, venid:int, subvenid:int, rev:int, bars:dict):
+     self.devfmt, self.mmio, self.vram, self.venid, self.subvenid, self.rev, self.bars = devfmt, mmio, vram, venid, subvenid, rev, bars
+     self.lock_fd = System.flock_acquire(f"nv_{self.devfmt}.lock")
+
+     self.smi_dev, self.is_booting = False, True
+     self._early_init()
+
+     # UVM depth   HW level                            VA bits
+     # 0           PDE4                                56:56 (hopper+)
+     # 1           PDE3                                55:47
+     # 2           PDE2                                46:38
+     # 3           PDE1 (or 512M PTE)                  37:29
+     # 4           PDE0 (dual 64k/4k PDE, or 2M PTE)   28:21
+     # 5           PTE_64K / PTE_4K                    20:16 / 20:12
+     bits, shifts = (56, [12, 21, 29, 38, 47, 56]) if self.mmu_ver == 3 else (48, [12, 21, 29, 38, 47])
+     self.mm = NVMemoryManager(self, self.vram_size, boot_size=(2 << 20), pt_t=NVPageTableEntry, va_bits=bits, va_shifts=shifts, va_base=0,
+       palloc_ranges=[(x, x) for x in [512 << 20, 2 << 20, 4 << 10]])
+     self.flcn:NV_FLCN|NV_FLCN_COT = NV_FLCN_COT(self) if self.fmc_boot else NV_FLCN(self)
+     self.gsp:NV_GSP = NV_GSP(self)
+
+     # Turn off booting early; the gsp client is loaded from a clean state.
+     self.is_booting = False
+
+     for ip in [self.flcn, self.gsp]: ip.init_sw()
+     for ip in [self.flcn, self.gsp]: ip.init_hw()
+
+   def fini(self):
+     for ip in [self.gsp, self.flcn]: ip.fini_hw()
+
+   def reg(self, reg:str) -> NVReg: return self.__dict__[reg]
+   def wreg(self, addr:int, value:int):
+     self.mmio[addr // 4] = value
+     if NV_DEBUG >= 4: print(f"wreg: {hex(addr)} = {hex(value)}")
+   def rreg(self, addr:int) -> int: return self.mmio[addr // 4]
+
+   def _early_init(self):
+     self.reg_names:set[str] = set()
+     self.reg_offsets:dict[str, tuple[int, int]] = {}
+
+     self.include("src/common/inc/swref/published/nv_ref.h")
+     self.chip_id = self.reg("NV_PMC_BOOT_0").read()
+     self.chip_details = self.reg("NV_PMC_BOOT_42").read_bitfields()
+     self.chip_name = {0x17: "GA1", 0x19: "AD1", 0x1b: "GB2"}[self.chip_details['architecture']] + f"{self.chip_details['implementation']:02d}"
+     self.mmu_ver, self.fmc_boot = (3, True) if self.chip_details['architecture'] >= 0x1a else (2, False)
+
+     self.include("src/common/inc/swref/published/turing/tu102/dev_fb.h")
+     if self.reg("NV_PFB_PRI_MMU_WPR2_ADDR_HI").read() != 0:
+       if DEBUG >= 2: print(f"nv {self.devfmt}: WPR2 is up. Issuing a full reset.")
+       System.pci_reset(self.devfmt)
+       time.sleep(0.5)
+
+     self.include("src/common/inc/swref/published/turing/tu102/dev_vm.h")
+     self.include("src/common/inc/swref/published/ampere/ga102/dev_gc6_island.h")
+     self.include("src/common/inc/swref/published/ampere/ga102/dev_gc6_island_addendum.h")
+
+     # MMU Init
+     self.reg_names.update(mmu_pd_names:=[f'NV_MMU_VER{self.mmu_ver}_PTE', f'NV_MMU_VER{self.mmu_ver}_PDE', f'NV_MMU_VER{self.mmu_ver}_DUAL_PDE'])
+     for name in mmu_pd_names: self.__dict__[name] = NVReg(self, None, None, fields={})
+     self.include(f"kernel-open/nvidia-uvm/hwref/{'hopper/gh100' if self.mmu_ver == 3 else 'turing/tu102'}/dev_mmu.h")
+     self.pte_t, self.pde_t, self.dual_pde_t = tuple([self.__dict__[name] for name in mmu_pd_names])
+
+     self.vram_size = self.reg("NV_PGC6_AON_SECURE_SCRATCH_GROUP_42").read() << 20
+
+   def _alloc_boot_struct(self, struct:ctypes.Structure) -> tuple[ctypes.Structure, int]:
+     va, paddrs = System.alloc_sysmem(sz:=ctypes.sizeof(type(struct)), contiguous=True)
+     to_mv(va, sz)[:] = bytes(struct)
+     return type(struct).from_address(va), paddrs[0]
+
+   def _download(self, file:str) -> str:
+     url = f"https://raw.githubusercontent.com/NVIDIA/open-gpu-kernel-modules/8ec351aeb96a93a4bb69ccc12a542bf8a8df2b6f/{file}"
+     return fetch(url, subdir="defines").read_text()
+
+   def extract_fw(self, file:str, dname:str) -> bytes:
+     # Extracts the firmware binary from the given header
+     tname = file.replace("kgsp", "kgspGet")
+     text = self._download(f"src/nvidia/generated/g_bindata_{tname}_{self.chip_name}.c")
+     info, sl = text[text[:text.index(dnm:=f'{file}_{self.chip_name}_{dname}')].rindex("COMPRESSION:"):][:16], text[text.index(dnm) + len(dnm) + 7:]
+     image = bytes.fromhex(sl[:sl.find("};")].strip().replace("0x", "").replace(",", "").replace(" ", "").replace("\n", ""))
+     return gzip.decompress(struct.pack("<4BL2B", 0x1f, 0x8b, 8, 0, 0, 0, 3) + image) if "COMPRESSION: YES" in info else image
+
+   def include(self, file:str):
+     regs_off = {'NV_PFALCON_FALCON': 0x0, 'NV_PGSP_FALCON': 0x0, 'NV_PSEC_FALCON': 0x0, 'NV_PRISCV_RISCV': 0x1000, 'NV_PGC6_AON': 0x0, 'NV_PFSP': 0x0,
+       'NV_PGC6_BSI': 0x0, 'NV_PFALCON_FBIF': 0x600, 'NV_PFALCON2_FALCON': 0x1000, 'NV_PBUS': 0x0, 'NV_PFB': 0x0, 'NV_PMC': 0x0, 'NV_PGSP_QUEUE': 0x0,
+       'NV_VIRTUAL_FUNCTION':0xb80000}
+
+     for raw in self._download(file).splitlines():
+       if not raw.startswith("#define "): continue
+
+       if m:=re.match(r'#define\s+(\w+)\s+([0-9\+\-\*\(\)]+):([0-9\+\-\*\(\)]+)', raw): # bitfields
+         name, hi, lo = m.groups()
+
+         reg = next((r for r in self.reg_names if name.startswith(r+"_")), None)
+         if reg is not None: self.__dict__[reg].add_field(name[len(reg)+1:].lower(), eval(lo), eval(hi))
+         else: self.reg_offsets[name] = (eval(lo), eval(hi))
+         continue
+
+       if m:=re.match(r'#define\s+(\w+)\s*\(\s*(\w+)\s*\)\s*(.+)', raw): # reg set
+         fn = m.groups()[2].strip().rstrip('\\').split('/*')[0].rstrip()
+         name, value = m.groups()[0], eval(f"lambda {m.groups()[1]}: {fn}")
+       elif m:=re.match(r'#define\s+(\w+)\s+([0-9A-Fa-fx]+)(?![^\n]*:)', raw): name, value = m.groups()[0], int(m.groups()[1], 0) # reg value
+       else: continue
+
+       reg_pref = next((prefix for prefix in regs_off.keys() if name.startswith(prefix)), None)
+       not_already_reg = not any(name.startswith(r+"_") for r in self.reg_names)
+
+       if reg_pref is not None and not_already_reg:
+         fields = {k[len(name)+1:]: v for k, v in self.reg_offsets.items() if k.startswith(name+'_')}
+         self.__dict__[name] = NVReg(self, regs_off[reg_pref], value, fields=fields)
+         self.reg_names.add(name)
+       else: self.__dict__[name] = value
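Editor's note: for illustration, here is a minimal standalone sketch of the bitfield encode/decode scheme NVReg uses above; the field names and bit ranges below are invented for the example and are not taken from the NVIDIA headers.

    import functools

    fields = {"valid": (0, 0), "aperture": (1, 2), "address_sys": (8, 59)}  # name -> (start_bit, end_bit)

    def encode(**kwargs) -> int:
      # OR each value shifted to its field's start bit, the same reduction NVReg.encode performs
      return functools.reduce(int.__or__, (v << fields[k][0] for k, v in kwargs.items()), 0)

    def decode(val: int) -> dict:
      # pull each field back out with a shift and mask (NVReg.decode delegates this to helpers.getbits)
      return {k: (val >> s) & ((1 << (e - s + 1)) - 1) for k, (s, e) in fields.items()}

    pte = encode(valid=1, aperture=2, address_sys=0x1234)
    assert decode(pte) == {"valid": 1, "aperture": 2, "address_sys": 0x1234}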
tinygrad/runtime/support/system.py (new file)
@@ -0,0 +1,170 @@
+ import os, mmap, array, functools, ctypes, select, contextlib, dataclasses, sys
+ from typing import cast, ClassVar
+ from tinygrad.helpers import round_up, to_mv, getenv, OSX, temp
+ from tinygrad.runtime.autogen import libc, vfio
+ from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface, HCQBuffer
+ from tinygrad.runtime.support.memory import MemoryManager, VirtMapping
+
+ MAP_FIXED, MAP_LOCKED, MAP_POPULATE, MAP_NORESERVE = 0x10, 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000), 0x400
+
+ class _System:
+   def reserve_hugepages(self, cnt): os.system(f"sudo sh -c 'echo {cnt} > /proc/sys/vm/nr_hugepages'")
+
+   def memory_barrier(self): lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5) if (lib:=self.atomic_lib()) is not None else None
+
+   def lock_memory(self, addr:int, size:int):
+     if libc.mlock(ctypes.c_void_p(addr), size): raise RuntimeError(f"Failed to lock memory at {addr:#x} with size {size:#x}")
+
+   def system_paddrs(self, vaddr:int, size:int) -> list[int]:
+     self.pagemap().seek(vaddr // mmap.PAGESIZE * 8)
+     return [(x & ((1<<55) - 1)) * mmap.PAGESIZE for x in array.array('Q', self.pagemap().read(size//mmap.PAGESIZE*8, binary=True))]
+
+   def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False, data:bytes|None=None) -> tuple[int, list[int]]:
+     assert not contiguous or size <= (2 << 20), "Contiguous allocation is only supported for sizes up to 2MB"
+     flags = (libc.MAP_HUGETLB if contiguous and (size:=round_up(size, mmap.PAGESIZE)) > 0x1000 else 0) | (MAP_FIXED if vaddr else 0)
+     va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS|MAP_POPULATE|MAP_LOCKED|flags, 0)
+
+     if data is not None: to_mv(va, len(data))[:] = data
+     return va, self.system_paddrs(va, size)
+
+   def pci_reset(self, gpu): os.system(f"sudo sh -c 'echo 1 > /sys/bus/pci/devices/{gpu}/reset'")
+   def pci_scan_bus(self, target_vendor:int, target_devices:list[int]) -> list[str]:
+     result = []
+     for pcibus in FileIOInterface("/sys/bus/pci/devices").listdir():
+       vendor = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
+       device = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
+       if vendor == target_vendor and device in target_devices: result.append(pcibus)
+     return sorted(result)
+
+   @functools.cache
+   def atomic_lib(self): return ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None
+
+   @functools.cache
+   def pagemap(self) -> FileIOInterface:
+     if FileIOInterface(reloc_sysfs:="/proc/sys/vm/compact_unevictable_allowed", os.O_RDONLY).read()[0] != "0":
+       os.system(cmd:=f"sudo sh -c 'echo 0 > {reloc_sysfs}'")
+       assert FileIOInterface(reloc_sysfs, os.O_RDONLY).read()[0] == "0", f"Failed to disable migration of locked pages. Please run {cmd} manually."
+     return FileIOInterface("/proc/self/pagemap", os.O_RDONLY)
+
+   @functools.cache
+   def vfio(self) -> FileIOInterface|None:
+     try:
+       if not FileIOInterface.exists("/sys/module/vfio"): os.system("sudo modprobe vfio-pci disable_idle_d3=1")
+
+       FileIOInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
+       vfio_fd = FileIOInterface("/dev/vfio/vfio", os.O_RDWR)
+       vfio.VFIO_CHECK_EXTENSION(vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+
+       return vfio_fd
+     except OSError: return None
+
+   def flock_acquire(self, name:str) -> int:
+     import fcntl # to support windows
+
+     os.umask(0) # Set umask to 0 to allow creating files with 0666 permissions
+
+     # Avoid O_CREAT because we don't want to re-create/replace an existing file (triggers extra perms checks) when opening as non-owner.
+     if os.path.exists(lock_name:=temp(name)): self.lock_fd = os.open(lock_name, os.O_RDWR)
+     else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT | os.O_CLOEXEC, 0o666)
+
+     try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+     except OSError: raise RuntimeError(f"Failed to take lock file {name}. It's already in use.")
+
+     return self.lock_fd
+
+ System = _System()
+
+ class PCIDevice:
+   def __init__(self, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
+     self.pcibus, self.irq_poller = pcibus, None
+
+     if FileIOInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
+       FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
+
+     for i in resize_bars or []:
+       if FileIOInterface.exists(rpath:=f"/sys/bus/pci/devices/{self.pcibus}/resource{i}_resize"):
+         try: FileIOInterface(rpath, os.O_RDWR).write(str(int(FileIOInterface(rpath, os.O_RDONLY).read(), 16).bit_length() - 1))
+         except OSError as e: raise RuntimeError(f"Cannot resize BAR {i}: {e}. Ensure the resizable BAR option is enabled on your system.") from e
+
+     if getenv("VFIO", 0) and (vfio_fd:=System.vfio()) is not None:
+       FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
+       FileIOInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
+       iommu_group = FileIOInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
+
+       self.vfio_group = FileIOInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
+       vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(vfio_fd.fd))
+
+       with contextlib.suppress(OSError): vfio.VFIO_SET_IOMMU(vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) # set iommu works only once for the fd.
+       self.vfio_dev = FileIOInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
+
+       self.irq_fd = FileIOInterface.eventfd(0, 0)
+       self.irq_poller = select.poll()
+       self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
+
+       irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
+         argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
+       vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+     else: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
+
+     self.cfg_fd = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
+     self.bar_fds = {b: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{b}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for b in bars}
+
+     bar_info = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
+     self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
+
+   def read_config(self, offset:int, size:int): return int.from_bytes(self.cfg_fd.read(size, binary=True, offset=offset), byteorder='little')
+   def write_config(self, offset:int, value:int, size:int): self.cfg_fd.write(value.to_bytes(size, byteorder='little'), binary=True, offset=offset)
+   def map_bar(self, bar:int, off:int=0, addr:int=0, size:int|None=None, fmt='B') -> MMIOInterface:
+     fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
+     libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
+     return MMIOInterface(loc, sz, fmt=fmt)
+
+ class PCIDevImplBase:
+   mm: MemoryManager
+
+ @dataclasses.dataclass
+ class PCIAllocationMeta: mapping:VirtMapping; has_cpu_mapping:bool; hMemory:int=0 # noqa: E702
+
+ class PCIIfaceBase:
+   dev_impl:PCIDevImplBase
+   gpus:ClassVar[list[str]] = []
+
+   def __init__(self, dev, dev_id, vendor, devices, bars, vram_bar, va_start, va_size):
+     if len((cls:=type(self)).gpus) == 0:
+       cls.gpus = System.pci_scan_bus(vendor, devices)
+       visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', '')).split(',') if x.strip()]
+       cls.gpus = [cls.gpus[x] for x in visible_devices] if visible_devices else cls.gpus
+
+     # Acquire va range to avoid collisions.
+     FileIOInterface.anon_mmap(va_start, va_size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, 0)
+     self.pci_dev, self.dev, self.vram_bar = PCIDevice(cls.gpus[dev_id], bars=bars, resize_bars=[vram_bar]), dev, vram_bar
+     self.p2p_base_addr = self.pci_dev.bar_info[vram_bar][0]
+
+   def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
+     if host or (uncached and cpu_access): # host or gtt-like memory.
+       vaddr = self.dev_impl.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
+       paddrs = [(paddr, mmap.PAGESIZE) for paddr in System.alloc_sysmem(size, vaddr=vaddr, contiguous=contiguous)[1]]
+       mapping = self.dev_impl.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
+       return HCQBuffer(vaddr, size, meta=PCIAllocationMeta(mapping, has_cpu_mapping=True, hMemory=paddrs[0][0]),
+         view=MMIOInterface(mapping.va_addr, size, fmt='B'), owner=self.dev)
+
+     mapping = self.dev_impl.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contiguous=cpu_access)
+     if cpu_access: self.pci_dev.map_bar(bar=self.vram_bar, off=mapping.paddrs[0][0], addr=mapping.va_addr, size=mapping.size)
+     return HCQBuffer(mapping.va_addr, size, view=MMIOInterface(mapping.va_addr, size, fmt='B') if cpu_access else None,
+       meta=PCIAllocationMeta(mapping, has_cpu_mapping=cpu_access, hMemory=mapping.paddrs[0][0]), owner=self.dev)
+
+   def free(self, b:HCQBuffer):
+     for dev in b.mapped_devs[1:]: dev.iface.dev_impl.mm.unmap_range(b.va_addr, b.size)
+     if not b.meta.mapping.system: self.dev_impl.mm.vfree(b.meta.mapping)
+     if b.owner == self.dev and b.meta.has_cpu_mapping: FileIOInterface.munmap(b.va_addr, b.size)
+
+   def map(self, b:HCQBuffer):
+     if b.owner is not None and b.owner._is_cpu():
+       System.lock_memory(cast(int, b.va_addr), b.size)
+       paddrs, snooped, uncached = [(x, 0x1000) for x in System.system_paddrs(cast(int, b.va_addr), round_up(b.size, 0x1000))], True, False
+     elif (ifa:=getattr(b.owner, "iface", None)) is not None and isinstance(ifa, PCIIfaceBase):
+       paddrs = [(paddr if b.meta.mapping.system else (paddr + ifa.p2p_base_addr), size) for paddr,size in b.meta.mapping.paddrs]
+       snooped, uncached = b.meta.mapping.snooped, b.meta.mapping.uncached
+     else: raise RuntimeError(f"map failed: {b.owner} -> {self.dev}")
+
+     self.dev_impl.mm.map_range(cast(int, b.va_addr), round_up(b.size, 0x1000), paddrs, system=True, snooped=snooped, uncached=uncached)
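Editor's note on system_paddrs above: /proc/self/pagemap holds one 8-byte record per virtual page, with the page frame number in bits 0-54, which is exactly the mask the method applies. A minimal sketch of that lookup for a single address (Linux only; on recent kernels the PFN reads as 0 without CAP_SYS_ADMIN):

    import mmap, struct

    def paddr_of(vaddr: int) -> int:
      with open("/proc/self/pagemap", "rb") as pagemap:
        pagemap.seek(vaddr // mmap.PAGESIZE * 8)             # one 8-byte entry per virtual page
        entry = struct.unpack("<Q", pagemap.read(8))[0]
        pfn = entry & ((1 << 55) - 1)                        # bits 0-54: page frame number
        return pfn * mmap.PAGESIZE + vaddr % mmap.PAGESIZE   # physical page base + offset within page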
tinygrad/runtime/support/usb.py (new file)
@@ -0,0 +1,268 @@
+ import ctypes, struct, dataclasses, array, itertools
+ from typing import Sequence
+ from tinygrad.runtime.autogen import libusb
+ from tinygrad.helpers import DEBUG, to_mv, round_up, OSX
+ from tinygrad.runtime.support.hcq import MMIOInterface
+
+ class USB3:
+   def __init__(self, vendor:int, dev:int, ep_data_in:int, ep_stat_in:int, ep_data_out:int, ep_cmd_out:int, max_streams:int=31):
+     self.vendor, self.dev = vendor, dev
+     self.ep_data_in, self.ep_stat_in, self.ep_data_out, self.ep_cmd_out = ep_data_in, ep_stat_in, ep_data_out, ep_cmd_out
+     self.max_streams = max_streams
+     self.ctx = ctypes.POINTER(libusb.struct_libusb_context)()
+
+     if libusb.libusb_init(ctypes.byref(self.ctx)): raise RuntimeError("libusb_init failed")
+     if DEBUG >= 6: libusb.libusb_set_option(self.ctx, libusb.LIBUSB_OPTION_LOG_LEVEL, 4)
+
+     self.handle = libusb.libusb_open_device_with_vid_pid(self.ctx, self.vendor, self.dev)
+     if not self.handle: raise RuntimeError(f"device {self.vendor:04x}:{self.dev:04x} not found. sudo required?")
+
+     # Detach kernel driver if needed
+     if libusb.libusb_kernel_driver_active(self.handle, 0):
+       libusb.libusb_detach_kernel_driver(self.handle, 0)
+       libusb.libusb_reset_device(self.handle)
+
+     # Set configuration and claim interface
+     if libusb.libusb_set_configuration(self.handle, 1): raise RuntimeError("set_configuration failed")
+     if libusb.libusb_claim_interface(self.handle, 0): raise RuntimeError("claim_interface failed. sudo required?")
+     if libusb.libusb_set_interface_alt_setting(self.handle, 0, 1): raise RuntimeError("alt_setting failed")
+
+     # Clear any stalled endpoints
+     all_eps = (self.ep_data_out, self.ep_data_in, self.ep_stat_in, self.ep_cmd_out)
+     for ep in all_eps: libusb.libusb_clear_halt(self.handle, ep)
+
+     # Allocate streams
+     stream_eps = (ctypes.c_uint8 * 3)(self.ep_data_out, self.ep_data_in, self.ep_stat_in)
+     if (rc:=libusb.libusb_alloc_streams(self.handle, self.max_streams * len(stream_eps), stream_eps, len(stream_eps))) < 0:
+       raise RuntimeError(f"alloc_streams failed: {rc}")
+
+     # Base cmd
+     cmd_template = bytes([0x01, 0x00, 0x00, 0x01, *([0] * 12), 0xE4, 0x24, 0x00, 0xB2, 0x1A, 0x00, 0x00, 0x00, *([0] * 8)])
+
+     # Init pools
+     self.tr = {ep: [libusb.libusb_alloc_transfer(0) for _ in range(self.max_streams)] for ep in all_eps}
+
+     self.buf_cmd = [(ctypes.c_uint8 * len(cmd_template))(*cmd_template) for _ in range(self.max_streams)]
+     self.buf_stat = [(ctypes.c_uint8 * 64)() for _ in range(self.max_streams)]
+     self.buf_data_in = [(ctypes.c_uint8 * 0x1000)() for _ in range(self.max_streams)]
+     self.buf_data_out = [(ctypes.c_uint8 * 0x80000)() for _ in range(self.max_streams)]
+     self.buf_data_out_mvs = [to_mv(ctypes.addressof(self.buf_data_out[i]), 0x80000) for i in range(self.max_streams)]
+
+     for slot in range(self.max_streams): struct.pack_into(">B", self.buf_cmd[slot], 3, slot + 1)
+
+   def _prep_transfer(self, tr, ep, stream_id, buf, length):
+     tr.contents.dev_handle, tr.contents.endpoint, tr.contents.length, tr.contents.buffer = self.handle, ep, length, buf
+     tr.contents.status, tr.contents.flags, tr.contents.timeout, tr.contents.num_iso_packets = 0xff, 0, 1000, 0
+     tr.contents.type = (libusb.LIBUSB_TRANSFER_TYPE_BULK_STREAM if stream_id is not None else libusb.LIBUSB_TRANSFER_TYPE_BULK)
+     if stream_id is not None: libusb.libusb_transfer_set_stream_id(tr, stream_id)
+     return tr
+
+   def _submit_and_wait(self, cmds):
+     for tr in cmds: libusb.libusb_submit_transfer(tr)
+
+     running = len(cmds)
+     while running:
+       libusb.libusb_handle_events(self.ctx)
+       running = len(cmds)
+       for tr in cmds:
+         if tr.contents.status == libusb.LIBUSB_TRANSFER_COMPLETED: running -= 1
+         elif tr.contents.status != 0xFF: raise RuntimeError(f"EP 0x{tr.contents.endpoint:02X} error: {tr.contents.status}")
+
+   def send_batch(self, cdbs:list[bytes], idata:list[int]|None=None, odata:list[bytes|None]|None=None) -> list[bytes|None]:
+     idata, odata = idata or [0] * len(cdbs), odata or [None] * len(cdbs)
+     results, tr_window, op_window = [], [], []
+
+     for idx, (cdb, rlen, send_data) in enumerate(zip(cdbs, idata, odata)):
+       # allocate slot and stream. stream is 1-based
+       slot, stream = idx % self.max_streams, (idx % self.max_streams) + 1
+
+       # build cmd packet
+       self.buf_cmd[slot][16:16+len(cdb)] = list(cdb)
+
+       # cmd + stat transfers
+       tr_window.append(self._prep_transfer(self.tr[self.ep_cmd_out][slot], self.ep_cmd_out, None, self.buf_cmd[slot], len(self.buf_cmd[slot])))
+       tr_window.append(self._prep_transfer(self.tr[self.ep_stat_in][slot], self.ep_stat_in, stream, self.buf_stat[slot], 64))
+
+       if rlen:
+         if rlen > len(self.buf_data_in[slot]): self.buf_data_in[slot] = (ctypes.c_uint8 * round_up(rlen, 0x1000))()
+         tr_window.append(self._prep_transfer(self.tr[self.ep_data_in][slot], self.ep_data_in, stream, self.buf_data_in[slot], rlen))
+
+       if send_data is not None:
+         if len(send_data) > len(self.buf_data_out[slot]):
+           self.buf_data_out[slot] = (ctypes.c_uint8 * len(send_data))()
+           self.buf_data_out_mvs[slot] = to_mv(ctypes.addressof(self.buf_data_out[slot]), len(send_data))
+
+         self.buf_data_out_mvs[slot][:len(send_data)] = bytes(send_data)
+         tr_window.append(self._prep_transfer(self.tr[self.ep_data_out][slot], self.ep_data_out, stream, self.buf_data_out[slot], len(send_data)))
+
+       op_window.append((idx, slot, rlen))
+       if (idx + 1 == len(cdbs)) or len(op_window) >= self.max_streams:
+         self._submit_and_wait(tr_window)
+         for idx, slot, rlen in op_window: results.append(bytes(self.buf_data_in[slot][:rlen]) if rlen else None)
+         tr_window = []
+
+     return results
+
+ @dataclasses.dataclass(frozen=True)
+ class WriteOp: addr:int; data:bytes; ignore_cache:bool=True # noqa: E702
+
+ @dataclasses.dataclass(frozen=True)
+ class ReadOp: addr:int; size:int # noqa: E702
+
+ @dataclasses.dataclass(frozen=True)
+ class ScsiWriteOp: data:bytes; lba:int=0 # noqa: E702
+
+ class ASM24Controller:
+   def __init__(self):
+     self.usb = USB3(0xADD1, 0x0001, 0x81, 0x83, 0x02, 0x04)
+     self._cache: dict[int, int|None] = {}
+     self._pci_cacheable: list[tuple[int, int]] = []
+     self._pci_cache: dict[int, int|None] = {}
+
+     # Init controller.
+     self.exec_ops([WriteOp(0x54b, b' '), WriteOp(0x54e, b'\x04'), WriteOp(0x5a8, b'\x02'), WriteOp(0x5f8, b'\x04'),
+       WriteOp(0x7ec, b'\x01\x00\x00\x00'), WriteOp(0xc422, b'\x02'), WriteOp(0x0, b'\x33')])
+
+   def exec_ops(self, ops:Sequence[WriteOp|ReadOp|ScsiWriteOp]):
+     cdbs:list[bytes] = []
+     idata:list[int] = []
+     odata:list[bytes|None] = []
+
+     def _add_req(cdb:bytes, i:int, o:bytes|None):
+       nonlocal cdbs, idata, odata
+       cdbs, idata, odata = cdbs + [cdb], idata + [i], odata + [o]
+
+     for op in ops:
+       if isinstance(op, WriteOp):
+         for off, value in enumerate(op.data):
+           addr = ((op.addr + off) & 0x1FFFF) | 0x500000
+           if not op.ignore_cache and self._cache.get(addr) == value: continue
+           _add_req(struct.pack('>BBBHB', 0xE5, value, addr >> 16, addr & 0xFFFF, 0), 0, None)
+           self._cache[addr] = value
+       elif isinstance(op, ReadOp):
+         assert op.size <= 0xff
+         addr = (op.addr & 0x1FFFF) | 0x500000
+         _add_req(struct.pack('>BBBHB', 0xE4, op.size, addr >> 16, addr & 0xFFFF, 0), op.size, None)
+         for i in range(op.size): self._cache[addr + i] = None
+       elif isinstance(op, ScsiWriteOp):
+         sectors = round_up(len(op.data), 512) // 512
+         _add_req(struct.pack('>BBQIBB', 0x8A, 0, op.lba, sectors, 0, 0), 0, op.data+b'\x00'*((sectors*512)-len(op.data)))
+
+     return self.usb.send_batch(cdbs, idata, odata)
+
+   def write(self, base_addr:int, data:bytes, ignore_cache:bool=True): return self.exec_ops([WriteOp(base_addr, data, ignore_cache)])
+
+   def scsi_write(self, buf:bytes, lba:int=0):
+     if len(buf) > 0x4000: buf += b'\x00' * (round_up(len(buf), 0x10000) - len(buf))
+
+     for i in range(0, len(buf), 0x10000):
+       self.exec_ops([ScsiWriteOp(buf[i:i+0x10000], lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True)])
+       self.exec_ops([WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)])
+
+     if len(buf) > 0x4000:
+       for i in range(4): self.exec_ops([WriteOp(0xce40 + i, b'\x00', ignore_cache=True)])
+
+   def read(self, base_addr:int, length:int, stride:int=0xff) -> bytes:
+     parts = self.exec_ops([ReadOp(base_addr + off, min(stride, length - off)) for off in range(0, length, stride)])
+     return b''.join(p or b'' for p in parts)[:length]
+
+   def _is_pci_cacheable(self, addr:int) -> bool: return any(x <= addr <= x + sz for x, sz in self._pci_cacheable)
+   def pcie_prep_request(self, fmt_type:int, address:int, value:int|None=None, size:int=4) -> list[WriteOp]:
+     if fmt_type == 0x60 and size == 4 and self._is_pci_cacheable(address) and self._pci_cache.get(address) == value: return []
+
+     assert fmt_type >> 8 == 0 and size > 0 and size <= 4, f"Invalid fmt_type {fmt_type} or size {size}"
+     if DEBUG >= 5: print("pcie_request", hex(fmt_type), hex(address), value, size)
+
+     masked_address, offset = address & 0xFFFFFFFC, address & 0x3
+     assert size + offset <= 4 and (value is None or value >> (8 * size) == 0)
+     self._pci_cache[address] = value if size == 4 and fmt_type == 0x60 else None
+
+     return ([WriteOp(0xB220, struct.pack('>I', value << (8 * offset)), ignore_cache=False)] if value is not None else []) + \
+       [WriteOp(0xB218, struct.pack('>I', masked_address), ignore_cache=False), WriteOp(0xB21c, struct.pack('>I', address>>32), ignore_cache=False),
+        WriteOp(0xB217, bytes([((1 << size) - 1) << offset]), ignore_cache=False), WriteOp(0xB210, bytes([fmt_type]), ignore_cache=False),
+        WriteOp(0xB254, b"\x0f", ignore_cache=True), WriteOp(0xB296, b"\x04", ignore_cache=True)]
+
+   def pcie_request(self, fmt_type, address, value=None, size=4, cnt=10):
+     self.exec_ops(self.pcie_prep_request(fmt_type, address, value, size))
+
+     # Fast path for write requests
+     if ((fmt_type & 0b11011111) == 0b01000000) or ((fmt_type & 0b10111000) == 0b00110000): return
+
+     while (stat:=self.read(0xB296, 1)[0]) & 2 == 0:
+       if stat & 1:
+         self.write(0xB296, bytes([0x01]))
+         if cnt > 0: return self.pcie_request(fmt_type, address, value, size, cnt=cnt-1)
+         assert stat == 2, f"stat read 2 was {stat}"
+
+     # Retrieve completion data from Link Status (0xB22A, 0xB22B)
+     b284 = self.read(0xB284, 1)[0]
+     completion = struct.unpack('>H', self.read(0xB22A, 2))
+
+     # Validate completion status based on PCIe request type
+     # Completion TLPs for configuration requests always have a byte count of 4.
+     assert completion[0] & 0xfff == (4 if (fmt_type & 0xbe == 0x04) else size)
+
+     # Extract completion status field
+     status = (completion[0] >> 13) & 0x7
+
+     # Handle completion errors or inconsistencies
+     if status or ((fmt_type & 0xbe == 0x04) and (((value is None) and (not (b284 & 0x01))) or ((value is not None) and (b284 & 0x01)))):
+       status_map = {0b001: f"Unsupported Request: invalid address/function (target might not be reachable): {address:#x}",
+         0b100: "Completer Abort: abort due to internal error", 0b010: "Configuration Request Retry Status: configuration space busy"}
+       raise RuntimeError(f"TLP status: {status_map.get(status, 'Reserved (0b{:03b})'.format(status))}")
+
+     if value is None: return (struct.unpack('>I', self.read(0xB220, 4))[0] >> (8 * (address & 0x3))) & ((1 << (8 * size)) - 1)
+
+   def pcie_cfg_req(self, byte_addr, bus=1, dev=0, fn=0, value=None, size=4):
+     assert byte_addr >> 12 == 0 and bus >> 8 == 0 and dev >> 5 == 0 and fn >> 3 == 0, f"Invalid byte_addr {byte_addr}, bus {bus}, dev {dev}, fn {fn}"
+
+     fmt_type = (0x44 if value is not None else 0x4) | int(bus > 0)
+     address = (bus << 24) | (dev << 19) | (fn << 16) | (byte_addr & 0xfff)
+     return self.pcie_request(fmt_type, address, value, size)
+
+   def pcie_mem_req(self, address, value=None, size=4): return self.pcie_request(0x60 if value is not None else 0x20, address, value, size)
+
+   def pcie_mem_write(self, address, values, size):
+     ops = [self.pcie_prep_request(0x60, address + i * size, value, size) for i, value in enumerate(values)]
+
+     # Send in batches of 4 for OSX and 16 for Linux (benchmarked values)
+     for i in range(0, len(ops), bs:=(4 if OSX else 16)): self.exec_ops(list(itertools.chain.from_iterable(ops[i:i+bs])))
+
+ class USBMMIOInterface(MMIOInterface):
+   def __init__(self, usb, addr, size, fmt, pcimem=True):
+     self.usb, self.addr, self.nbytes, self.fmt, self.pcimem, self.el_sz = usb, addr, size, fmt, pcimem, struct.calcsize(fmt)
+
+   def __getitem__(self, index): return self._access_items(index)
+   def __setitem__(self, index, val): self._access_items(index, val)
+
+   def _access_items(self, index, val=None):
+     if isinstance(index, slice): return self._acc((index.start or 0) * self.el_sz, ((index.stop or len(self))-(index.start or 0)) * self.el_sz, val)
+     return self._acc_one(index * self.el_sz, self.el_sz, val) if self.pcimem else self._acc(index * self.el_sz, self.el_sz, val)
+
+   def view(self, offset:int=0, size:int|None=None, fmt=None):
+     return USBMMIOInterface(self.usb, self.addr+offset, size or (self.nbytes - offset), fmt=fmt or self.fmt, pcimem=self.pcimem)
+
+   def _acc_size(self, sz): return next(x for x in [('I', 4), ('H', 2), ('B', 1)] if sz % x[1] == 0)
+
+   def _acc_one(self, off, sz, val=None):
+     upper = 0 if sz < 8 else self.usb.pcie_mem_req(self.addr + off + 4, val if val is None else (val >> 32), 4)
+     lower = self.usb.pcie_mem_req(self.addr + off, val if val is None else val & 0xffffffff, min(sz, 4))
+     if val is None: return lower | (upper << 32)
+
+   def _acc(self, off, sz, data=None):
+     if data is None: # read op
+       if not self.pcimem:
+         return int.from_bytes(self.usb.read(self.addr + off, sz), "little") if sz == self.el_sz else self.usb.read(self.addr + off, sz)
+
+       acc, acc_size = self._acc_size(sz)
+       return bytes(array.array(acc, [self._acc_one(off + i * acc_size, acc_size) for i in range(sz // acc_size)]))
+     else: # write op
+       data = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data)
+
+       if not self.pcimem:
+         # Fast path for writing into buffer 0xf000
+         use_cache = 0xa800 <= self.addr <= 0xb000
+         return self.usb.scsi_write(bytes(data)) if self.addr == 0xf000 else self.usb.write(self.addr + off, bytes(data), ignore_cache=not use_cache)
+
+       _, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt))
+       self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz)
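Editor's note: for illustration, this is how the single-byte register-write command built in exec_ops above is laid out: opcode 0xE5, the value, then the windowed register address split into a high byte and a big-endian half-word. The register offset and the value 0x20 below are arbitrary example inputs.

    import struct

    addr = (0x054B & 0x1FFFF) | 0x500000                          # window the register offset, as exec_ops does
    cdb = struct.pack('>BBBHB', 0xE5, 0x20, addr >> 16, addr & 0xFFFF, 0)
    assert cdb.hex() == 'e52050054b00'                            # opcode, value, addr[23:16], addr[15:0], pad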
tinygrad/runtime/support/webgpu.py (new file)
@@ -0,0 +1,18 @@
+ import ctypes, ctypes.util, os, subprocess, platform, sysconfig
+ from tinygrad.helpers import OSX
+
+ WEBGPU_PATH: str | None
+
+ if OSX:
+   if not os.path.exists(brew_prefix:=subprocess.check_output(['brew', '--prefix', 'dawn']).decode().strip()):
+     raise FileNotFoundError('dawn library not found. Install it with `brew tap wpmed92/dawn && brew install dawn`')
+   WEBGPU_PATH = os.path.join(brew_prefix, 'lib', 'libwebgpu_dawn.dylib')
+ elif platform.system() == "Windows":
+   if not os.path.exists(pydawn_path:=os.path.join(sysconfig.get_paths()["purelib"], "pydawn")):
+     raise FileNotFoundError("dawn library not found. Install it with `pip install dawn-python`")
+   WEBGPU_PATH = os.path.join(pydawn_path, "lib", "libwebgpu_dawn.dll")
+ else:
+   if (WEBGPU_PATH:=ctypes.util.find_library('webgpu_dawn')) is None:
+     raise FileNotFoundError("dawn library not found. " +
+       "Install it with `sudo curl -L https://github.com/wpmed92/pydawn/releases/download/v0.3.0/" +
+       f"libwebgpu_dawn_{platform.machine()}.so -o /usr/lib/libwebgpu_dawn.so`")