PyPI - tinygrad - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

tinygrad 0.10.0-py3-none-any.whl → 0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

tinygrad/codegen/devectorizer.py +247 -0
tinygrad/codegen/expander.py +121 -0
tinygrad/codegen/kernel.py +141 -201
tinygrad/codegen/linearize.py +223 -84
tinygrad/codegen/lowerer.py +60 -42
tinygrad/codegen/symbolic.py +476 -0
tinygrad/codegen/transcendental.py +22 -13
tinygrad/device.py +187 -47
tinygrad/dtype.py +39 -28
tinygrad/engine/jit.py +83 -65
tinygrad/engine/memory.py +4 -5
tinygrad/engine/multi.py +161 -0
tinygrad/engine/realize.py +62 -108
tinygrad/engine/schedule.py +396 -357
tinygrad/engine/search.py +55 -66
tinygrad/gradient.py +73 -0
tinygrad/helpers.py +81 -59
tinygrad/nn/__init__.py +30 -32
tinygrad/nn/datasets.py +1 -2
tinygrad/nn/optim.py +22 -26
tinygrad/nn/state.py +91 -66
tinygrad/ops.py +492 -641
tinygrad/renderer/__init__.py +95 -36
tinygrad/renderer/cstyle.py +99 -92
tinygrad/renderer/llvmir.py +83 -34
tinygrad/renderer/ptx.py +83 -99
tinygrad/renderer/wgsl.py +95 -0
tinygrad/runtime/autogen/amd_gpu.py +39507 -12
tinygrad/runtime/autogen/comgr.py +2 -0
tinygrad/runtime/autogen/kfd.py +4 -3
tinygrad/runtime/autogen/kgsl.py +1 -1
tinygrad/runtime/autogen/libc.py +404 -71
tinygrad/runtime/autogen/llvm.py +11379 -0
tinygrad/runtime/autogen/pci.py +1333 -0
tinygrad/runtime/autogen/vfio.py +891 -0
tinygrad/runtime/autogen/webgpu.py +6985 -0
tinygrad/runtime/graph/cuda.py +8 -9
tinygrad/runtime/graph/hcq.py +84 -79
tinygrad/runtime/graph/metal.py +40 -43
tinygrad/runtime/ops_amd.py +498 -334
tinygrad/runtime/ops_cloud.py +34 -34
tinygrad/runtime/ops_cpu.py +24 -0
tinygrad/runtime/ops_cuda.py +30 -27
tinygrad/runtime/ops_disk.py +62 -63
tinygrad/runtime/ops_dsp.py +159 -42
tinygrad/runtime/ops_gpu.py +30 -30
tinygrad/runtime/ops_hip.py +29 -31
tinygrad/runtime/ops_llvm.py +48 -41
tinygrad/runtime/ops_metal.py +149 -113
tinygrad/runtime/ops_npy.py +2 -2
tinygrad/runtime/ops_nv.py +238 -273
tinygrad/runtime/ops_python.py +55 -50
tinygrad/runtime/ops_qcom.py +129 -157
tinygrad/runtime/ops_webgpu.py +225 -0
tinygrad/runtime/support/allocator.py +94 -0
tinygrad/runtime/support/am/__init__.py +0 -0
tinygrad/runtime/support/am/amdev.py +396 -0
tinygrad/runtime/support/am/ip.py +463 -0
tinygrad/runtime/support/compiler_cuda.py +4 -2
tinygrad/runtime/support/elf.py +28 -4
tinygrad/runtime/support/hcq.py +256 -324
tinygrad/runtime/support/llvm.py +26 -0
tinygrad/shape/shapetracker.py +85 -53
tinygrad/shape/view.py +104 -140
tinygrad/spec.py +155 -0
tinygrad/tensor.py +835 -527
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
tinygrad/viz/index.html +544 -0
tinygrad/viz/perfetto.html +178 -0
tinygrad/viz/serve.py +205 -0
{tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
tinygrad-0.10.2.dist-info/RECORD +99 -0
{tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
tinygrad/codegen/uopgraph.py +0 -506
tinygrad/engine/lazy.py +0 -228
tinygrad/function.py +0 -212
tinygrad/multi.py +0 -177
tinygrad/runtime/graph/clang.py +0 -39
tinygrad/runtime/ops_clang.py +0 -35
tinygrad-0.10.0.dist-info/RECORD +0 -77
{tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
{tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0

tinygrad/runtime/support/hcq.py CHANGED

@@ -1,48 +1,106 @@
 from __future__ import annotations
-from typing import List, Optional, Dict, Tuple, Any, cast, Protocol, Type, Union
-import contextlib, decimal, statistics, random, json, atexit, time, array, ctypes
-from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv
+from typing import cast, Type, TypeVar, Generic, Any
+import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
+from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
 from tinygrad.renderer import Renderer
-from tinygrad.device import BufferOptions, Allocator, Compiler, Compiled, LRUAllocator
+from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
+from tinygrad.ops import sym_infer, sint, Variable, UOp
+from tinygrad.runtime.autogen import libc
-# **************** for HCQ Compatible Devices ****************
-def hcq_command(func):
+class HWInterface:
   """
-  Decorator for HWCommandQueue commands. Enables command indexing and stores metadata for command updates.
-  For example:
-    ```python
-      @hcq_command
-      def command_method(self, ...): ...
-    ```
+  Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices.
   """
-  def __wrapper(self, *args, **kwargs):
-    self.cmds_offset.append(len(self.q))
-    func(self, *args, **kwargs)
-    self.cmds_len.append(len(self.q) - self.cmds_offset[-1])
-    self.cmds_meta.append(func.__name__)
-    return self
-  return __wrapper
-class HWCommandQueue:
+  def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
+    self.path:str = path
+    self.fd:int = fd or os.open(path, flags)
+  def __del__(self):
+    if hasattr(self, 'fd'): os.close(self.fd)
+  def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg)
+  def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset)
+  def read(self, size=None, binary=False, offset=None):
+    if offset is not None: self.seek(offset)
+    with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size)
+  def write(self, content, binary=False, offset=None):
+    if offset is not None: self.seek(offset)
+    with open(self.fd, "wb" if binary else "w", closefd=False) as file: file.write(content)
+  def listdir(self): return os.listdir(self.path)
+  def seek(self, offset): os.lseek(self.fd, offset, os.SEEK_SET)
+  @staticmethod
+  def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset)
+  @staticmethod
+  def munmap(buf, sz): return libc.munmap(buf, sz)
+  @staticmethod
+  def exists(path): return os.path.exists(path)
+  @staticmethod
+  def readlink(path): return os.readlink(path)
+  @staticmethod
+  def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags))  # type: ignore[attr-defined]
+if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface  # noqa: F401 # pylint: disable=unused-import
+# **************** for HCQ Compatible Devices ****************
+SignalType = TypeVar('SignalType', bound='HCQSignal')
+DeviceType = TypeVar('DeviceType', bound='HCQCompiled')
+ProgramType = TypeVar('ProgramType', bound='HCQProgram')
+ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState')
+QueueType = TypeVar('QueueType', bound='HWQueue')
+class BumpAllocator:
+  def __init__(self, size:int, base:int=0, wrap:bool=True): self.size, self.ptr, self.base, self.wrap = size, 0, base, wrap
+  def alloc(self, size:int, alignment:int=1) -> int:
+    if round_up(self.ptr, alignment) + size > self.size:
+      if not self.wrap: raise RuntimeError("Out of memory")
+      self.ptr = 0
+    self.ptr = (res:=round_up(self.ptr, alignment)) + size
+    return res + self.base
+class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
   """
   A base class for hardware command queues in the HCQ (Hardware Command Queue) API.
-  Both compute and copy queues should have the following commands implemented.
   """
-  def __init__(self): self.q, self.binded_device, self.cmds_offset, self.cmds_len, self.cmds_meta = [], None, [], [], []
-  def __len__(self): return len(self.cmds_offset)
-  def _patch(self, cmd_idx, offset, data): self.q[(st:=self.cmds_offset[cmd_idx]+offset):st+len(data)] = array.array('I', data)
-  def _cur_cmd_idx(self) -> int:
+  def __init__(self):
+    self._q:Any = []
+    self.binded_device:DeviceType|None = None
+    self.q_sints:list[tuple[int, int]] = []
+    self.mv_sints:list[tuple[memoryview, int, int, int|None]] = []
+    self.syms:list[sint] = []
+    self._prev_resolved_syms:list[int|None] = []
+  def _new_sym(self, sym:sint) -> int:
+    if sym not in self.syms:
+      self.syms.append(sym)
+      self._prev_resolved_syms.append(None)
+    return self.syms.index(sym)
+  def q(self, *values):
     """
-    Returns the index of the command currently being enqueued.
-    Should be called only within functions that enqueue commands and are decorated with `@hcq_command`.
+    Enqueues values in the queue.
+    Args:
+      values: The values to enqueue in the queue.
     """
-    return len(self) - 1
-  @hcq_command
-  def signal(self, signal:HCQSignal, value:int):
+    for v in values:
+      if isinstance(v, UOp):
+        self.q_sints.append((len(self._q), self._new_sym(v)))
+        self._q.append(0xbadc0ded)
+      else: self._q.append(v)
+  # *** common commands  ***
+  def timestamp(self, signal:SignalType):
+    """
+    Enqueues a timestamp command which records the current time in a signal after all previously enqueued commands are completed.
+    Args:
+      signal: The signal to store the timestamp
+    """
+  def signal(self, signal:SignalType, value:sint):
     """
     Enqueues a signal command which sets the signal to the given value, ensuring all previous operations are completed.
@@ -50,11 +108,8 @@ class HWCommandQueue:
       signal: The signal to set
       value: The value to set the signal to
     """
-    self._signal(signal, value)
-  def _signal(self, signal:HCQSignal, value:int): raise NotImplementedError("backend should overload this function")
-  @hcq_command
-  def wait(self, signal:HCQSignal, value:int):
+  def wait(self, signal:SignalType, value:sint):
     """
     Enqueues a wait command which halts execution until the signal is greater than or equal to a specific value.
@@ -62,49 +117,40 @@ class HWCommandQueue:
       signal: The signal to wait on
       value: The value to wait for
     """
-    self._wait(signal, value)
-  def _wait(self, signal, value): raise NotImplementedError("backend should overload this function")
-  @hcq_command
-  def timestamp(self, signal:HCQSignal):
-    """
-    Enqueues a timestamp command which records the current time in a signal after all previously enqueued commands are completed.
+  # *** commands for compute queues ***
-    Args:
-      signal: The signal to store the timestamp
+  def memory_barrier(self):
+    """
+    Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues.
     """
-    self._timestamp(signal)
-  def _timestamp(self, signal): raise NotImplementedError("backend should overload this function")
-  def update_signal(self, cmd_idx:int, signal:Optional[Any]=None, value:Optional[int]=None):
+  def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
     """
-    Updates a previously queued signal command.
+    Enqueues an execution command for a kernel program. Only on compute queues.
     Args:
-      cmd_idx: Index of the signal command to update
-      signal: New signal to set (if None, keeps the original)
-      value: New value to set (if None, keeps the original)
+      prg: The program to execute
+      args_state: The args state to execute program with
+      global_size: The global work size
+      local_size: The local work size
     """
-    if self.cmds_meta[cmd_idx] != "signal": raise RuntimeError("called update_signal not on a signal command")
-    self._update_signal(cmd_idx, signal, value)
-    return self
-  def _update_signal(self, cmd_idx:int, signal:Optional[Any], value:Optional[int]): raise NotImplementedError("backend should overload this function")
-  def update_wait(self, cmd_idx:int, signal:Optional[Any]=None, value:Optional[int]=None):
+  # *** commands for copy queues ***
+  def copy(self, dest:sint, src:sint, copy_size:int):
     """
-    Updates a previously queued wait command.
+    Enqueues a copy command to transfer data. Only on copy queues.
     Args:
-      cmd_idx: Index of the wait command to update
-      signal: New signal to wait on (if None, keeps the original)
-      value: New value to wait for (if None, keeps the original)
+      dest: The destination of the copy
+      src: The source of the copy
+      copy_size: The size of data to copy
     """
-    if self.cmds_meta[cmd_idx] != "wait": raise RuntimeError("called update_wait not on a wait command")
-    self._update_wait(cmd_idx, signal, value)
-    return self
-  def _update_wait(self, cmd_idx:int, signal:Optional[Any], value:Optional[int]): raise NotImplementedError("backend should overload this function")
-  def bind(self, device:HCQCompiled):
+  # *** submit and bind commands  ***
+  def bind(self, dev:DeviceType):
     """
     Associates the queue with a specific device for optimized execution.
@@ -112,99 +158,65 @@ class HWCommandQueue:
     the need to copy queues into the device, thereby enhancing performance.
     Args:
-      device: The target device for queue optimization.
+      dev: The target device for queue optimization.
     Note:
       Implementing this method is optional but recommended for performance gains.
     """
-  def submit(self, device:HCQCompiled):
-    """
-    Submits the command queue to a specific device for execution.
+  def bind_args_state(self, args_state:ArgsStateType):
+    for vals, ptr, fmt in args_state.bind_data: self.bind_sints_to_ptr(*vals, ptr=ptr, fmt=fmt)
-    Args:
-      device: The device to submit the queue to
-    """
-    if self.q: self._submit(device)
-    return self
-  def _submit(self, device:HCQCompiled): raise NotImplementedError("backend should overload this function")
+  def bind_sints(self, *vals:sint, struct:ctypes.Structure, start_field:str, fmt, mask:int|None=None):
+    self.bind_sints_to_ptr(*vals, ptr=ctypes.addressof(struct) + getattr(type(struct), start_field).offset, fmt=fmt, mask=mask)
-class HWComputeQueue(HWCommandQueue):
-  @hcq_command
-  def memory_barrier(self):
-    """
-    Enqueues a memory barrier command to ensure memory coherence between agents.
-    """
-    self._memory_barrier()
-  def _memory_barrier(self): pass
+  def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt, mask:int|None=None):
+    mv = to_mv(ptr, 8*len(vals)).cast(fmt)
+    for i, val in enumerate(vals):
+      if isinstance(val, int): mv[i] = val if mask is None else ((mv[i] & ~mask) | val)
+      else: self.mv_sints.append((mv, i, self._new_sym(val), mask))
-  @hcq_command
-  def exec(self, prg:HCQProgram, args_state:HCQArgsState, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]):
-    """
-    Enqueues an execution command for a kernel program.
+  def _apply_var_vals(self, var_vals:dict[Variable, int]):
+    resolved_syms = [sym_infer(sym, var_vals) for sym in self.syms]
-    Args:
-      prg: The program to execute
-      args_state: The args state to execute program with
-      global_size: The global work size
-      local_size: The local work size
-    """
-    self._exec(prg, args_state, global_size, local_size)
-  def _exec(self, prg, args_state, global_size, local_size): raise NotImplementedError("backend should overload this function")
+    for off, sym_idx in self.q_sints:
+      if self._prev_resolved_syms[sym_idx] == resolved_syms[sym_idx]: continue
+      self._q[off] = resolved_syms[sym_idx]
-  def update_exec(self, cmd_idx:int, global_size:Optional[Tuple[int,int,int]]=None, local_size:Optional[Tuple[int,int,int]]=None):
-    """
-    Updates a previously queued execution command.
+    for mv, off, sym_idx, mask in self.mv_sints:
+      if self._prev_resolved_syms[sym_idx] == resolved_syms[sym_idx]: continue
+      mv[off] = resolved_syms[sym_idx] if mask is None else ((mv[off] & ~mask) | resolved_syms[sym_idx])
-    Args:
-      cmd_idx: Index of the execution command to update
-      global_size: New global work size (if None, keeps the original)
-      local_size: New local work size (if None, keeps the original)
-    """
-    if self.cmds_meta[cmd_idx] != "exec": raise RuntimeError("called update_exec not on an exec command")
-    self._update_exec(cmd_idx, global_size, local_size)
-    return self
-  def _update_exec(self, cmd_idx, global_size, local_size): raise NotImplementedError("backend should overload this function")
+    self._prev_resolved_syms = cast(list[int|None], resolved_syms)
-class HWCopyQueue(HWCommandQueue):
-  @hcq_command
-  def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
+  def submit(self, dev:DeviceType, var_vals:dict[Variable, int]|None=None):
     """
-    Enqueues a copy command to transfer data.
+    Submits the command queue to a specific device for execution.
     Args:
-      dest: The destination of the copy
-      src: The source of the copy
-      copy_size: The size of data to copy
+      dev: The device to submit the queue to
     """
-    self._copy(dest, src, copy_size)
-  def _copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int): raise NotImplementedError("backend should overload this function")
-  def update_copy(self, cmd_idx:int, dest:Optional[HCQBuffer]=None, src:Optional[HCQBuffer]=None):
-    """
-    Updates a previously queued copy command.
-    Args:
-      cmd_idx: Index of the copy command to update
-      dest: New destination of the copy (if None, keeps the original)
-      src: New source of the copy (if None, keeps the original)
-    """
-    if self.cmds_meta[cmd_idx] != "copy": raise RuntimeError("called update_copy not on an copy command")
-    self._update_copy(cmd_idx, dest, src)
+    if var_vals is not None: self._apply_var_vals(var_vals)
+    self._submit(dev)
     return self
-  def _update_copy(self, cmd_idx, dest, src): raise NotImplementedError("backend should overload this function")
+  def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
-class HCQSignal:
-  def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value)
+class HCQSignal(Generic[DeviceType]):
+  def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8):
+    self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
+    self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
+    self.timeline_for_device:DeviceType|None = timeline_for_device
+    if isinstance(base_addr, int):
+      self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
+      self.value_mv[0] = value
   @property
-  def value(self) -> int: return self._get_value()
+  def value(self) -> int: return self.value_mv[0]
   @value.setter
-  def value(self, new_value:int): self._set_value(new_value)
-  def _get_value(self) -> int: raise NotImplementedError("_get_value() method must be implemented")
-  def _set_value(self, new_value:int): raise NotImplementedError("_set_value() method must be implemented")
+  def value(self, new_value:int): self.value_mv[0] = new_value
   @property
   def timestamp(self) -> decimal.Decimal:
@@ -216,8 +228,12 @@ class HCQSignal:
     Returns:
       The timestamp in microseconds.
     """
-    return self._get_timestamp()
-  def _get_timestamp(self) -> decimal.Decimal: raise NotImplementedError("_get_timestamp() method must be implemented")
+    return self.timestamp_mv[0] / self.timestamp_divider
+  def _sleep(self, time_spent_waiting_ms:int):
+    """
+    Optional function which can implement sleep functionality for the signal.
+    """
   def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
     """
@@ -227,17 +243,18 @@ class HCQSignal:
       value: The value to wait for.
       timeout: Maximum time to wait in milliseconds. Defaults to 10s.
     """
-    start_time = time.time() * 1000
-    while time.time() * 1000 - start_time < timeout:
-      if self.value >= value: return
-    raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
+    start_time = int(time.perf_counter() * 1000)
+    while self.value < value and (time_spent:=int(time.perf_counter() * 1000) - start_time) < timeout:
+      self._sleep(time_spent)
+    if self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
 @contextlib.contextmanager
-def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
+def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Type[HWQueue]|None=None, queue:HWQueue|None=None):
   st, en = (dev.signal_t(), dev.signal_t()) if enabled else (None, None)
   if enabled and queue is not None: queue.timestamp(st)
   elif enabled:
+    assert queue_type is not None
     queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(st).signal(dev.timeline_signal, dev.timeline_value).submit(dev)
     dev.timeline_value += 1
@@ -245,21 +262,33 @@ def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
   finally:
     if enabled and queue is not None: queue.timestamp(en)
     elif enabled:
+      assert queue_type is not None
       queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.timeline_value).submit(dev)
       dev.timeline_value += 1
-    if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
+    if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
+class HCQArgsState(Generic[ProgramType]):
+  def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
+    self.ptr, self.prg = ptr, prg
+    self.bind_data:list[tuple[tuple[sint, ...], int, str]] = []
+  def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
+class CLikeArgsState(HCQArgsState[ProgramType]):
+  def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:list[int]|None=None):
+    super().__init__(ptr, prg, bufs, vals=vals)
+    if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
-class HCQArgsState:
-  def __init__(self, ptr:int, prg:HCQProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()): self.ptr, self.prg = ptr, prg
-  def update_buffer(self, index:int, buf:HCQBuffer): raise NotImplementedError("need update_buffer")
-  def update_var(self, index:int, val:int): raise NotImplementedError("need update_var")
+    self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
+    self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
-class HCQProgram:
-  def __init__(self, args_state_t:Type[HCQArgsState], device:HCQCompiled, name:str, kernargs_alloc_size:int):
-    self.args_state_t, self.device, self.name, self.kernargs_alloc_size = args_state_t, device, name, kernargs_alloc_size
+class HCQProgram(Generic[DeviceType]):
+  def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
+    self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
-  def fill_kernargs(self, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=(), kernargs_ptr:Optional[int]=None) -> HCQArgsState:
+  def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState:
     """
     Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
     Args:
@@ -269,10 +298,10 @@ class HCQProgram:
     Returns:
       Arguments state with the given buffers and values set for the program.
     """
-    return self.args_state_t(kernargs_ptr or self.device._alloc_kernargs(self.kernargs_alloc_size), self, bufs, vals=vals)
+    return self.args_state_t(kernargs_ptr or self.dev.kernargs_allocator.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
-  def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
-               vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
+  def __call__(self, *bufs:HCQBuffer, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1),
+               vals:tuple[int, ...]=(), wait:bool=False) -> float|None:
     """
     Enqueues the program for execution with the given arguments and dimensions.
@@ -288,103 +317,52 @@ class HCQProgram:
     """
     kernargs = self.fill_kernargs(bufs, vals)
-    q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
+    q = self.dev.hw_compute_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1).memory_barrier()
-    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
+    with hcq_profile(self.dev, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
       q.exec(self, kernargs, global_size, local_size)
-    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-    self.device.timeline_value += 1
+    q.signal(self.dev.timeline_signal, self.dev.timeline_value).submit(self.dev)
+    self.dev.timeline_value += 1
-    if wait: self.device.synchronize()
+    if wait: self.dev.synchronize()
     return (float(sig_en.timestamp - sig_st.timestamp) / 1e6) if wait else None
-class ProfileLogger:
-  writers: int = 0
-  mjson: List[Dict] = []
-  actors: Dict[Union[str, Tuple[str, str]], int] = {}
-  def __init__(self): self.events, self.deps, ProfileLogger.writers = [], [], ProfileLogger.writers + 1
-  def add_event(self, ev_name, ev_start, ev_end, actor, subactor=None, args=None): self.events += [(ev_name, ev_start, ev_end, actor, subactor, args)]
-  def _ensure_actor(self, actor_name, subactor_name):
-    if actor_name not in self.actors:
-      self.actors[actor_name] = (pid:=len(self.actors))
-      self.mjson.append({"name": "process_name", "ph": "M", "pid": pid, "args": {"name": actor_name}})
-    if (subactor_key:=(actor_name,subactor_name)) not in self.actors:
-      self.actors[subactor_key] = (tid:=len(self.actors))
-      self.mjson.append({"name": "thread_name", "ph": "M", "pid": self.actors[actor_name], "tid":tid, "args": {"name": subactor_name}})
-    return self.actors[actor_name], self.actors.get(subactor_key, -1)
-  def __del__(self):
-    # perfetto json docs: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
-    for name, st, et, actor_name, subactor_name, args in self.events:
-      pid, tid = self._ensure_actor(actor_name,subactor_name)
-      args = {k: (v if v.__class__ is str else v(et-st)) for k, v in args.items()} if args is not None else None
-      self.mjson.append({"name": name, "ph": "X", "pid": pid, "tid": tid, "ts": st, "dur": et-st, "args": args})
-    for en,st,dep_actor_name,dep_subactor_name,actor_name,subactor_name in self.deps:
-      dep_pid, dep_tid = self._ensure_actor(dep_actor_name,dep_subactor_name)
-      pid, tid = self._ensure_actor(actor_name,subactor_name)
-      self.mjson.append({"ph": "s", "pid": dep_pid, "tid": dep_tid, "id": len(self.mjson), "ts": en, "bp": "e"})
-      self.mjson.append({"ph": "f", "pid": pid, "tid": tid, "id": len(self.mjson)-1, "ts": st, "bp": "e"})
-    ProfileLogger.writers -= 1
-    if ProfileLogger.writers == 0 and len(self.mjson) > 0:
-      with open(PROFILEPATH.value, "w") as f: f.write(json.dumps({"traceEvents": self.mjson}))
-      print(f"Saved profile to {PROFILEPATH.value}. Use https://ui.perfetto.dev/ to open it.")
-class HCQCompiled(Compiled):
+class HCQCompiled(Compiled, Generic[SignalType]):
   """
   A base class for devices compatible with the HCQ (Hardware Command Queue) API.
   """
-  devices: List[HCQCompiled] = []
-  gpu2cpu_copy_time_diff: decimal.Decimal = decimal.Decimal('nan')
-  gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')
+  devices: list[HCQCompiled] = []
-  def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal],
-               comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]):
+  def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
+               comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
+    self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
     self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
     self.timeline_value:int = 1
-    self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True)
-    self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
-    self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
-    self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
-    if PROFILE: self._prof_setup()
+    self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
+    self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
+    self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
     from tinygrad.runtime.graph.hcq import HCQGraph
     super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
-    self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferOptions(cpu_access=True))
-    self.kernargs_ptr:int = self.kernargs_page.va_addr
+    self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
+    self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
     self.devices.append(self)
   def synchronize(self):
-    try: self.timeline_signal.wait(self.timeline_value - 1) if not hasattr(self, '_syncdev') else self._syncdev()
+    try: self.timeline_signal.wait(self.timeline_value - 1)
     except RuntimeError as e:
       if hasattr(self, 'on_device_hang'): self.on_device_hang()
       else: raise e
     if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
     if PROFILE:
-      self.raw_prof_records += [(st.timestamp, en.timestamp, name, is_cp, None) for st, en, name, is_cp in self.sig_prof_records]
+      Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
       self.sig_prof_records = []
-  def _alloc_kernargs(self, alloc_size:int) -> int:
-    """
-    Allocates space for arguments passed to the kernel.
-    """
-    if self.kernargs_ptr >= (self.kernargs_page.va_addr + self.kernargs_page.size - alloc_size): self.kernargs_ptr = self.kernargs_page.va_addr
-    self.kernargs_ptr = (res:=self.kernargs_ptr) + alloc_size
-    return res
-  def _ensure_shared_time_base(self):
-    if not self.gpu2cpu_compute_time_diff.is_nan(): return
-    def _sync_cpu_queue(d, q_t):
+  def _at_profile_finalize(self):
+    def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
       q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)
       d.timeline_value += 1
       st = time.perf_counter_ns()
@@ -392,134 +370,94 @@ class HCQCompiled(Compiled):
       et = time.perf_counter_ns()
       return (decimal.Decimal(et+st) / 2000) - d.timeline_signal.timestamp
-    # randomly sample the timing from GPU to CPU
-    choices: List = [(d, d.hw_compute_queue_t, []) for d in self.devices]
-    choices += [(d, d.hw_copy_queue_t, []) for d in self.devices if d.hw_copy_queue_t is not None]
-    for _ in range(100*len(self.devices)):
-      d,q,l = random.choice(choices)
-      l.append(_sync_cpu_queue(d,q))
-    for d,q,l in choices:
-      if q == d.hw_compute_queue_t: d.gpu2cpu_compute_time_diff = statistics.median(l)
-      if q == d.hw_copy_queue_t: d.gpu2cpu_copy_time_diff = statistics.median(l)
-    def _sync_gpu_to_gpu_queue(d1, d2, q1_t, q2_t):
-      q1_t().signal(d1.timeline_signal, d1.timeline_value).wait(d2.timeline_signal, d2.timeline_value) \
-            .timestamp(d1.timeline_signal).signal(d1.timeline_signal, d1.timeline_value+1).submit(d1)
-      q2_t().signal(d2.timeline_signal, d2.timeline_value).wait(d1.timeline_signal, d1.timeline_value) \
-            .timestamp(d2.timeline_signal).signal(d2.timeline_signal, d2.timeline_value+1).submit(d2)
-      d1.timeline_value += 2
-      d2.timeline_value += 2
-      d1.timeline_signal.wait(d1.timeline_value - 1)
-      d2.timeline_signal.wait(d2.timeline_value - 1)
-      return d2.timeline_signal.timestamp - d1.timeline_signal.timestamp
-    # then test it by timing the GPU to GPU times
-    jitter_matrix = [[float('nan')]*len(self.devices) for _ in range(len(self.devices))]
-    for i1, d1 in enumerate(self.devices):
-      for i2, d2 in enumerate(self.devices):
-        if d1 == d2: continue
-        d1_to_d2 = statistics.median(_sync_gpu_to_gpu_queue(d1, d2, d1.hw_compute_queue_t, d2.hw_compute_queue_t) - \
-                                     _sync_gpu_to_gpu_queue(d2, d1, d2.hw_compute_queue_t, d1.hw_compute_queue_t) for _ in range(20)) / 2
-        jitter_matrix[i1][i2] = d1_to_d2 - (d1.gpu2cpu_compute_time_diff - d2.gpu2cpu_compute_time_diff)
-    print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
-  def _gpu2cpu_time(self, gpu_time:decimal.Decimal, is_copy:bool) -> float:
-    """
-    Translates local gpu time (timestamp) into global cpu time.
-    """
-    self._ensure_shared_time_base()
-    return float(gpu_time + (self.gpu2cpu_copy_time_diff if is_copy else self.gpu2cpu_compute_time_diff))
-  def _prof_setup(self):
-    if hasattr(self, 'profile_logger'): return
-    atexit.register(self._prof_finalize)
-    self.profile_logger = ProfileLogger()
-  def _prof_finalize(self):
-    qname = ["COMPUTE", "DMA"]
-    # Sync to be sure all events on the device are recorded.
-    self.synchronize()
-    for st, en, name, is_cp, args in self.raw_prof_records:
-      self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.dname, qname[is_cp], args)]
-    for a_st, a_en, a_dev, a_is_copy, b_st, b_en, b_dev, b_is_copy in self.dep_prof_records:
-      # Perfetto connects nodes based on timing data, ensuring every choice is valid by averaging times to a midpoint.
-      a_tm, b_tm = a_dev._gpu2cpu_time((a_st+a_en)/decimal.Decimal(2), a_is_copy), b_dev._gpu2cpu_time((b_st+b_en)/decimal.Decimal(2), b_is_copy)
-      self.profile_logger.deps += [(a_tm, b_tm, a_dev.dname, qname[a_is_copy], b_dev.dname, qname[b_is_copy])]
-    self.raw_prof_records, self.dep_prof_records = [], []
-    # Remove the logger, this flushes all data written by the device.
-    del self.profile_logger
+    gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
+    if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
+    else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
+    Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff)]
   def _wrap_timeline_signal(self):
     self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
     self.timeline_signal.value = 0
-    cast(HCQAllocator, self.allocator).b_timeline = [0] * len(cast(HCQAllocator, self.allocator).b)
+    cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b)
-# Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size.
-class HCQBuffer(Protocol): va_addr:int; size:int # noqa: E702
+  def _realloc(self, oldbuf:HCQBuffer|None, new_size:int, options:BufferSpec|None=None) -> tuple[HCQBuffer, bool]:
+    if oldbuf is not None: self.allocator.free(oldbuf, oldbuf.size, options=options)
+    try: buf, realloced = self.allocator.alloc(new_size, options=options), True
+    except MemoryError: buf, realloced = self.allocator.alloc(oldbuf.size if oldbuf is not None else new_size, options=options), False
+    return buf, realloced
-class HCQAllocator(LRUAllocator): # pylint: disable=abstract-method
+class HCQBuffer:
+  def __init__(self, va_addr:sint, size:int, texture_info:Any=None, meta:Any=None, _base:HCQBuffer|None=None):
+    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
+class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   """
   A base allocator class compatible with the HCQ (Hardware Command Queue) API.
-  This class implements basic copy operations following the HCQ API, utilizing both `HWComputeQueue` and `HWCopyQueue`.
+  This class implements basic copy operations following the HCQ API, utilizing both types of `HWQueue`.
   """
-  def __init__(self, device:HCQCompiled, batch_size:int=(2 << 20), batch_cnt:int=32):
-    self.device:Any = device
-    self.b = [self._alloc(batch_size, BufferOptions(host=True)) for _ in range(batch_cnt)]
+  def __init__(self, dev:DeviceType, batch_size:int=(2 << 20), batch_cnt:int=32):
+    self.dev:DeviceType = dev
+    self.b = [self._alloc(batch_size, BufferSpec(host=True)) for _ in range(batch_cnt)]
     self.b_timeline, self.b_next = [0] * len(self.b), 0
     super().__init__()
-  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer: raise NotImplementedError("need hcq compat alloc")
+  def map(self, buf:HCQBuffer): pass
+  def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
+    return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
-  def copyin(self, dest:HCQBuffer, src:memoryview):
-    with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
+class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
+  def _copyin(self, dest:HCQBuffer, src:memoryview):
+    assert self.dev.hw_copy_queue_t is not None
+    with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.device}", enabled=PROFILE):
       for i in range(0, src.nbytes, self.b[0].size):
         self.b_next = (self.b_next + 1) % len(self.b)
-        self.device.timeline_signal.wait(self.b_timeline[self.b_next])
+        self.dev.timeline_signal.wait(self.b_timeline[self.b_next])
         ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.b_timeline[self.b_next] = self.device.timeline_value
-        self.device.timeline_value += 1
+        self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
+                                  .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
+                                  .signal(self.dev.timeline_signal, self.dev.timeline_value).submit(self.dev)
+        self.b_timeline[self.b_next] = self.dev.timeline_value
+        self.dev.timeline_value += 1
   def copy_from_disk(self, dest:HCQBuffer, src, size):
     def _get_temp_buf():
       # Check if the next buffer is safe to be used (its signal has passed) and reserve it.
-      if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device.timeline_signal.value:
+      if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.dev.timeline_signal.value:
         self.b_timeline[(self.b_next + 1) % len(self.b)], self.b_next = (1 << 64), (self.b_next + 1) % len(self.b)
         return (self.b[self.b_next].va_addr, self.b_next)
       return None
-    with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
+    assert self.dev.hw_copy_queue_t is not None
+    with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE):
       for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.b_timeline[batch_info[1]] = self.device.timeline_value
-        self.device.timeline_value += 1
+        self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
+                                  .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
+                                  .signal(self.dev.timeline_signal, self.dev.timeline_value).submit(self.dev)
+        self.b_timeline[batch_info[1]] = self.dev.timeline_value
+        self.dev.timeline_value += 1
-  def copyout(self, dest:memoryview, src:HCQBuffer):
-    self.device.synchronize()
+  def _copyout(self, dest:memoryview, src:HCQBuffer):
+    self.dev.synchronize()
-    with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
+    assert self.dev.hw_copy_queue_t is not None
+    with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> CPU", enabled=PROFILE):
       for i in range(0, dest.nbytes, self.b[0].size):
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.device.timeline_signal.wait(self.device.timeline_value)
-        self.device.timeline_value += 1
+        self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
+                                  .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
+                                  .signal(self.dev.timeline_signal, self.dev.timeline_value).submit(self.dev)
+        self.dev.timeline_signal.wait(self.dev.timeline_value)
+        self.dev.timeline_value += 1
         ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-  def transfer(self, dest:HCQBuffer, src:HCQBuffer, sz:int, src_dev, dest_dev):
-    src_dev.allocator.map(dest)
+  def _transfer(self, dest:HCQBuffer, src:HCQBuffer, sz:int, src_dev:DeviceType, dest_dev:DeviceType):
+    cast(HCQAllocator, src_dev.allocator).map(dest)
-    with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
+    assert src_dev.hw_copy_queue_t is not None
+    with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE):
       src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
                                .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
                                .copy(dest.va_addr, src.va_addr, sz) \
@@ -531,9 +469,3 @@ class HCQAllocator(LRUAllocator): # pylint: disable=abstract-method
                                    .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
                                    .signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev)
       dest_dev.timeline_value += 1
-  def map(self, buf:HCQBuffer): pass
-  def offset(self, buf, size:int, offset:int) -> HCQBuffer:
-    return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']},
-                     **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf)

tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl