tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +11 -6
  2. tinygrad/codegen/kernel.py +308 -175
  3. tinygrad/codegen/linearize.py +95 -0
  4. tinygrad/codegen/lowerer.py +143 -0
  5. tinygrad/codegen/transcendental.py +257 -0
  6. tinygrad/codegen/uopgraph.py +506 -0
  7. tinygrad/device.py +72 -171
  8. tinygrad/dtype.py +122 -47
  9. tinygrad/engine/jit.py +184 -87
  10. tinygrad/{lazy.py → engine/lazy.py} +74 -66
  11. tinygrad/engine/memory.py +51 -0
  12. tinygrad/engine/realize.py +86 -61
  13. tinygrad/engine/schedule.py +366 -317
  14. tinygrad/engine/search.py +58 -47
  15. tinygrad/function.py +59 -58
  16. tinygrad/helpers.py +120 -102
  17. tinygrad/multi.py +82 -78
  18. tinygrad/nn/__init__.py +116 -67
  19. tinygrad/nn/datasets.py +12 -5
  20. tinygrad/nn/optim.py +1 -1
  21. tinygrad/nn/state.py +91 -6
  22. tinygrad/ops.py +1126 -143
  23. tinygrad/renderer/__init__.py +47 -23
  24. tinygrad/renderer/cstyle.py +338 -265
  25. tinygrad/renderer/llvmir.py +125 -143
  26. tinygrad/renderer/ptx.py +225 -0
  27. tinygrad/runtime/autogen/adreno.py +17904 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
  29. tinygrad/runtime/autogen/cuda.py +6 -162
  30. tinygrad/runtime/autogen/io_uring.py +97 -63
  31. tinygrad/runtime/autogen/kfd.py +60 -47
  32. tinygrad/runtime/autogen/kgsl.py +1386 -0
  33. tinygrad/runtime/autogen/libc.py +5462 -0
  34. tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  35. tinygrad/runtime/autogen/nvrtc.py +579 -0
  36. tinygrad/runtime/autogen/opencl.py +11 -11
  37. tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  38. tinygrad/runtime/graph/clang.py +3 -3
  39. tinygrad/runtime/graph/cuda.py +11 -15
  40. tinygrad/runtime/graph/hcq.py +120 -107
  41. tinygrad/runtime/graph/metal.py +71 -43
  42. tinygrad/runtime/ops_amd.py +244 -323
  43. tinygrad/runtime/ops_clang.py +12 -5
  44. tinygrad/runtime/ops_cloud.py +220 -0
  45. tinygrad/runtime/ops_cuda.py +42 -99
  46. tinygrad/runtime/ops_disk.py +25 -26
  47. tinygrad/runtime/ops_dsp.py +181 -0
  48. tinygrad/runtime/ops_gpu.py +29 -16
  49. tinygrad/runtime/ops_hip.py +68 -0
  50. tinygrad/runtime/ops_llvm.py +15 -10
  51. tinygrad/runtime/ops_metal.py +147 -64
  52. tinygrad/runtime/ops_nv.py +356 -397
  53. tinygrad/runtime/ops_python.py +78 -79
  54. tinygrad/runtime/ops_qcom.py +405 -0
  55. tinygrad/runtime/support/__init__.py +0 -0
  56. tinygrad/runtime/support/compiler_cuda.py +77 -0
  57. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
  58. tinygrad/runtime/support/elf.py +38 -0
  59. tinygrad/runtime/support/hcq.py +539 -0
  60. tinygrad/shape/shapetracker.py +40 -50
  61. tinygrad/shape/view.py +102 -63
  62. tinygrad/tensor.py +1109 -365
  63. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
  64. tinygrad-0.10.0.dist-info/RECORD +77 -0
  65. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
  66. tinygrad/codegen/linearizer.py +0 -528
  67. tinygrad/codegen/uops.py +0 -451
  68. tinygrad/engine/graph.py +0 -100
  69. tinygrad/renderer/assembly.py +0 -269
  70. tinygrad/shape/symbolic.py +0 -327
  71. tinygrad-0.9.1.dist-info/RECORD +0 -63
  72. /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
  73. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
  74. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
tinygrad/device.py CHANGED
@@ -1,41 +1,44 @@
 from __future__ import annotations
-import multiprocessing
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from collections import defaultdict
-from typing import List, Optional, Dict, Tuple, Any, cast
-import importlib, inspect, functools, pathlib, os, ctypes, atexit, time, contextlib
-from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, ProfileLogger, PROFILE
-from tinygrad.dtype import DType, ImageDType
+from typing import Optional, Dict, Tuple, Any, Iterator
+import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys
+from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
+from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes
 from tinygrad.renderer import Renderer
 
 # **************** Device ****************
 
 class _Device:
-  def __init__(self) -> None: self._devices: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")] # noqa: E501
+  def __init__(self) -> None:
+    self._devices = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
   @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
-  def _canonicalize(self, device:str) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") # noqa: E501
+  def _canonicalize(self, device:str) -> str: return ((d:=device.split(":", 1)[0].upper()) + device[len(d):]).replace(":0", "")
   # NOTE: you can't cache canonicalize in case Device.DEFAULT changes
   def canonicalize(self, device:Optional[str]) -> str: return self._canonicalize(device) if device is not None else Device.DEFAULT
   def __getitem__(self, ix:str) -> Compiled: return self.__get_canonicalized_item(self.canonicalize(ix))
   @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
   def __get_canonicalized_item(self, ix:str) -> Compiled:
-    assert ((cpn:=multiprocessing.current_process().name) == "MainProcess") or ix.split(":")[0] in ["DISK", "NPY"], \
-           f"can only open device {ix} from parent, not {cpn}"
+    cpn = multiprocessing.current_process().name
+    assert (cpn == "MainProcess") or ix.split(":")[0] in ["DISK", "NPY"], f"can only open device {ix} from parent, not {cpn}"
     x = ix.split(":")[0].upper()
-    ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._devices][0](ix) # noqa: E501
+    ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{__name__.split(".")[0]}.runtime.ops_{x.lower()}')) \
+           if (cname.lower() == x.lower() + "device")][0](ix)
     if DEBUG >= 1: print(f"opened device {ix} from pid:{os.getpid()}")
     return ret
+  @property
+  def default(self) -> Compiled: return self[self.DEFAULT]
+  def get_available_devices(self) -> Iterator[str]:
+    for device in ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CLANG", "LLVM"]:
+      with contextlib.suppress(Exception): yield self[device].dname
   @functools.cached_property
   def DEFAULT(self) -> str:
-    device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._devices, None) # type: ignore
-    if device_from_env: return device_from_env
-    for device in ["METAL", "AMD", "CUDA", "GPU", "CLANG", "LLVM"]:
-      try:
-        if self[device]:
-          os.environ[device] = "1" # we set this in environment for spawned children
-          return device
-      except Exception: pass
-    raise RuntimeError("no usable devices")
+    if (from_env:=next((d for d in self._devices if d not in ["DISK", "NPY"] and getenv(d) == 1), None)): return from_env
+    try:
+      device = next(self.get_available_devices())
+      os.environ[device] = "1" # we set this in environment for spawned children
+      return device
+    except StopIteration as exc: raise RuntimeError("no usable devices") from exc
 Device = _Device()
 
 # **************** Buffer + Allocators ****************
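For context on the new selection flow: DEFAULT now probes the backend list above in order and returns the first one that opens, and _canonicalize simply uppercases the backend prefix and strips a trailing ":0"; a minimal usage sketch (the printed default depends on your hardware and environment):

    from tinygrad.device import Device

    assert Device.canonicalize("metal:0") == "METAL"  # ":0" is dropped
    assert Device.canonicalize("cuda:1") == "CUDA:1"  # other indices are kept
    print(Device.DEFAULT)  # first of METAL, AMD, NV, CUDA, QCOM, GPU, CLANG, LLVM that opens,
                           # unless e.g. CLANG=1 is already set in the environment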
@@ -47,12 +50,13 @@ class BufferOptions:
   cpu_access: bool = False
   host: bool = False
   nolru: bool = False
+  external_ptr: Optional[int] = None
 
 class Buffer:
   def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None,
                initial_value:Optional[bytes]=None, lb_refcount=0, base:Optional[Buffer]=None, offset:int=0, preallocate=False):
-    assert isinstance(dtype, DType)
     if isinstance(dtype, ImageDType): options = BufferOptions(image=dtype) # TODO: image hack shouldn't be here. where should it be?
+    else: assert isinstance(dtype, DType) and not isinstance(dtype, PtrDType)
     self.device, self.size, self.dtype, self.options, self.offset = device, size, dtype, options, offset
     if base is None:
       assert offset == 0, "base buffers can't have offset"
@@ -73,10 +77,12 @@ class Buffer:
   def lb_refcount(self): return self.base._lb_refcount
   def ref(self, cnt): self.base._lb_refcount += cnt
   def is_allocated(self) -> bool: return hasattr(self, '_buf')
-  def ensure_allocated(self) -> Buffer: return self.allocate() if not hasattr(self, '_buf') else self
-  def allocate(self, opaque=None) -> Buffer:
-    assert not hasattr(self, '_buf'), "can't allocate already allocated buffer"
+  def ensure_allocated(self) -> Buffer: return self.allocate() if not self.is_allocated() else self
+  def allocate(self, opaque=None, external_ptr=None) -> Buffer:
+    assert not self.is_allocated(), "can't allocate already allocated buffer"
     self.allocator = Device[self.device].allocator
+    if external_ptr is not None:
+      self.options = replace(self.options, external_ptr=external_ptr) if self.options else BufferOptions(external_ptr=external_ptr)
     if self._base is not None:
       self._base.ensure_allocated()
       assert hasattr(self.allocator, "offset"), "offset function required for view"
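For context on external_ptr: allocate() can now wrap memory the caller already owns, and __del__ skips the free when options.external_ptr is set; a hedged sketch assuming a CLANG device backed by the _MallocAllocator shown further down:

    import ctypes
    from tinygrad.device import Buffer
    from tinygrad.dtype import dtypes

    backing = (ctypes.c_uint8 * 16)()  # memory owned by the caller, not the allocator
    buf = Buffer("CLANG", 4, dtypes.float32).allocate(external_ptr=ctypes.addressof(backing))
    buf.copyin(memoryview(bytearray(16)))  # writes land in `backing`
    # options.external_ptr is set, so __del__ will not free the caller's memory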
@@ -88,7 +94,7 @@ class Buffer:
   def __reduce__(self):
     buf = None
     if self._base is not None:
-      return self.__class__, (self.device, self.size, self.dtype, None, None, None, 0, self.base, self.offset, hasattr(self, '_buf'))
+      return self.__class__, (self.device, self.size, self.dtype, None, None, None, 0, self.base, self.offset, self.is_allocated())
     if self.device == "NPY": return self.__class__, (self.device, self.size, self.dtype, self._buf, self.options, None, self.lb_refcount)
     if self.is_allocated():
       buf = bytearray(self.nbytes)
@@ -97,17 +103,17 @@ class Buffer:
   @property
   def nbytes(self): return self.size*self.dtype.itemsize
   def __del__(self):
-    if not hasattr(self, '_buf'): return
-    if self._base is None:
+    if not self.is_allocated(): return
+    if self._base is None and (self.options is None or self.options.external_ptr is None):
       if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes
       self.allocator.free(self._buf, self.nbytes, self.options)
   def __repr__(self):
-    return f"<buf real:{hasattr(self, '_buf')} device:{self.device} size:{self.size} dtype:{self.dtype}" + \
-           (f" offset:{self.offset}" if hasattr(self, "base") else "") + \
-           (">" if self.options is None else f" {self.options=}>")
+    return f"<buf real:{self.is_allocated()} device:{self.device} size:{self.size} dtype:{self.dtype}" + \
+           (f" offset:{self.offset}" if hasattr(self, "base") else "") + (f" {self.options=}" if self.options is not None else "") + ">"
   def as_buffer(self, allow_zero_copy=False, force_zero_copy=False) -> memoryview:
     # zero copy with as_buffer (disabled by default due to use after free)
-    if (force_zero_copy or allow_zero_copy) and hasattr(self.allocator, 'as_buffer'): return self.allocator.as_buffer(self._buf)
+    if (force_zero_copy or allow_zero_copy) and hasattr(self.allocator, 'as_buffer') and (self.options is None or self.options.image is None):
+      return self.allocator.as_buffer(self._buf)
     assert not force_zero_copy, "force zero copy was passed, but copy is required"
     return self.copyout(memoryview(bytearray(self.nbytes)))
   def copyin(self, mv:memoryview):
@@ -133,13 +139,16 @@ class Allocator:
     assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
     return self._alloc(size, options if options is not None else BufferOptions())
   def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
-  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
-    self._free(opaque, options if options is not None else BufferOptions())
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None): self._free(opaque, options if options is not None else BufferOptions())
   def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
   def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
   def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
 
 class LRUAllocator(Allocator): # pylint: disable=abstract-method
+  """
+  The LRU Allocator is responsible for caching buffers.
+  It ensures that buffers are not freed until it is absolutely necessary, optimizing performance.
+  """
   def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
   def alloc(self, size:int, options:Optional[BufferOptions]=None):
     if len(c := self.cache[(size, options)]): return c.pop()
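The new docstring summarizes existing behavior: freed opaques are parked in a dict keyed by (size, options) and reused on the next matching alloc; a rough standalone sketch of that policy (illustrative class, not tinygrad code):

    from collections import defaultdict

    class FreeListCache:
      def __init__(self): self.cache = defaultdict(list)
      def alloc(self, size, options=None):
        bucket = self.cache[(size, options)]
        return bucket.pop() if bucket else bytearray(size)  # reuse if possible, else really allocate
      def free(self, opaque, size, options=None):
        self.cache[(size, options)].append(opaque)  # keep it around instead of releasing it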
@@ -156,7 +165,8 @@ class LRUAllocator(Allocator): # pylint: disable=abstract-method
     else: super().free(opaque, size, options)
 
 class _MallocAllocator(LRUAllocator):
-  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
+  def _alloc(self, size:int, options:BufferOptions):
+    return (ctypes.c_uint8 * size).from_address(options.external_ptr) if options.external_ptr else (ctypes.c_uint8 * size)()
   def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
   def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
   def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
@@ -170,151 +180,42 @@ class CompileError(Exception): pass
 
 class Compiler:
   def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
-  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile(self, src:str) -> bytes: return src.encode() # NOTE: empty compiler is the default
   def compile_cached(self, src:str) -> bytes:
     if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
       assert not getenv("ASSERT_COMPILE"), f"tried to compile with ASSERT_COMPILE set\n{src}"
       lib = self.compile(src)
       if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
     return lib
+  def disassemble(self, lib:bytes): pass
 
 class Compiled:
   def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
     self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph
     self.renderer = renderer or Renderer()
-  def synchronize(self): pass # override this in your device
-
-# **************** for HCQ Compatible Devices ****************
-
-@contextlib.contextmanager
-def hcq_profile(dev, queue_type, enabled, desc):
-  st, en = (dev._get_signal(), dev._get_signal()) if enabled else (None, None)
-  if enabled: queue_type().timestamp(st).submit(dev)
-  try: yield (st, en)
-  finally:
-    if enabled: queue_type().timestamp(en).submit(dev)
-    if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
-
-class HCQCompatCompiled(Compiled):
-  def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, comp_queue_t, copy_queue_t, timeline_signals):
-    self.hw_compute_queue_t, self.hw_copy_queue_t = comp_queue_t, copy_queue_t
-    self.timeline_value: int = 1
-    self.timeline_signal, self._shadow_timeline_signal = timeline_signals
-    self.sig_prof_records: List[Tuple[Any, Any, str, bool]] = []
-    self.raw_prof_records: List[Tuple[int, int, str, bool]] = []
-    if PROFILE: self._prof_setup()
-
-    from tinygrad.runtime.graph.hcq import HCQGraph
-    super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
-
-  @classmethod
-  def _read_signal(self, sig): raise NotImplementedError("need _read_signal") # reads a value for a signal
-
-  @classmethod
-  def _read_timestamp(self, sig): raise NotImplementedError("need _read_timestamp") # reads a timestamp for a signal
-
-  @classmethod
-  def _set_signal(self, sig, value): raise NotImplementedError("need _set_signal") # sets a value for a signal
-
-  @classmethod
-  def _get_signal(self, value=0, **kwargs): raise NotImplementedError("need _get_signal") # allocates a new signal
-
-  @classmethod
-  def _wait_signal(self, signal, value=0, timeout=10000): raise NotImplementedError("need _wait_signal") # waits for a signal value
-
-  def _gpu2cpu_time(self, gpu_time, is_copy): raise NotImplementedError("need _gpu2cpu_time")
-
-  def _prof_setup(self):
-    self.profile_logger = ProfileLogger()
-
-    def _sync_queue(q_t):
-      q_t().timestamp(self.timeline_signal).signal(self.timeline_signal, self.timeline_value).submit(self)
-      self.timeline_value += 1
-      cpu_start_time = time.perf_counter_ns() / 1e3
-      self._wait_signal(self.timeline_signal, self.timeline_value - 1)
-      return cpu_start_time, self._read_timestamp(self.timeline_signal)
-    self.cpu_start_time, self.gpu_start_time = _sync_queue(self.hw_compute_queue_t)
-    self.copy_cpu_start_time, self.copy_gpu_start_time = _sync_queue(self.hw_copy_queue_t)
-
-    atexit.register(self._prof_finalize)
-
-  def _prof_process_events(self):
-    self.raw_prof_records += [(self._read_timestamp(st), self._read_timestamp(en), name, is_cp) for st, en, name, is_cp in self.sig_prof_records]
-    for st, en, _, _ in self.sig_prof_records: self.signals_pool += [st, en] # type: ignore
-    self.sig_prof_records = []
-
-  def _prof_finalize(self):
-    for st, en, name, is_cp in self.raw_prof_records:
-      self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.dname, ["COMPUTE", "DMA"][is_cp])]
-    del self.profile_logger
-
-  def _wrap_timeline_signal(self):
-    self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
-    self._set_signal(self.timeline_signal, 0)
-    cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)
-
-class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
-  def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
-    self.device = device
-    self.b = [self._alloc(batch_size, BufferOptions(host=True)) for _ in range(batch_cnt)]
-    self.b_timeline, self.b_next = [0] * len(self.b), 0
-    super().__init__()
-
-  def copyin(self, dest, src: memoryview):
-    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
-      for i in range(0, src.nbytes, self.b[0].size):
-        self.b_next = (self.b_next + 1) % len(self.b)
-        self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
-        ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.b_timeline[self.b_next] = self.device.timeline_value
-        self.device.timeline_value += 1
-
-  def copy_from_disk(self, dest, src, size):
-    def _get_temp_buf():
-      # Check if the next buffer is safe to be used (its signal has passed) and reserve it.
-      if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device._read_signal(self.device.timeline_signal):
-        self.b_timeline[(self.b_next + 1) % len(self.b)], self.b_next = (1 << 64), (self.b_next + 1) % len(self.b)
-        return (self.b[self.b_next].va_addr, self.b_next)
-      return None
-
-    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
-      for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.b_timeline[batch_info[1]] = self.device.timeline_value
-        self.device.timeline_value += 1
-
-  def copyout(self, dest:memoryview, src):
-    self.device.synchronize()
-
-    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
-      for i in range(0, dest.nbytes, self.b[0].size):
-        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
-                                     .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
-                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-        self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value)
-        self.device.timeline_value += 1
-
-        ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
-
-  def transfer(self, dest, src, sz: int, src_dev, dest_dev):
-    src_dev._gpu_map(dest)
-
-    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
-      src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
-                               .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
-                               .copy(dest.va_addr, src.va_addr, sz) \
-                               .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
-      src_dev.timeline_value += 1
-
-      if src_dev != dest_dev:
-        dest_dev.hw_compute_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
-                                     .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
-                                     .signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev)
-        dest_dev.timeline_value += 1
-
-  def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
+  def synchronize(self):
+    """
+    Synchronize all pending operations on the device.
+
+    This method ensures that all previously queued operations on the device have been completed before proceeding.
+    """
+    # override this in your device implementation
+
+# TODO: move this to each Device
+def is_dtype_supported(dtype:DType, device:Optional[str]=None) -> bool:
+  if device is None: device = Device.DEFAULT
+  if dtype == dtypes.bfloat16:
+    # NOTE: this requires bf16 buffer support
+    return device in {"AMD"} or (device in {"CUDA", "NV"} and not CI and not getenv("PTX"))
+  if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
+  # for CI GPU and OSX, cl_khr_fp16 isn't supported
+  # for CI LLVM, it segfaults because it can't link to the casting function
+  # CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
+  # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
+  if dtype == dtypes.half:
+    if device == "GPU": return not CI and not OSX
+    if device in ["CUDA", "NV"]: return not CI
+    if device == "LLVM": return OSX
+    if device == "PYTHON": return sys.version_info >= (3, 12)
+  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU")
+  return True
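is_dtype_supported now centralizes the "can this backend hold this dtype" question; the examples below follow directly from the branches above (answers for other device/dtype pairs also depend on CI, OSX and PTX state):

    from tinygrad.device import is_dtype_supported
    from tinygrad.dtype import dtypes

    is_dtype_supported(dtypes.bfloat16, "AMD")   # True: AMD has bf16 buffer support
    is_dtype_supported(dtypes.float64, "METAL")  # False: Metal has no fp64
    is_dtype_supported(dtypes.half, "PYTHON")    # True only on Python 3.12+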
tinygrad/dtype.py CHANGED
@@ -1,47 +1,79 @@
-from typing import Final, Optional, ClassVar, Set, Tuple, Dict, Union
-from dataclasses import dataclass
-import functools
+from __future__ import annotations
+from typing import Final, Optional, ClassVar, Set, Tuple, Dict, Union, Callable
+import math, struct, ctypes, functools
+from dataclasses import dataclass, fields
 from tinygrad.helpers import getenv
 
 ConstType = Union[float, int, bool]
 
-@dataclass(frozen=True, order=True)
-class DType:
+# all DTypes should only be created once
+class DTypeMetaClass(type):
+  dcache: Dict[Tuple, DType] = {}
+  def __call__(cls, *args, **kwargs):
+    if (ret:=DTypeMetaClass.dcache.get(args, None)) is not None: return ret
+    DTypeMetaClass.dcache[args] = ret = super().__call__(*args)
+    return ret
+
+@dataclass(frozen=True, eq=False)
+class DType(metaclass=DTypeMetaClass):
   priority: int # this determines when things get upcasted
   itemsize: int
   name: str
   fmt: Optional[str]
   count: int
-  def __repr__(self): return f"dtypes.{'_'*(c:=self.count!=1)}{INVERSE_DTYPES_DICT[self.name if not c else self.scalar().name]}{str(self.count)*c}"
-  def vec(self, sz:int):
-    assert sz > 1 and self.count == 1, f"can't vectorize {self} with size {sz}"
-    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self.name]}{sz}", None, sz)
-  def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.count))]] if self.count > 1 else self
-
-# dependent typing?
-@dataclass(frozen=True, repr=False)
-class ImageDType(DType):
-  shape: Tuple[int, ...] # arbitrary arg for the dtype, used in image for the shape
-  base: DType
-  def scalar(self): return self.base
-  def vec(self, sz:int): return self.base.vec(sz)
-  def __repr__(self): return f"dtypes.{self.name}({self.shape})"
-
-# @dataclass(frozen=True, init=False, repr=False, eq=False)
+  _scalar: Optional[DType]
+  @staticmethod
+  def new(priority:int, itemsize:int, name:str, fmt:Optional[str]): return DType(priority, itemsize, name, fmt, 1, None)
+  def __reduce__(self): return type(self), tuple(getattr(self, f.name) for f in fields(self))
+  def __repr__(self): return f"dtypes.{INVERSE_DTYPES_DICT[self.scalar().name]}"+(f".vec({self.count})" if self.count > 1 else "")
+  def __lt__(self, o:DType): return (self.priority, self.itemsize, self.name, self.fmt, self.count) < (o.priority, o.itemsize, o.name, o.fmt, o.count)
+  @property
+  def base(self): return self
+  @property
+  def vcount(self): return self.count
+  @functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
+  def vec(self, sz:int) -> DType:
+    assert self.count == 1, f"can't vectorize {self} with size {sz}"
+    if sz == 1 or self == dtypes.void: return self # void doesn't vectorize, and sz=1 is scalar
+    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self.name]}{sz}", None, sz, self)
+  def ptr(self, local=False) -> PtrDType: return PtrDType(self.priority, self.itemsize, self.name, self.fmt, self.count, None, self, local, 1)
+  def scalar(self) -> DType: return self._scalar if self._scalar is not None else self
+
+@dataclass(frozen=True, eq=False)
 class PtrDType(DType):
-  def __init__(self, dt:DType): super().__init__(dt.priority, dt.itemsize, dt.name, dt.fmt, dt.count)
-  def __repr__(self): return f"ptr.{super().__repr__()}"
-  def __hash__(self): return super().__hash__()
-  def __eq__(self, dt): return self.priority==dt.priority and self.itemsize==dt.itemsize and self.name==dt.name and self.count==dt.count
-  def __ne__(self, dt): return not (self == dt)
+  _base: DType
+  local: bool
+  v: int
+  @property
+  def base(self): return self._base
+  @functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
+  def vec(self, sz:int) -> DType:
+    assert self.v == 1, f"can't vectorize ptr {self} with size {sz}"
+    if sz == 1: return self # sz=1 is a scalar
+    return type(self)(*tuple(sz if f.name == 'v' else (self if f.name == '_scalar' else getattr(self, f.name)) for f in fields(self)))
+  def ptr(self, local=False): raise RuntimeError("can't make a pointer from a pointer")
+  @property
+  def vcount(self): return self.v
+  def __repr__(self): return f"{self.base.__repr__()}.ptr({'local=True' if self.local else ''})" + (f'.vec({self.v})' if self.v != 1 else '')
+
+@dataclass(frozen=True, eq=False)
+class ImageDType(PtrDType):
+  shape: Tuple[int, ...] = () # shape of the Image
+  def ptr(self, local=False) -> PtrDType:
+    assert not local, "images can't be local"
+    return self
+  def __repr__(self): return f"dtypes.{self.name}({self.shape})" + (f'.vec({self.v})' if self.v != 1 else '')
 
 class dtypes:
   @staticmethod
-  def is_float(x: DType) -> bool: return x.scalar() in (dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64)
+  @functools.lru_cache(None)
+  def is_float(x: DType) -> bool: return x.scalar() in dtypes.floats or isinstance(x, ImageDType)
   @staticmethod # static methds on top, or bool in the type info will refer to dtypes.bool
-  def is_int(x: DType) -> bool: return x.scalar() in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64) or dtypes.is_unsigned(x)
+  @functools.lru_cache(None)
+  def is_int(x: DType) -> bool: return x.scalar() in dtypes.ints
   @staticmethod
-  def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
+  @functools.lru_cache(None)
+  def is_unsigned(x: DType) -> bool: return x.scalar() in dtypes.uints
   @staticmethod
   def from_py(x) -> DType:
     if x.__class__ is float: return dtypes.default_float
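DTypeMetaClass interns every DType by its constructor arguments, so dtypes compare by identity, and vec()/ptr() return cached derived types that remember their scalar base; a small sketch of the resulting behavior:

    from tinygrad.dtype import DType, dtypes

    assert DType(11, 4, "float", 'f', 1, None) is dtypes.float32  # same args -> same interned object
    v4 = dtypes.float32.vec(4)
    assert v4 is dtypes.float32.vec(4) and v4.scalar() is dtypes.float32
    p = dtypes.int32.ptr(local=True)                              # PtrDType carrying its base dtype
    assert p.base is dtypes.int32 and p.vcount == 1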
@@ -51,23 +83,44 @@ class dtypes:
     if x.__class__ is list or x.__class__ is tuple: return max(dtypes.from_py(xi) for xi in x) if x else dtypes.default_float
     raise RuntimeError(f"Could not infer dtype of {x} with type {type(x)}")
   @staticmethod
-  def as_const(val: ConstType, dtype:DType): return int(val) if dtypes.is_int(dtype) else float(val) if dtypes.is_float(dtype) else bool(val)
+  def as_const(val: Tuple[ConstType, ...]|ConstType, dtype:DType):
+    if isinstance(val, tuple):
+      assert len(val) == dtype.count, f"mismatch {val} {dtype}"
+      return tuple(dtypes.as_const(x, dtype) for x in val)
+    # TODO: should truncate here
+    return int(val) if dtypes.is_int(dtype) else float(val) if dtypes.is_float(dtype) else bool(val)
+  @staticmethod
+  @functools.lru_cache(None)
+  def min(dtype:DType):
+    if dtypes.is_int(dtype): return 0 if dtypes.is_unsigned(dtype) else -2**(dtype.itemsize*8-1)
+    return -float("inf") if dtypes.is_float(dtype) else False
+  @staticmethod
+  @functools.lru_cache(None)
+  def max(dtype:DType):
+    if dtypes.is_int(dtype): return (2**(dtype.itemsize*8-(0 if dtypes.is_unsigned(dtype) else 1)))-1
+    return float("inf") if dtypes.is_float(dtype) else True
+  @staticmethod
+  def finfo(dtype:DType) -> Tuple[int, int]:
+    """(exponent, mantissa)"""
+    if not dtypes.is_float(dtype): raise ValueError(f"{dtype} is not a floating point type")
+    return {dtypes.float16: (5, 10), dtypes.bfloat16: (8, 7), dtypes.float32: (8, 23), dtypes.float64: (11, 52)}[dtype]
   @staticmethod
   def fields() -> Dict[str, DType]: return DTYPES_DICT
-  bool: Final[DType] = DType(0, 1, "bool", '?', 1)
-  int8: Final[DType] = DType(1, 1, "char", 'b', 1)
-  uint8: Final[DType] = DType(2, 1, "unsigned char", 'B', 1)
-  int16: Final[DType] = DType(3, 2, "short", 'h', 1)
-  uint16: Final[DType] = DType(4, 2, "unsigned short", 'H', 1)
-  int32: Final[DType] = DType(5, 4, "int", 'i', 1)
-  uint32: Final[DType] = DType(6, 4, "unsigned int", 'I', 1)
-  int64: Final[DType] = DType(7, 8, "long", 'l', 1)
-  uint64: Final[DType] = DType(8, 8, "unsigned long", 'L', 1)
-  float16: Final[DType] = DType(9, 2, "half", 'e', 1)
+  void: Final[DType] = DType.new(-1, 0, "void", None)
+  bool: Final[DType] = DType.new(0, 1, "bool", '?')
+  int8: Final[DType] = DType.new(1, 1, "char", 'b')
+  uint8: Final[DType] = DType.new(2, 1, "unsigned char", 'B')
+  int16: Final[DType] = DType.new(3, 2, "short", 'h')
+  uint16: Final[DType] = DType.new(4, 2, "unsigned short", 'H')
+  int32: Final[DType] = DType.new(5, 4, "int", 'i')
+  uint32: Final[DType] = DType.new(6, 4, "unsigned int", 'I')
+  int64: Final[DType] = DType.new(7, 8, "long", 'q')
+  uint64: Final[DType] = DType.new(8, 8, "unsigned long", 'Q')
+  float16: Final[DType] = DType.new(9, 2, "half", 'e')
   # bfloat16 has higher priority than float16, so least_upper_dtype(dtypes.int64, dtypes.uint64) = dtypes.float16
-  bfloat16: Final[DType] = DType(10, 2, "__bf16", None, 1)
-  float32: Final[DType] = DType(11, 4, "float", 'f', 1)
-  float64: Final[DType] = DType(12, 8, "double", 'd', 1)
+  bfloat16: Final[DType] = DType.new(10, 2, "__bf16", None)
+  float32: Final[DType] = DType.new(11, 4, "float", 'f')
+  float64: Final[DType] = DType.new(12, 8, "double", 'd')
 
   # dtype aliases
   half = float16; float = float32; double = float64 # noqa: E702
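The new min/max/finfo helpers are pure functions of the dtype's width and signedness, and as_const now maps element-wise over tuples for vector dtypes; expected values from the definitions above:

    from tinygrad.dtype import dtypes

    assert (dtypes.min(dtypes.int8), dtypes.max(dtypes.int8)) == (-128, 127)
    assert (dtypes.min(dtypes.uint8), dtypes.max(dtypes.uint8)) == (0, 255)
    assert dtypes.finfo(dtypes.bfloat16) == (8, 7)                       # (exponent bits, mantissa bits)
    assert dtypes.as_const((1, 0), dtypes.float32.vec(2)) == (1.0, 0.0)  # element-wise for vector dtypes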
@@ -76,17 +129,25 @@ class dtypes:
 
   # NOTE: these are image dtypes
   @staticmethod
-  def imageh(shp): return ImageDType(100, 2, "imageh", 'e', 1, shape=shp, base=dtypes.float32)
+  def imageh(shp): return ImageDType(100, 2, "imageh", 'e', 1, None, dtypes.float32, False, 1, shp)
   @staticmethod
-  def imagef(shp): return ImageDType(100, 4, "imagef", 'f', 1, shape=shp, base=dtypes.float32)
+  def imagef(shp): return ImageDType(100, 4, "imagef", 'f', 1, None, dtypes.float32, False, 1, shp)
 
   default_float: ClassVar[DType] = float32
   default_int: ClassVar[DType] = int32
 
+  floats = (float16, bfloat16, float32, float64)
+  uints = (uint8, uint16, uint32, uint64)
+  sints = (int8, int16, int32, int64)
+  ints = uints + sints
+
 if (env_default_float := getenv("DEFAULT_FLOAT", "")):
   dtypes.default_float = getattr(dtypes, env_default_float.lower())
   assert dtypes.is_float(dtypes.default_float), f"{env_default_float} is not a float dtype"
 
+DTypeLike = Union[str, DType]
+def to_dtype(dtype:DTypeLike) -> DType: return dtype if isinstance(dtype, DType) else getattr(dtypes, dtype)
+
 # https://jax.readthedocs.io/en/latest/jep/9407-type-promotion.html
 # we don't support weak type and complex type
 promo_lattice = { dtypes.bool: [dtypes.int8, dtypes.uint8], dtypes.int8: [dtypes.int16], dtypes.int16: [dtypes.int32], dtypes.int32: [dtypes.int64],
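DTypeLike and to_dtype let APIs accept either a DType or its attribute name on dtypes:

    from tinygrad.dtype import dtypes, to_dtype

    assert to_dtype("float32") is dtypes.float32   # string names are looked up on dtypes
    assert to_dtype(dtypes.int64) is dtypes.int64  # DType instances pass through unchanged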
@@ -103,11 +164,25 @@ def least_upper_dtype(*ds:DType) -> DType:
 def least_upper_float(dt:DType) -> DType: return dt if dtypes.is_float(dt) else least_upper_dtype(dt, dtypes.float32)
 
 # HACK: staticmethods are not callable in 3.8 so we have to compare the class
-DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if not (k.startswith(('__', 'default')) or v.__class__ is staticmethod)}
+DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if not (k.startswith(('__', 'default', 'void'))
+                                                                or v.__class__ is staticmethod or isinstance(v, tuple))}
 INVERSE_DTYPES_DICT = {v.name:k for k,v in DTYPES_DICT.items()}
+INVERSE_DTYPES_DICT['void'] = 'void'
 
 def sum_acc_dtype(dt:DType):
   # default acc dtype for sum
   if dtypes.is_unsigned(dt): return least_upper_dtype(dt, dtypes.uint)
   if dtypes.is_int(dt) or dt == dtypes.bool: return least_upper_dtype(dt, dtypes.int)
-  return least_upper_dtype(dt, dtypes.float)
+  return least_upper_dtype(dt, dtypes.float)
+
+def truncate_fp16(x):
+  try: return struct.unpack("@e", struct.pack("@e", float(x)))[0]
+  except OverflowError: return math.copysign(math.inf, x)
+
+truncate: Dict[DType, Callable] = {dtypes.bool: bool,
+  # TODO: bfloat16
+  dtypes.float16: truncate_fp16, dtypes.float32: lambda x: ctypes.c_float(x).value, dtypes.float64: lambda x: ctypes.c_double(x).value,
+  dtypes.uint8: lambda x: ctypes.c_uint8(x).value, dtypes.uint16: lambda x: ctypes.c_uint16(x).value,
+  dtypes.uint32: lambda x: ctypes.c_uint32(x).value, dtypes.uint64: lambda x: ctypes.c_uint64(x).value,
+  dtypes.int8: lambda x: ctypes.c_int8(x).value, dtypes.int16: lambda x: ctypes.c_int16(x).value, dtypes.int32: lambda x: ctypes.c_int32(x).value,
+  dtypes.int64: lambda x: ctypes.c_int64(x).value}
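The truncate table maps each dtype to a Python-level callable that reproduces that dtype's wraparound or rounding; expected values, derived from ctypes/struct semantics (the fp16 cases assume standard IEEE half rounding):

    from tinygrad.dtype import dtypes, truncate, truncate_fp16

    assert truncate[dtypes.uint8](300) == 44       # 300 mod 256
    assert truncate[dtypes.int8](200) == -56       # two's-complement wraparound
    assert truncate_fp16(65519) == 65504.0         # rounds to the largest finite fp16
    assert truncate_fp16(70000) == float("inf")    # overflow becomes signed infinity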