tinygrad 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -6
- tinygrad/codegen/kernel.py +230 -190
- tinygrad/codegen/linearizer.py +278 -384
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +132 -275
- tinygrad/dtype.py +53 -37
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +28 -14
- tinygrad/helpers.py +72 -43
- tinygrad/lazy.py +141 -240
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +179 -8
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +106 -28
- tinygrad/nn/state.py +86 -17
- tinygrad/ops.py +70 -44
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +299 -206
- tinygrad/renderer/llvmir.py +118 -123
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +59 -54
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +37 -41
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +16 -14
- tinygrad/runtime/ops_cuda.py +130 -38
- tinygrad/runtime/ops_disk.py +45 -42
- tinygrad/runtime/ops_gpu.py +52 -50
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +36 -56
- tinygrad/runtime/ops_metal.py +42 -24
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +41 -105
- tinygrad/shape/symbolic.py +98 -95
- tinygrad/shape/view.py +137 -35
- tinygrad/tensor.py +2367 -442
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/METADATA +19 -9
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/features/image.py +0 -93
- tinygrad/features/multi.py +0 -103
- tinygrad/features/search.py +0 -160
- tinygrad/graph.py +0 -106
- tinygrad/jit.py +0 -152
- tinygrad/realize.py +0 -50
- tinygrad/runtime/graph/hip.py +0 -24
- tinygrad/runtime/ops_cpu.py +0 -45
- tinygrad/runtime/ops_hip.py +0 -97
- tinygrad/runtime/ops_torch.py +0 -49
- tinygrad-0.8.0.dist-info/RECORD +0 -41
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
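Beyond the new runtime backends, this release moves several modules: the engine/ package is new (jit, realize, schedule, search, graph), features/multi.py becomes tinygrad/multi.py, and mlops.py is renamed to function.py, so 0.8.0-era imports of the old paths no longer resolve. A minimal, hypothetical compatibility sketch using only the module paths from the list above (the symbols inside those modules are not shown in this diff):

import importlib

def import_moved(new_path: str, old_path: str):
  # try the 0.9.0 location first, then fall back to the 0.8.0 location
  try: return importlib.import_module(new_path)
  except ModuleNotFoundError: return importlib.import_module(old_path)

jit_mod = import_moved("tinygrad.engine.jit", "tinygrad.jit")
search_mod = import_moved("tinygrad.engine.search", "tinygrad.features.search")
multi_mod = import_moved("tinygrad.multi", "tinygrad.features.multi")
function_mod = import_moved("tinygrad.function", "tinygrad.mlops")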
tinygrad/device.py
CHANGED
@@ -1,326 +1,183 @@
 from __future__ import annotations
-import
+import multiprocessing
+from dataclasses import dataclass
 from collections import defaultdict
-from typing import
-import importlib, inspect, functools, pathlib,
-from tinygrad.
-from tinygrad.
-from tinygrad.
-from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, UnaryOps, Op, GlobalCounters
-
-if TYPE_CHECKING:
-  from tinygrad.codegen.linearizer import Linearizer
-  from tinygrad.codegen.kernel import LinearizerOptions
+from typing import List, Optional, Dict, Tuple, Any
+import importlib, inspect, functools, pathlib, os, ctypes
+from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
+from tinygrad.dtype import DType, ImageDType
+from tinygrad.renderer import Renderer

 # **************** Device ****************

 class _Device:
   def __init__(self) -> None: self._devices: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")] # noqa: E501
-  def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT # noqa: E501
-  def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]: return self.__get_canonicalized_item(self.canonicalize(ix))
   @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
-  def
+  def _canonicalize(self, device:str) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") # noqa: E501
+  # NOTE: you can't cache canonicalize in case Device.DEFAULT changes
+  def canonicalize(self, device:Optional[str]) -> str: return self._canonicalize(device) if device is not None else Device.DEFAULT
+  def __getitem__(self, ix:str) -> Compiled: return self.__get_canonicalized_item(self.canonicalize(ix))
+  @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
+  def __get_canonicalized_item(self, ix:str) -> Compiled:
+    if DEBUG >= 1: print(f"opening device {ix} from pid:{os.getpid()}")
+    assert multiprocessing.current_process().name == "MainProcess" or ix.split(":")[0] in ["DISK", "NPY"], f"can only open device {ix} from parent"
     x = ix.split(":")[0].upper()
-
-    if isinstance(ret, type): ret = ret(ix)
-    return ret
+    return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._devices][0](ix) # noqa: E501
   @functools.cached_property
   def DEFAULT(self) -> str:
     device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._devices, None) # type: ignore
     if device_from_env: return device_from_env
-    for device in ["METAL", "
+    for device in ["METAL", "HSA", "CUDA", "GPU", "CLANG", "LLVM"]:
       try:
-        if self[device]:
+        if self[device]:
+          os.environ[device] = "1" # we set this in environment for spawned children
+          return device
       except Exception: pass
-
+    raise RuntimeError("no usable devices")
 Device = _Device()

-# ****************
-
-class JITRunner:
-  def __init__(self):
-    self.op_estimate, self.mem_estimate = 0, 0
-  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
-    var_vals = var_vals if var_vals is not None else {}
-    from tinygrad.jit import CacheCollector
-    et = self(rawbufs, var_vals)
-    CacheCollector.add(self, rawbufs, var_vals)
-    return et
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
-    raise NotImplementedError("override this")
-
-def update_stats(name:str, op_estimate:sint, mem_estimate:sint, var_vals: Optional[Dict[Variable, int]], et: Optional[float], buf_count:int, jit=False, num_kernels=1, lra: Optional[Dict]=None, device:str="", first_run=False): # noqa: E501
-  if var_vals is None: var_vals = {}
-  op_estimate, mem_estimate = sym_infer(op_estimate, var_vals), sym_infer(mem_estimate, var_vals)
-  GlobalCounters.kernel_count += num_kernels
-  GlobalCounters.global_ops += op_estimate
-  GlobalCounters.global_mem += mem_estimate
-  if et is not None: GlobalCounters.time_sum_s += et
-  if DEBUG >= 2:
-    ptm = (colored(f"{et*1e3:9.2f}ms", "yellow") if et > 0.01 else f"{et*1e6:9.2f}us") if et is not None else ""
-    print(f"{colored(f'*** {device[:7]:7s} {GlobalCounters.kernel_count:4d}', ('magenta' if num_kernels == 1 else 'CYAN') if jit else ('green' if first_run else None))} {name+' '*(37-ansilen(name))} arg {buf_count:3d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " + # noqa: E501
-          (str() if et is None else f"tm {ptm}/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_estimate/((et or 1e-20)*1e9):8.2f} GFLOPS, {mem_estimate/((et or 1e-20)*1e9):7.2f} GB/s)")) # noqa: E501
+# **************** Buffer + Allocators ****************

-
+@dataclass(frozen=True, eq=True)
+class BufferOptions:
+  image: Optional[ImageDType] = None
+  uncached: bool = False
+  cpu_access: bool = False
+  host: bool = False
+  nolru: bool = False

 class Buffer:
-  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None
+  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None,
+               initial_value:Optional[bytes]=None, lb_refcount=0, base:Optional[Buffer]=None, offset:int=0, preallocate=False):
     assert isinstance(dtype, DType)
-
+    if isinstance(dtype, ImageDType): options = BufferOptions(image=dtype) # TODO: image hack shouldn't be here. where should it be?
+    self.device, self.size, self.dtype, self.options, self.offset = device, size, dtype, options, offset
+    if base is None:
+      assert offset == 0, "base buffers can't have offset"
+      self._base = None
+      self._lb_refcount = lb_refcount
+      if opaque is not None: self.allocate(opaque)
+      if initial_value is not None:
+        self.allocate()
+        self.copyin(memoryview(initial_value))
+    else:
+      assert base._base is None, "base can't have a base"
+      assert device == base.device, "base must have the same device"
+      self._base = base
+    if preallocate: self.allocate()
+  @property
+  def base(self) -> Buffer: return self._base if self._base is not None else self
+  @property
+  def lb_refcount(self): return self.base._lb_refcount
+  def ref(self, cnt): self.base._lb_refcount += cnt
+  def is_allocated(self) -> bool: return hasattr(self, '_buf')
+  def ensure_allocated(self) -> Buffer: return self.allocate() if not hasattr(self, '_buf') else self
+  def allocate(self, opaque=None) -> Buffer:
+    assert not hasattr(self, '_buf'), "can't allocate already allocated buffer"
     self.allocator = Device[self.device].allocator
-
-
-
-
+    if self._base is not None:
+      self._base.ensure_allocated()
+      assert hasattr(self.allocator, "offset"), "offset function required for view"
+      self._buf: Any = self.allocator.offset(self.base._buf, self.nbytes, self.offset)
+    else:
+      self._buf = opaque if opaque is not None else self.allocator.alloc(self.nbytes, self.options)
+      if not self.device.startswith("DISK"): GlobalCounters.mem_used += self.nbytes
+    return self
+  def __reduce__(self):
+    buf = None
+    if self._base is not None:
+      return self.__class__, (self.device, self.size, self.dtype, None, None, None, 0, self.base, self.offset, hasattr(self, '_buf'))
+    if self.device == "NPY": return self.__class__, (self.device, self.size, self.dtype, self._buf, self.options, None, self.lb_refcount)
+    if self.is_allocated():
+      buf = bytearray(self.nbytes)
+      self.copyout(memoryview(buf))
+    return self.__class__, (self.device, self.size, self.dtype, None, self.options, buf, self.lb_refcount)
+  @property
+  def nbytes(self): return self.size*self.dtype.itemsize
   def __del__(self):
-    if not hasattr(self, '_buf'): return
-    if
-
-
-  def __repr__(self):
+    if not hasattr(self, '_buf'): return
+    if self._base is None:
+      if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes
+      self.allocator.free(self._buf, self.nbytes, self.options)
+  def __repr__(self):
+    return f"<buf real:{hasattr(self, '_buf')} device:{self.device} size:{self.size} dtype:{self.dtype}" + \
+           (f" offset:{self.offset}" if hasattr(self, "base") else "") + \
+           (">" if self.options is None else f" {self.options=}>")
+  def as_buffer(self, allow_zero_copy=False, force_zero_copy=False) -> memoryview:
+    # zero copy with as_buffer (disabled by default due to use after free)
+    if (force_zero_copy or allow_zero_copy) and hasattr(self.allocator, 'as_buffer'): return self.allocator.as_buffer(self._buf)
+    assert not force_zero_copy, "force zero copy was passed, but copy is required"
+    return self.copyout(memoryview(bytearray(self.nbytes)))
   def copyin(self, mv:memoryview):
     mv = flat_mv(mv)
-    assert len(mv) == self.
+    assert len(mv) == self.nbytes, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
+    assert self.is_allocated(), "can't copyin to unallocated buffer"
     self.allocator.copyin(self._buf, mv)
     return self
-
-
-
-
-
-
-
-
-    return
-
-def _internal_buffer_copy(dest:Buffer, src:Buffer):
-  if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator): # noqa: E721
-    # fast path, used on HIP between GPUs
-    # NOTE: it's important we use the dest device here to ensure the transfer is ready
-    Device[src.device].synchronize() # TODO: async this
-    dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
-    return
-  if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):
-    # fast path, used on Metal in OS X Sonoma
-    # NOTE: this is *only* faster if the pages from disk are already loaded into memory
-    fb = dest.allocator.from_buffer(src.allocator.as_buffer(src._buf))
-    if fb:
-      dest.allocator.transfer(dest._buf, fb, dest.size*dest.dtype.itemsize)
-      return
-  if hasattr(dest.allocator, 'copy_from_fd') and src.device.startswith("DISK") and src.size*src.dtype.itemsize >= 4096 and src._buf.ud.fd is not None:
-    dest.allocator.copy_from_fd(dest._buf, src._buf.ud.fd, src._buf.offset, src.size*src.dtype.itemsize)
-  elif hasattr(dest.allocator, 'as_buffer'):
-    # fast(ish) path, uses readinto in diskbuffers
-    src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
-  elif hasattr(src.allocator, 'as_buffer'):
-    dest.allocator.copyin(dest._buf, src.allocator.as_buffer(src._buf))
-  else:
-    # slow path, allocates a CPU buffer
-    dest.copyin(src.toCPU().data)
-
-class _BufferCopy(JITRunner):
-  # TODO: make wait work
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False):
-    dest, src = rawbufs
-    assert dest.size == src.size, f"buffer copy size mismatch, {dest.size} != {src.size}"
-    assert dest.dtype == src.dtype, f"buffer copy dtype mismatch, {dest.dtype} != {src.dtype}"
-    st = time.perf_counter()
-    _internal_buffer_copy(dest, src)
-    et = None
-    if wait or DEBUG >= 2:
-      Device[dest.device].synchronize()
-      et = time.perf_counter() - st
-    update_stats(colored(f"copy {dest.size:8d}, {dest.device[:7]:>7s} <- {src.device[:7]:7s}", "yellow"), 0, dest.size*dest.dtype.itemsize, {}, et, 2, jit, device=dest.device) # noqa: E501
-BufferCopy = _BufferCopy()
+  def copyout(self, mv:memoryview) -> memoryview:
+    mv = flat_mv(mv)
+    assert len(mv) == self.nbytes, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
+    assert self.is_allocated(), "can't copyout unallocated buffer"
+    self.allocator.copyout(mv, self._buf)
+    return mv
+  def view(self, size:int, dtype:DType, offset:int) -> Buffer:
+    assert offset < self.nbytes, "offset must be less than nbytes"
+    if self._base is not None: return Buffer(self.device, size, dtype, base=self._base, offset=self.offset+offset)
+    return Buffer(self.device, size, dtype, base=self, offset=offset)

 # TODO: size, dest, src are the same type. can we enforce this?
-sz_type = Union[ImageDType, int]
 class Allocator:
-  def alloc(self, size:
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
     assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
-    return self.
-  def _alloc(self, size:int): raise NotImplementedError("need alloc")
-  def
-
-  def _free(self, opaque): pass
+    return self._alloc(size, options if options is not None else BufferOptions())
+  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
+    self._free(opaque, options if options is not None else BufferOptions())
+  def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
   def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
   def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")

 class LRUAllocator(Allocator): # pylint: disable=abstract-method
-  def __init__(self): self.cache: Dict[
-  def alloc(self, size:
-    if len(c := self.cache[size]): return c.pop()
-    try:
-
-    except MemoryError:
+  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    if len(c := self.cache[(size, options)]): return c.pop()
+    try: return super().alloc(size, options)
+    except (RuntimeError, MemoryError):
       self.free_cache()
-      return super().alloc(size)
+      return super().alloc(size, options)
   def free_cache(self):
-    for opaques in self.cache.
-      for opaque in opaques:
+    for (sz,options),opaques in self.cache.items():
+      for opaque in opaques: super().free(opaque, sz, options)
       opaques.clear()
-  def free(self, opaque:Any, size:
-    if getenv("LRU", 1): self.cache[size].append(opaque)
-    else:
+  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
+    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
+    else: super().free(opaque, size, options)

 class _MallocAllocator(LRUAllocator):
-  def _alloc(self, size:int): return (ctypes.c_uint8 * size)()
+  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
   def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
   def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
   def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
-
-
-# **************** for Interpreted Devices ****************
-
-class InterpretedASTRunner(JITRunner):
-  def __init__(self, ast:LazyOp, fxn:Callable):
-    super().__init__()
-    self.fxn = fxn
-    info = get_lazyop_info(ast)
-    self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
-
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> float:
-    st = time.perf_counter()
-    rawbufs[0]._buf = self.fxn([x._buf for x in rawbufs[1:]], var_vals)
-    et = time.perf_counter() - st
-    update_stats(f"<interpreted {rawbufs[0].size}>", self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, device=rawbufs[0].device)
-    return et
-
-class Interpreted:
-  def __init__(self, allocator: Allocator, fxn_for_op:Dict[Op, Callable]):
-    self.allocator, self.fxn_for_op = allocator, fxn_for_op
-    self.synchronize, self.codegen, self.graph = lambda: None, None, None
+  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])

-
-  def get_runner(self, ast:LazyOp) -> InterpretedASTRunner: return _get_interpreted_fxn(self.fxn_for_op, ast)
-
-def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> InterpretedASTRunner:
-  if DEBUG >= 3:
-    from tinygrad.graph import print_tree
-    print_tree(ast)
-  tglob: Dict[str, Any] = {"Variable": Variable}
-
-  @functools.lru_cache(None)
-  def gstr(x:Any, nm=None) -> str:
-    if ('Variable' in (str_arg := repr(x)) or 'NumNode' in str_arg):
-      str_arg = re.sub(r'Variable\(.*?\)', lambda m: f'var_vals[{str(m.group(0))}]', str_arg)
-      # TODO: (Variable - Variable) might create NumNode. can we remove it?
-      return re.sub(r'NumNode\((.*?)\)', r'\1', str_arg)
-    ret = str(nm).replace(".", "_") if nm else f"m{len(tglob):04d}"
-    tglob[ret] = x
-    return ret
-
-  lines: List[str] = []
-  @functools.lru_cache(None)
-  def _interpret_ast(ast:LazyOp) -> str:
-    # TODO: shortcutted store won't work with strides
-    if ast.op == BufferOps.STORE: return _interpret_ast(ast.src[0])
-    if TernaryOps.MULACC in fxn_for_op and ast.op == ReduceOps.SUM:
-      if ast.src[0].op == BinaryOps.MUL: ast = LazyOp(TernaryOps.MULACC, ast.src[0].src, ast.arg)
-      if (castop:=ast.src[0]).op == UnaryOps.CAST and (mulop:=castop.src[0]).op == BinaryOps.MUL:
-        # MULACC with acc cast rewrite: MUL -> CAST -> SUM => CAST -> MULACC
-        ast = LazyOp(TernaryOps.MULACC, tuple(LazyOp(UnaryOps.CAST, (s, ), castop.arg) for s in mulop.src), ast.arg)
-
-    if ast.op in BufferOps:
-      if ast.op == BufferOps.CONST: tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({gstr(ast.arg.val)}, {gstr(ast.arg.dtype)})"
-      else: tmp = f"{gstr(fxn_for_op[UnaryOps.CAST], UnaryOps.CAST)}(inputs[{ast.arg.idx-1}], ({gstr(ast.arg.dtype)}, True))"
-      for mop,arg in ast.arg.st.to_movement_ops(): tmp = f"{gstr(fxn_for_op[mop], mop)}({tmp}, {gstr(arg)})"
-    else:
-      tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({', '.join([_interpret_ast(src) for src in ast.src] + ([gstr(ast.arg)] if ast.arg else []))})"
-
-    ret = f"a{len(lines)}"
-    lines.append(f"  {ret} = {tmp}")
-    return ret
-
-  ret = _interpret_ast(ast)
-  src = '\n'.join(['def run(inputs, var_vals):'] + lines + [f"  return {ret}"])
-  if DEBUG >= 4: print(functools.reduce(lambda x,y: (x.replace(y[0], str(y[1])) if y[0][0:2] == "m0" else x), tglob.items(), src))
-  exec(compile(src, "<ast>", "exec"), tglob) # pylint: disable=exec-used
-  return InterpretedASTRunner(ast, tglob['run'])
+MallocAllocator = _MallocAllocator()

 # **************** for Compiled Devices ****************

-class
-  def __init__(self, ast:Optional[LazyOp], name:str, prg:str, lib:bytes, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None): # noqa: E501
-    super().__init__()
-    if DEBUG >= 4: print(prg)
-    if global_size is not None: global_size = global_size + [1]*(3-len(global_size))
-    if local_size is not None: local_size = local_size + [1]*(3-len(local_size))
-    self.name, self.display_name, self.prg, self.lib, self.global_size, self.local_size, self.first_run = \
-      to_function_name(name), name, prg, lib, global_size, local_size, True
-    self.vars: List[Variable] = []
-    if ast:
-      info = get_lazyop_info(ast)
-      self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
-      self.vars = ast.vars()
-      assert all(v._val is None for v in self.vars), f"ASTRunner contains bound Variable {self.vars}"
+class CompileError(Exception): pass

-
-
-
-
-
-
-
-
-
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False, do_update_stats=True) -> Optional[float]:
-    global_size, local_size = self.launch_dims(var_vals)
-    if global_size is not None and local_size is None and all_int(self.global_size): # type: ignore[arg-type]
-      # TODO: this is copied from get_program
-      from tinygrad.features.search import optimize_local_size
-      local_size = self.local_size = optimize_local_size(self.clprg, global_size, rawbufs)
-      global_size = self.global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
-    lra = {}
-    if global_size: lra['global_size'] = global_size
-    if local_size: lra['local_size'] = local_size
-    et = self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.vars), wait=wait or DEBUG>=2)
-    if do_update_stats: update_stats(self.display_name, self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, lra=lra, device=rawbufs[0].device, first_run=self.first_run) # noqa: E501
-    self.first_run = False
-    return et
+class Compiler:
+  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
+  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile_cached(self, src:str) -> bytes:
+    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
+      assert not getenv("ASSERT_COMPILE"), "tried to compile with ASSERT_COMPILE set"
+      lib = self.compile(src)
+      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
+    return lib

 class Compiled:
-  def __init__(self,
-    self.
+  def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
+    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
+    self.renderer = renderer if renderer else Renderer()
   def synchronize(self): pass # override this in your device
-
-  def to_program(self, k:Linearizer) -> CompiledASTRunner:
-    assert self.compiler is not None, f"compiler is None, can't build {k.ast}"
-    k.linearize()
-    src = self.renderer(to_function_name(k.name), k.uops)
-    if getenv("DISABLE_COMPILER_CACHE") or '<' in self.compiler.__name__:
-      lib = self.compiler(src)
-    else:
-      lib = diskcache_get(self.compiler.__name__, src)
-      if lib is None:
-        lib = self.compiler(src)
-        diskcache_put(self.compiler.__name__, src, lib)
-    return CompiledASTRunner(k.ast, k.name, src, lib, k.global_size, k.local_size).build(self.runtime)
-
-  def get_linearizer(self, ast:LazyOp) -> Linearizer:
-    if DEBUG >= 3:
-      from tinygrad.graph import print_tree
-      print_tree(ast)
-    from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(ast, self.linearizer_opts)
-    k.required_optimizations()
-    if not NOOPT:
-      if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
-      if BEAM >= 1:
-        lins = [(("tc" if used_tensor_cores else "hc"), k)]
-        if used_tensor_cores:
-          lins.append(("hc", Linearizer(ast, self.linearizer_opts)))
-          lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(ast, self.linearizer_opts)
-        kb.required_optimizations()
-        from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
-        # TODO: this shouldn't use Device.DEFAULT, it should get the device from the LinearizerOptions
-        test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
-        lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
-        timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-        if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-        k = timed[0][1]
-    return k
-
-  @functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
-  def get_runner(self, ast:LazyOp) -> CompiledASTRunner: return self.to_program(self.get_linearizer(ast))
tinygrad/dtype.py
CHANGED
@@ -1,39 +1,43 @@
-from typing import
+from typing import Final, Optional, ClassVar, Set, Tuple, Dict, Union
+from dataclasses import dataclass
 import numpy as np # TODO: remove numpy
 import functools
+from tinygrad.helpers import getenv

-
-
+ConstType = Union[float, int, bool]
+
+@dataclass(frozen=True, order=True)
+class DType:
   priority: int # this determines when things get upcasted
   itemsize: int
   name: str
-
-
-  def __repr__(self): return f"dtypes.{INVERSE_DTYPES_DICT[self
+  fmt: Optional[str]
+  count: int
+  def __repr__(self): return f"dtypes.{'_'*(c:=self.count!=1)}{INVERSE_DTYPES_DICT[self.name if not c else self.scalar().name]}{str(self.count)*c}"
   def vec(self, sz:int):
-    assert sz > 1 and self.
-    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self]}{sz}", None, sz)
-  def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.
+    assert sz > 1 and self.count == 1, f"can't vectorize {self} with size {sz}"
+    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self.name]}{sz}", None, sz)
+  def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.count))]] if self.count > 1 else self
+  # TODO: someday this will be removed with the "remove numpy" project
+  @property
+  def np(self) -> Optional[type]: return np.dtype(self.fmt).type if self.fmt is not None else None

 # dependent typing?
+@dataclass(frozen=True, repr=False)
 class ImageDType(DType):
-
-
-  def __init__(self, priority, itemsize, name, np, shape, base):
-    self.shape: Tuple[int, ...] = shape # arbitrary arg for the dtype, used in image for the shape
-    self.base: DType = base
-    super().__init__()
+  shape: Tuple[int, ...] # arbitrary arg for the dtype, used in image for the shape
+  base: DType
   def scalar(self): return self.base
   def vec(self, sz:int): return self.base.vec(sz)
   def __repr__(self): return f"dtypes.{self.name}({self.shape})"
-  # TODO: fix this to not need these
-  def __hash__(self): return hash((super().__hash__(), self.shape))
-  def __eq__(self, x): return super().__eq__(x) and self.shape == x.shape
-  def __ne__(self, x): return super().__ne__(x) or self.shape != x.shape

+# @dataclass(frozen=True, init=False, repr=False, eq=False)
 class PtrDType(DType):
-  def
+  def __init__(self, dt:DType): super().__init__(dt.priority, dt.itemsize, dt.name, dt.fmt, dt.count)
   def __repr__(self): return f"ptr.{super().__repr__()}"
+  def __hash__(self): return super().__hash__()
+  def __eq__(self, dt): return self.priority==dt.priority and self.itemsize==dt.itemsize and self.name==dt.name and self.count==dt.count
+  def __ne__(self, dt): return not (self == dt)

 class dtypes:
   @staticmethod
@@ -43,25 +47,27 @@ class dtypes:
   @staticmethod
   def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
   @staticmethod
-  def from_np(x) -> DType: return DTYPES_DICT[np.dtype(x).name]
+  def from_np(x: type) -> DType: return DTYPES_DICT[np.dtype(x).name]
   @staticmethod # NOTE: isinstance(True, int) is True in python
   def from_py(x) -> DType: return dtypes.default_float if isinstance(x, float) else dtypes.bool if isinstance(x, bool) else dtypes.default_int
   @staticmethod
+  def as_const(val: ConstType, dtype:DType): return int(val) if dtypes.is_int(dtype) else float(val) if dtypes.is_float(dtype) else bool(val)
+  @staticmethod
   def fields() -> Dict[str, DType]: return DTYPES_DICT
-  bool: Final[DType] = DType(0, 1, "bool",
-  int8: Final[DType] = DType(1, 1, "char",
-  uint8: Final[DType] = DType(2, 1, "unsigned char",
-  int16: Final[DType] = DType(3, 2, "short",
-  uint16: Final[DType] = DType(4, 2, "unsigned short",
-  int32: Final[DType] = DType(5, 4, "int",
-  uint32: Final[DType] = DType(6, 4, "unsigned int",
-  int64: Final[DType] = DType(7, 8, "long",
-  uint64: Final[DType] = DType(8, 8, "unsigned long",
-  float16: Final[DType] = DType(9, 2, "half",
+  bool: Final[DType] = DType(0, 1, "bool", '?', 1)
+  int8: Final[DType] = DType(1, 1, "char", 'b', 1)
+  uint8: Final[DType] = DType(2, 1, "unsigned char", 'B', 1)
+  int16: Final[DType] = DType(3, 2, "short", 'h', 1)
+  uint16: Final[DType] = DType(4, 2, "unsigned short", 'H', 1)
+  int32: Final[DType] = DType(5, 4, "int", 'i', 1)
+  uint32: Final[DType] = DType(6, 4, "unsigned int", 'I', 1)
+  int64: Final[DType] = DType(7, 8, "long", 'l', 1)
+  uint64: Final[DType] = DType(8, 8, "unsigned long", 'L', 1)
+  float16: Final[DType] = DType(9, 2, "half", 'e', 1)
   # bfloat16 has higher priority than float16, so least_upper_dtype(dtypes.int64, dtypes.uint64) = dtypes.float16
-  bfloat16: Final[DType] = DType(10, 2, "__bf16", None)
-  float32: Final[DType] = DType(11, 4, "float",
-  float64: Final[DType] = DType(12, 8, "double",
+  bfloat16: Final[DType] = DType(10, 2, "__bf16", None, 1)
+  float32: Final[DType] = DType(11, 4, "float", 'f', 1)
+  float64: Final[DType] = DType(12, 8, "double", 'd', 1)

   # dtype aliases
   half = float16; float = float32; double = float64 # noqa: E702
@@ -70,13 +76,17 @@ class dtypes:

   # NOTE: these are image dtypes
   @staticmethod
-  def imageh(shp): return ImageDType(100, 2, "imageh",
+  def imageh(shp): return ImageDType(100, 2, "imageh", 'e', 1, shape=shp, base=dtypes.float32)
   @staticmethod
-  def imagef(shp): return ImageDType(100, 4, "imagef",
+  def imagef(shp): return ImageDType(100, 4, "imagef", 'f', 1, shape=shp, base=dtypes.float32)

   default_float: ClassVar[DType] = float32
   default_int: ClassVar[DType] = int32

+if (env_default_float := getenv("DEFAULT_FLOAT", "")):
+  dtypes.default_float = getattr(dtypes, env_default_float.lower())
+  assert dtypes.is_float(dtypes.default_float), f"{env_default_float} is not a float dtype"
+
 # https://jax.readthedocs.io/en/latest/jep/9407-type-promotion.html
 # we don't support weak type and complex type
 promo_lattice = { dtypes.bool: [dtypes.int8, dtypes.uint8], dtypes.int8: [dtypes.int16], dtypes.int16: [dtypes.int32], dtypes.int32: [dtypes.int64],
@@ -94,4 +104,10 @@ def least_upper_float(dt:DType) -> DType: return dt if dtypes.is_float(dt) else

 # HACK: staticmethods are not callable in 3.8 so we have to compare the class
 DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if not (k.startswith(('__', 'default')) or v.__class__ is staticmethod)}
-INVERSE_DTYPES_DICT = {v:k for k,v in DTYPES_DICT.items()}
+INVERSE_DTYPES_DICT = {v.name:k for k,v in DTYPES_DICT.items()}
+
+def sum_acc_dtype(dt:DType):
+  # default acc dtype for sum
+  if dtypes.is_unsigned(dt): return least_upper_dtype(dt, dtypes.uint)
+  if dtypes.is_int(dt) or dt == dtypes.bool: return least_upper_dtype(dt, dtypes.int)
+  return least_upper_dtype(dt, dtypes.float)
File without changes