tinygrad 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
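The listing above also captures a package reorganization: the JIT, scheduler, and search code moved under tinygrad/engine/, tinygrad/features/multi.py became tinygrad/multi.py, and tinygrad/mlops.py was renamed to tinygrad/function.py. A minimal, hypothetical import-migration sketch, inferred only from these file moves and assuming the class names were not changed by the moves:

    # 0.8.0 module paths            ->  0.9.1 module paths (per the renames above)
    # tinygrad.jit                  ->  tinygrad.engine.jit
    # tinygrad.features.multi       ->  tinygrad.multi
    # tinygrad.features.search      ->  tinygrad.engine.search
    # tinygrad.mlops                ->  tinygrad.function
    from tinygrad.engine.jit import TinyJit      # was tinygrad.jit in 0.8.0
    from tinygrad.multi import MultiLazyBuffer   # was tinygrad.features.multi in 0.8.0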
tinygrad/lazy.py CHANGED
@@ -1,96 +1,127 @@
  from __future__ import annotations
- import sys, math
- import numpy as np
- from collections import defaultdict
- from typing import Union, Optional, Any, Tuple, List, Set, Dict, DefaultDict
- from tinygrad.dtype import dtypes, DType, ImageDType
- from tinygrad.helpers import prod, merge_dicts, flatten, getenv, dedup, DEBUG, all_int, all_same
- from tinygrad.ops import LoadOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, BufferOps, Op, LazyOp, ConstBuffer, MemBuffer, ScheduleItem
+ import math
+ from typing import Union, Optional, Any, Tuple, List
+ from tinygrad.dtype import dtypes, DType, ConstType
+ from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG
+ from tinygrad.ops import LoadOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, Op, exec_alu, python_alu
  from tinygrad.shape.symbolic import sint, Variable
  from tinygrad.shape.shapetracker import ShapeTracker
- from tinygrad.device import Buffer, Device
- from tinygrad.graph import log_lazybuffer
- from weakref import ref, WeakValueDictionary, ReferenceType
+ from tinygrad.device import Buffer
+ from weakref import ref, ReferenceType, WeakValueDictionary

- # lazy can recurse a lot
- sys.setrecursionlimit(10000)
+ lazycache: WeakValueDictionary[Any, LazyBuffer] = WeakValueDictionary()
+ def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
+ base:Optional[LazyBuffer]=None, enable_cache=bool(getenv("LAZYCACHE", 1))):
+ if st.size == 0: op, arg, srcs, base = LoadOps.CONST, 0, (), None
+ if op is LoadOps.CONST: arg, enable_cache = dtypes.as_const(arg, dtype) if not isinstance(arg, Variable) else arg, True

- lazycache: WeakValueDictionary = WeakValueDictionary()
- def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType,
- op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
- base:Optional[LazyBuffer]=None):
- if 0 in st.shape: st, op, arg, srcs = ShapeTracker.from_shape(st.shape), LoadOps.CONST, 0, ()
-
- wop = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs), ref(base) if base else None)
- if wop in lazycache: return lazycache[wop]
+ cache_key = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs)) if base is None else (st, ref(base))
+ if enable_cache and (rret := lazycache.get(cache_key, None)): return rret

  ret = LazyBuffer(device, st, dtype, op, arg, srcs, base=base)
- # TODO: remove LoadOps.CONST here while keeping a pretty graph and working fusions
- # TODO: might be possible to remove LoadOps.COPY
- if op not in {LoadOps.EMPTY, LoadOps.CUSTOM, LoadOps.CONST, LoadOps.COPY} and getenv("LAZYCACHE", 1): lazycache[wop] = ret
+ if enable_cache: lazycache[cache_key] = ret
  return ret

+ view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "DISK"}
  class LazyBuffer:
  def __init__(self, device:str, st:ShapeTracker, dtype:DType,
  op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
  base:Optional[LazyBuffer]=None):
- assert isinstance(device, str) and device == Device.canonicalize(device)
  self.device, self.st, self.dtype, self.shape, self.size = device, st, dtype, st.shape, st.size
+ self._base: Optional[LazyBuffer] = None
  if base is None:
  # properties on base
  self.op, self.arg, self.srcs = op, arg, srcs # this is a LazyOp, except the src is LazyBuffers and not LazyOps
- self.realized: Optional[Buffer] = None
- self.output_buffer: Optional[Buffer] = None
- self.forced_realize = False
+ assert self.op is not LoadOps.ASSIGN or srcs[1].base.realized is not None, "assign target must be realized"
+
+ if self.op is LoadOps.VIEW:
+ # some LazyBuffers can be processed with only a view, no AST required
+ self.buffer: Buffer = srcs[0].base.buffer.view(st.size, dtype, srcs[0].st.views[0].offset * srcs[0].dtype.itemsize)
+ else:
+ self.buffer = srcs[1].base.buffer if self.op is LoadOps.ASSIGN else Buffer(device, self.size, dtype)
+ self.buffer.ref(1)
  self.contiguous_child: Optional[Tuple[ReferenceType[LazyBuffer], ShapeTracker]] = None
+ self.forced_realize = False
  else:
  # properties on view
  assert base.base == base, "base must be a base itself"
  self._base = base

+ def __del__(self):
+ if hasattr(self, 'buffer'): self.buffer.ref(-1)
+
  def __repr__(self) -> str:
- return f"<LB {self.device} {self.shape} contig:{self.st.contiguous} {self.st if hasattr(self, '_base') else (self.op, self.realized)}>"
+ return f"<LB {self.device} {self.shape} {str(self.dtype)[7:]} {self.st if self.base != self else (self.op, self.realized)}>"

  @property
- def base(self) -> LazyBuffer: return self._base if hasattr(self, '_base') else self
+ def realized(self) -> Optional[Buffer]:
+ # NOTE: we check for a lack of srcs instead of an allocated buffer to make unrealized assigns return None here
+ return self.buffer if self._base is None and not hasattr(self, 'srcs') else None
+
+ # NOTE: this has to be a function to prevent self reference
+ @property
+ def base(self) -> LazyBuffer: return self._base if self._base is not None else self
+
+ # same API as multi
+ @property
+ def lbs(self) -> List[LazyBuffer]: return [self]

  @staticmethod
- def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Optional[LazyBuffer]=None) -> LazyBuffer:
- return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, (src,) if src is not None else ())
+ def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Tuple[LazyBuffer, ...]=(), enable_cache=False) -> LazyBuffer:
+ assert isinstance(src, tuple)
+ return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, src, enable_cache=enable_cache)
+
+ def const(self, val:ConstType, shape:Optional[Tuple[sint,...]]=None) -> LazyBuffer:
+ assert isinstance(val, (int,float,bool)), f"{val=} has {type(val)=}, not a ConstType"
+ shape = self.shape if shape is None else shape
+ return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(shape)).expand(shape)
+
+ def is_realized(self) -> bool: return self.base.realized is not None
+
+ def assign(self, x:LazyBuffer) -> LazyBuffer:
+ assert x.size == self.size, f"assign target must have same size {self.size=} != {x.size=}"
+ return LazyBuffer.loadop(LoadOps.ASSIGN, self.shape, self.dtype, self.device, arg=() if self.st.contiguous else (self.st,), src=(x, self.base))

- def const(self, val:Union[float, int]) -> LazyBuffer:
- return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(self.shape)).expand(self.shape)
+ def can_view(self): return self.st.consecutive and not self.is_unrealized_const() and self.device.split(":")[0] in view_supported_devices

- def contiguous(self):
+ def contiguous(self, allow_buffer_view=True):
  if not self.st.contiguous or self.size != self.base.size or self.is_unrealized_const():
- ret = self.e(LoadOps.CONTIGUOUS)
- sti = self.st.invert(self.base.shape)
- if sti is not None: self.base.contiguous_child = ref(ret), sti
+ ret = self.e(LoadOps.VIEW) if allow_buffer_view and self.can_view() else self.e(LoadOps.CONTIGUOUS)
+ if (sti := self.st.invert(self.base.shape)) is not None: self.base.contiguous_child = ref(ret), sti
  return ret
  self.base.forced_realize = True
  return self

  def cast(self, dtype:DType, bitcast:bool=False):
  if self.dtype == dtype: return self
- return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), dtype, UnaryOps.CAST, (dtype, bitcast), (self,))
-
- def is_unrealized_const(self): return not self.base.realized and self.base.op == LoadOps.CONST
- def is_unrealized_contiguous_const(self): return self.base == self and not self.base.realized and self.op == LoadOps.CONST
-
- def schedule(self, seen=None): return create_schedule([self], seen)
-
- @staticmethod
- def fromCPU(x: np.ndarray) -> LazyBuffer:
- ret = LazyBuffer("CPU", ShapeTracker.from_shape(x.shape), dtypes.from_np(x.dtype), op=LoadOps.EMPTY)
- ret.realized = Buffer("CPU", prod(x.shape), dtypes.from_np(x.dtype), x.flatten())
- return ret
-
- def copy_to_device(self, device:str) -> LazyBuffer:
+ if self.device.startswith("DISK") and not bitcast: raise RuntimeError("attempted to cast disk buffer (bitcast only)")
+ if self.is_unrealized_unmasked_const() and not bitcast:
+ return create_lazybuffer(self.device, self.st, dtype, LoadOps.CONST, dtypes.as_const(self.base.arg, dtype))
+ new_shape = self.shape
+ if bitcast and self.dtype.itemsize != dtype.itemsize:
+ if not self.device.startswith("DISK"): raise RuntimeError("shape changing bitcast only supported on DISK right now")
+ if not all_int(new_shape): raise RuntimeError("shape changing bitcast with symbolic shape isn't supported yet")
+ # https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
+ if not (new_shape[-1]*self.dtype.itemsize) % dtype.itemsize == 0: raise RuntimeError("unsupported size in bitcast")
+ new_shape = new_shape[:-1] + ((new_shape[-1]*self.dtype.itemsize) // dtype.itemsize,)
+ elif getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+ # TODO: applying this makes gpt2 slower
+ return self.base.cast(dtype, bitcast)._view(self.st)
+ cast_op: Union[LoadOps, UnaryOps] = (LoadOps.VIEW if self.can_view() else UnaryOps.BITCAST) if bitcast else UnaryOps.CAST
+ return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), dtype, cast_op, dtype, (self,))
+
+ def is_unrealized_const(self): return self.base.realized is None and self.base.op is LoadOps.CONST and not isinstance(self.base.arg, Variable)
+ def is_unrealized_unmasked_const(self): return self.is_unrealized_const() and all(v.mask is None for v in self.st.views)
+
+ def _copy(self, device:str) -> LazyBuffer:
+ return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, self.buffer.nbytes, (self,), enable_cache=False)
+
+ def copy_to_device(self, device:str, force: bool = False) -> LazyBuffer:
  # no COPY
  if self.device == device: return self

  # double COPY = one COPY
- if self.st.contiguous and self.size == self.base.size and not self.base.realized and self.base.op == LoadOps.COPY:
+ if not force and self.st.contiguous and self.size == self.base.size and not self.base.realized and self.base.op is LoadOps.COPY:
  return self.base.srcs[0].copy_to_device(device).reshape(self.st.shape)

  # const doesn't have to be copied (issues with disk tensor)
@@ -98,11 +129,10 @@ class LazyBuffer:
  return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, device, arg=self.base.arg)._view(self.st)

  # if it's a shrink, do the shrink before the copy with CONTIGUOUS
- if prod(self.st.shape) < prod(self.base.st.shape):
- return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, srcs=(self.contiguous(),))
+ if prod(self.st.shape) < prod(self.base.st.shape): return self.contiguous()._copy(device)

  # copy the base and apply the shapetracker on the new device
- return create_lazybuffer(device, self.base.st, self.dtype, LoadOps.COPY, srcs=(self.base,))._view(self.st)
+ return self.base._copy(device)._view(self.st)

  def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
  srcs: List[LazyBuffer] = []
@@ -111,36 +141,74 @@ class LazyBuffer:
  srcs.append(root._view(s.base.contiguous_child[1]))
  else:
  srcs.append(s)
- assert all_same(dts:=[x.dtype.scalar() for x in (srcs if op != TernaryOps.WHERE else srcs[1:])]), f"all dtypes must match {dts} on {op}"
+ assert all_same(dts:=[x.dtype.scalar() for x in (srcs[1:] if op is TernaryOps.WHERE else srcs)]), f"all dtypes must match {dts} on {op}"
  assert all_same([x.shape for x in srcs]), f"all shapes must be the same {[x.shape for x in srcs]}"
- if op == TernaryOps.WHERE: assert srcs[0].dtype == dtypes.bool, "TernaryOps.WHERE must have the first arg be bool"
- out_dtype = srcs[-1].dtype if op not in (BinaryOps.CMPLT, BinaryOps.CMPEQ) else dtypes.bool
- ret = create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, arg, tuple(srcs))
- return ret
+ if op is TernaryOps.WHERE: assert srcs[0].dtype == dtypes.bool, "TernaryOps.WHERE must have the first arg be bool"
+ if op is UnaryOps.NEG: assert srcs[0].dtype != dtypes.bool, "UnaryOps.NEG does not accept dtype bool"
+
+ out_dtype = dtypes.bool if op in (BinaryOps.CMPLT, BinaryOps.CMPNE) else srcs[-1].dtype
+
+ # const folding
+ if op in python_alu and all(s.is_unrealized_unmasked_const() for s in srcs):
+ return self.cast(out_dtype).const(exec_alu(op, out_dtype, [s.base.arg for s in srcs]))
+ if op is UnaryOps.NEG and self.base.op is UnaryOps.NEG and self.base.realized is None: return self.base.srcs[0]
+ if op in BinaryOps:
+ x, y = self, in_srcs[0]
+ if op is BinaryOps.ADD:
+ if y.is_unrealized_unmasked_const() and y.base.arg == 0: return x
+ if x.is_unrealized_unmasked_const() and x.base.arg == 0: return y
+ if op is BinaryOps.MUL:
+ if x.is_unrealized_unmasked_const() and (val := x.base.arg) in (1, 0, -1):
+ return y if val == 1 else y.const(0) if val == 0 else y.e(UnaryOps.NEG)
+ if y.is_unrealized_unmasked_const() and (val := y.base.arg) in (1, 0, -1):
+ return x if val == 1 else x.const(0) if val == 0 else x.e(UnaryOps.NEG)
+
+ return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, arg, tuple(srcs))

  # *** reduce ops ***

- def _reduce_op(self, op:ReduceOps, new_shape:Tuple[sint, ...]) -> LazyBuffer:
- if self.shape == tuple(new_shape): return self
- unbound_new_shape = tuple(s.unbind()[0] if not isinstance(s, int) else s for s in new_shape)
- return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, unbound_new_shape, (self,))
+ def _reduce_op(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+ assert all(0 <= x < len(self.shape) for x in axis), f"axis args {axis} out of range for shape {self.shape}"
+ axis = tuple(sorted([x for x in axis if self.shape[x] != 1]))
+ if len(axis) == 0: return self
+ new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+ return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, axis, (self,))
+
+ def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+ new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+ # TODO: this logic should move to the scheduler
+ if self.size == 0 and 0 not in new_shape: return self.const({ReduceOps.SUM: 0.0, ReduceOps.MAX: -math.inf}[op], new_shape)
+
+ # const folding
+ # TODO: fold this for symbolic?
+ if self.is_unrealized_unmasked_const() and all_int(self.shape):
+ return self.const(self.base.arg * {ReduceOps.SUM: prod(self.shape[i] for i in axis), ReduceOps.MAX: 1}[op], new_shape)

- def r(self, op:ReduceOps, new_shape:Tuple[sint, ...]) -> LazyBuffer:
- assert len(self.shape) == len(new_shape) and all(s == ns or ns == 1 for s,ns in zip(self.shape, new_shape)), \
- f"reduce shape lens must match {self.shape} {new_shape}"
  # TODO: can we split symbolic shape if the reduce axis is not symbolic?
- if not all_int(self.shape) or (0 in self.shape) or prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768):
- return self._reduce_op(op, new_shape)
- heuristic, divisor, dim_to_split = max(((divisor := math.gcd(256, old))/(stride or math.inf), divisor, i) for i, (old, new, stride) in enumerate(zip(self.shape, new_shape, self.st.real_strides())) if old != new) # type: ignore # noqa: E501
- if divisor < 16 or heuristic < 0.1: return self._reduce_op(op, new_shape)
- # choose largest divisor (>=16) to split on, penalize large strides
- def splitted_shape(dim_aft_div):
- return self.shape[:dim_to_split] + (self.shape[dim_to_split]//divisor,) + dim_aft_div + self.shape[dim_to_split+1:]
- return self.reshape(splitted_shape((divisor,)))._reduce_op(op, splitted_shape((1,))).reshape(splitted_shape(()))._reduce_op(op, new_shape)
+ if not getenv("SPLIT_REDUCEOP", 1) or not all_int(self.shape) or (0 in self.shape) or \
+ prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768):
+ return self._reduce_op(op, axis)
+
+ # if there are few globals, make some reduces into globals by splitting into two kernels
+ # cap output buffer to 2**22: heuristic number of global outputs to achieve max occupancy with enough locals+upcasts for gemm
+ # ~2**10 should be enough if GROUP is used
+ # 256 split maximum should be "negligible reduce" for low prod(new_shape), 8 split minimum.
+ # split is moved to the end to provide maximum locality for the second phase reduce.
+ self_real_strides = self.st.real_strides(ignore_valid=True)
+ split_candidates = [(i, x) for i in axis for x in range(min(256,2**getenv("REDUCEOP_SPLIT_SIZE",22)//prod(new_shape)),8-1,-1)
+ if self.shape[i] % x == 0 and self_real_strides[i] != 0]
+ if not split_candidates: return self._reduce_op(op, axis)
+ dim_to_split, divisor = split_candidates[0]
+ splitted_shape = self.shape[:dim_to_split] + (divisor,) + (self.shape[dim_to_split]//divisor,) + self.shape[dim_to_split+1:]
+ splitted = self.reshape(splitted_shape).permute(tuple([x for x in range(len(splitted_shape)) if x != dim_to_split]+[dim_to_split]))
+ if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
+ return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split

  # *** movement ops ***

  def _view(self, new_st:ShapeTracker) -> LazyBuffer:
+ if self.st.size == 0 or (new_st.views[-1].mask is not None and any((x[1]-x[0]) == 0 for x in new_st.views[-1].mask)):
+ return self.const(0, new_st.shape)
  if new_st.contiguous and self.base.shape == new_st.shape: return self.base
  return create_lazybuffer(self.device, new_st, self.dtype, base=self.base)

@@ -150,170 +218,3 @@ class LazyBuffer:
  def permute(self, arg:Tuple[int, ...]): return self._view(self.st.permute(arg))
  def shrink(self, arg:Tuple[Tuple[sint, sint], ...]): return self._view(self.st.shrink(arg))
  def stride(self, arg:Tuple[int, ...]): return self._view(self.st.stride(arg))
-
- # *** schedule creation ***
-
- # recursively create a lazyop
- def _recursive_lazyop(buf:LazyBuffer, inputs:List[LazyBuffer], var_vals:Dict[Variable, int], st:ShapeTracker,
- realizes:Set[LazyBuffer], cache, first=True) -> LazyOp:
- if (buf, st) in cache: return cache[(buf, st)]
- if buf != buf.base:
- st = buf.st + st
- buf = buf.base
- # all buffers here are base now
- assert buf.op is not None
-
- # consts are always fused and generated
- if buf.op == LoadOps.CONST:
- # TODO: make shapetracker unbind also return var_vals
- var_vals.update(merge_dicts([var_vals, st.var_vals]))
- return LazyOp(BufferOps.CONST, (), ConstBuffer(float(buf.arg), buf.dtype, st.simplify().unbind()))
-
- # if we aren't fusing it, it's a load and we add it to the inputs
- if buf.realized or (buf in realizes and not first):
- if buf not in inputs: inputs.append(buf)
- var_vals.update(merge_dicts([var_vals, st.var_vals]))
- return LazyOp(BufferOps.LOAD, (), MemBuffer(inputs.index(buf)+1, buf.dtype, st.simplify().unbind()))
-
- # if a CONTIGUOUS made it all the way here, just skip it
- if buf.op == LoadOps.CONTIGUOUS:
- assert first
- return _recursive_lazyop(buf.srcs[0], inputs, var_vals, st, realizes, cache, False)
-
- # if it's a reduce, we have to change the shapetracker
- if buf.op in ReduceOps:
- assert st.contiguous, "ReduceOps late fusion must be contiguous"
- st = ShapeTracker.from_shape(buf.srcs[0].shape)
-
- # otherwise we fuse it like normal
- cache[(buf, st)] = ret = LazyOp(buf.op, tuple(_recursive_lazyop(x, inputs, var_vals, st, realizes, cache, False) for x in buf.srcs), buf.arg)
- return ret
-
- # recursively walk back in the graph to create the schedule
- def _recursive_schedule(out:LazyBuffer, seen:Set[LazyBuffer], realizes:Set[LazyBuffer],
- reduce_for_op: Dict[LazyBuffer, LazyBuffer]) -> List[ScheduleItem]:
- if out in seen or out.realized or out.op == LoadOps.CONST: return []
- assert out.base == out
- seen.add(out)
-
- inputs: List[LazyBuffer] = []
- var_vals: Dict[Variable, int] = out.st.var_vals.copy()
- if out.op == LoadOps.COPY:
- op, inputs = LazyOp(LoadOps.COPY, (), out.srcs[0].base), [out.srcs[0].base]
- elif out.op == LoadOps.CUSTOM:
- op, inputs = LazyOp(LoadOps.CUSTOM, (), out.arg), list(out.srcs)
- elif out.op == LoadOps.EMPTY:
- op = LazyOp(LoadOps.EMPTY)
- else:
- output_st = ShapeTracker.from_shape(reduce_for_op[out].shape if out in reduce_for_op else out.shape)
- op = _recursive_lazyop(out, inputs, var_vals, output_st, realizes, cache={})
- op = LazyOp(BufferOps.STORE, (op, ), MemBuffer(0, out.dtype, output_st.simplify().unbind()))
-
- return flatten(_recursive_schedule(x.base, seen, realizes, reduce_for_op) for x in inputs) + [ScheduleItem(op, out, tuple(inputs), var_vals)]
-
- # recursively search the entire graph for all LazyBuffers, insert realizes after expands
- def _recurse_lb(buf:LazyBuffer, realizes:Set[LazyBuffer], allbufs:Dict[LazyBuffer, None],
- simple_pads:Set[LazyBuffer], children:DefaultDict[LazyBuffer, Dict[LazyBuffer, None]]):
- if buf in allbufs or buf.base.realized: return
- log_lazybuffer(buf)
- if isinstance(buf.dtype, ImageDType) and (prod(buf.shape) != prod(buf.dtype.shape) or
- not any(buf.shape[x]%4 == 0 for x in buf.st.unit_stride_axes())):
- if DEBUG >= 3: print(f"forcing image {buf.dtype} with shape {buf.shape} to float32")
- buf.dtype = dtypes.float32 # NOTE; this is what makes the dtype above not match
- if buf.base != buf:
- # realize all places where the buffer is expanded
- if prod(buf.base.st.shape) < prod(buf.st.shape):
- if len(buf.st.views) == 1 and buf.st.views[-1].mask and all_int(buf.base.st.shape) and \
- prod(buf.base.st.shape) == prod([y-x for x,y in buf.st.views[-1].mask]):
- simple_pads.add(buf.base)
- else:
- realizes.add(buf.base)
- return _recurse_lb(buf.base, realizes, allbufs, simple_pads, children)
- if buf.forced_realize: realizes.add(buf)
- allbufs[buf] = None
- if buf.op in LoadOps: realizes.add(buf.base)
- if buf.op == LoadOps.COPY:
- assert buf.srcs[0].st.contiguous and buf.srcs[0].size == buf.srcs[0].base.size, "can only copy contig"
- realizes.add(buf.srcs[0].base)
- for x in buf.srcs:
- children[x.base][buf] = None
- _recurse_lb(x, realizes, allbufs, simple_pads, children)
-
- UNSAFE_PAD_OPS = {BinaryOps.DIV, BinaryOps.CMPLT, BinaryOps.CMPEQ, UnaryOps.LOG2, UnaryOps.EXP2}
- def _is_padding_okay(buf:LazyBuffer, realizes:Set[LazyBuffer]) -> bool:
- if buf in realizes or buf.realized: return True
- # NOTE: this broke to_image_idx and coder with JIT
- if buf.op in UNSAFE_PAD_OPS: return False
- return all(_is_padding_okay(x.base, realizes) for x in buf.srcs)
-
- def create_schedule(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffer]]=None) -> List[ScheduleItem]:
- if seen is None: seen = set()
- for out in outs: log_lazybuffer(out, scheduled=True)
-
- # start by just realizing the buffers passed in
- realizes: Set[LazyBuffer] = set([x.base for x in outs if not x.base.realized])
- allbufs: Dict[LazyBuffer, None] = {}
- simple_pads: Set[LazyBuffer] = set()
- children: DefaultDict[LazyBuffer, Dict[LazyBuffer, None]] = defaultdict(dict)
- for out in outs: _recurse_lb(out.base, realizes, allbufs, simple_pads, children)
-
- # check if we have to realize pads
- for p in simple_pads:
- if not _is_padding_okay(p, realizes):
- realizes.add(p)
-
- # find all reduces, and pair them to a elementwise op. if they can't be cleanly paired, force realize the reduce (or a contig child)
- reduce_for_op: Dict[LazyBuffer, LazyBuffer] = {}
- for r in allbufs.keys():
- if r != r.base or r.op not in ReduceOps or r in realizes: continue
-
- # follow the reduce down
- child_set: Dict[LazyBuffer, ShapeTracker] = {r: r.st}
- realized_children: Dict[LazyBuffer, ShapeTracker] = {}
- forced_realize = False
- can_chase = True
- while not forced_realize and len(child_set):
- next_child_set = {}
- for tr,st in child_set.items():
- if tr in realizes:
- realized_children[tr] = st
- # can only have one output buffer
- # can only reduce contiguous
- # max one reduceop per kernel
- if len(realized_children) > 1 or not st.contiguous or st.size != r.st.size or (tr in reduce_for_op and reduce_for_op[tr] != r):
- can_chase = tr not in reduce_for_op or reduce_for_op[tr] == r
- forced_realize = True
- break
- continue
- for tr_next in children[tr].keys():
- if not tr_next.realized:
- # max one reduceop per kernel
- if tr_next.op in ReduceOps:
- forced_realize = True
- break
- st_childs = dedup([s for s in tr_next.srcs if s.base == tr])
- if len(st_childs) > 1:
- forced_realize = True
- break
- next_child_set[tr_next] = st + st_childs[0].st
- child_set = next_child_set
- if forced_realize:
- tr = r
- if can_chase:
- # can chase this down to contiguous children
- st = tr.st
- while len(children[tr]) == 1:
- tr_next = next(iter(children[tr].keys()))
- st_childs = dedup([s for s in tr_next.srcs if s.base == tr])
- if len(st_childs) > 1: break
- if st.size != st_childs[0].st.size: break
- st = st + st_childs[0].st
- if not st.contiguous or tr_next.op in ReduceOps: break
- tr = tr_next
- reduce_for_op[tr] = r
- realizes.add(tr)
- else:
- assert len(realized_children) == 1
- reduce_for_op[next(iter(realized_children.keys()))] = r
-
- return flatten(_recursive_schedule(x.base, seen, realizes, reduce_for_op) for x in outs)
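Besides moving schedule creation out to tinygrad/engine/schedule.py, the biggest behavioral change above is in LazyBuffer.r: it now takes reduce axes instead of a target shape, and the old gcd-based split heuristic is replaced by the SPLIT_REDUCEOP candidate search. A standalone sketch of that candidate search, using illustrative shapes and strides rather than a real LazyBuffer (thresholds mirror the defaults in the diff):

    from math import prod

    def pick_split(shape, axis, real_strides, new_shape, split_size=22, threshold=32768):
      # mirrors the split_candidates comprehension above: small reduces are left alone; otherwise take
      # the first reduced dim with a divisor in [8, 256] and a nonzero stride, preferring larger divisors
      if prod(shape) // prod(new_shape) < threshold: return None
      candidates = [(i, x) for i in axis for x in range(min(256, 2**split_size // prod(new_shape)), 8-1, -1)
                    if shape[i] % x == 0 and real_strides[i] != 0]
      return candidates[0] if candidates else None  # (dim_to_split, divisor)

    # summing a (8, 1048576) buffer over axis 1: the reduce is split by 256, i.e. a first-phase
    # reduce over the (8, 4096, 256) view followed by a second reduce over the 256 split chunks.
    print(pick_split((8, 1048576), (1,), (1048576, 1), (8, 1)))   # -> (1, 256)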
tinygrad/multi.py ADDED
@@ -0,0 +1,173 @@
+ from __future__ import annotations
+ from typing import Optional, Union, Any, Tuple, List
+ import functools, itertools, operator
+ from tinygrad.helpers import all_same, all_int, dedup, round_up, prod, DEBUG, RING
+ from tinygrad.dtype import DType, ConstType
+ from tinygrad.ops import BinaryOps, LoadOps, UnaryOps, TernaryOps, ReduceOps
+ from tinygrad.lazy import LazyBuffer
+ from tinygrad.shape.shapetracker import sint
+
+ def all_reduce(op: ReduceOps, lbs: List[LazyBuffer]) -> List[LazyBuffer]:
+ assert all_int(lbs[0].shape), f"does not support symbolic shape {lbs[0].shape}"
+ assert all_same([lb.shape[0] for lb in lbs]), "allreduce with uneven shards is undefined"
+ bop = {ReduceOps.SUM:BinaryOps.ADD, ReduceOps.MAX:BinaryOps.MAX}[op]
+
+ n_lbs, dim = len(lbs), prod(lbs[0].shape)
+ # Ring allreduce doesn't provide a benefit with only 2 nodes or where number of elements is less than 256k (empirically)
+ # so just fallback to naive allreduce to save on kernel dispatch, chunking and reassembling chunks.
+ use_ring = (RING >= 2 or (n_lbs > 2 and dim > 256_000 and RING >= 1))
+ if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{dim} | {lbs[0].dtype}")
+ if not use_ring:
+ return [functools.reduce(lambda x,y: x.e(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs]
+ factor = max(f for f in [32, 16, 8, 4, 2, 1] if dim % f == 0)
+ base, left = (dim // factor) // n_lbs, (dim // factor) % n_lbs
+ c_lens = [(base + 1) * factor if i < left else base * factor for i in range(n_lbs)]
+ acc = 0
+ chunks = [(acc, (acc := acc + i)) for i in c_lens if i > 0]
+ chunked = [[lb.reshape((dim,)).shrink(((s,e),)) for s,e in chunks] for lb in lbs]
+
+ # Scatter-reduce step
+ for step in range(n_lbs - 1):
+ for i in range(len(chunks)):
+ s, r = (i+step)%n_lbs, (i+step+1)%n_lbs
+ chunked[r][i] = chunked[r][i].e(bop, chunked[s][i].copy_to_device(chunked[r][i].device, force=True))
+
+ # Allgather step
+ for step in range(n_lbs - 1):
+ for i in range(len(chunks)):
+ s, r = (i+step-1)%n_lbs, (i+step)%n_lbs
+ chunked[r][i] = chunked[s][i].copy_to_device(chunked[r][i].device, force=True)
+
+ # Assemble chunks back
+ pads = [((s,dim-e),) for s,e in chunks]
+ return [functools.reduce(lambda x,y: x.e(BinaryOps.ADD, y), [c.pad(pads[i]) for i,c in enumerate(lb_c)]).reshape(lbs[0].shape) for lb_c in chunked]
+
+ def to_sharded(lbs:List[LazyBuffer], axis:int) -> List[LazyBuffer]:
+ if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}")
+ sz = round_up(lbs[0].shape[axis], len(lbs)) // len(lbs)
+ return [lb.shrink(tuple((0,s) if a != axis else (min(s,sz*i),min(s,sz*(i+1))) for a,s in enumerate(lb.shape))) for i,lb in enumerate(lbs)]
+
+ class MultiLazyBuffer:
+ def __init__(self, lbs:List[LazyBuffer], axis:Optional[int], real:Optional[List[bool]]=None):
+ assert all(isinstance(x, LazyBuffer) for x in lbs) and len(lbs), "all lbs must be LazyBuffers, and we need at least one of them"
+ assert all_same([x.dtype for x in lbs]), f"all multilazybuffer needs same dtype, getting {[x.dtype for x in lbs]}"
+ self.lbs, self.axis, self.dtype, self.device, self.real = lbs, axis, lbs[0].dtype, tuple(x.device for x in lbs), real or [True]*len(lbs)
+ if axis is not None:
+ splits = list(itertools.accumulate([lb.shape[axis] for lb in lbs], initial=0))
+ self.bounds = list(zip(splits, splits[1:]))
+
+ @property
+ def shape(self):
+ return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
+
+ @property
+ def size(self): return sum(x.size for x in self.real_lbs)
+
+ @property
+ def real_lbs(self): return [lb for lb,r in zip(self.lbs, self.real) if r]
+
+ def __repr__(self):
+ return f"<MLB {self.axis=} {self.real=} {chr(10)}{chr(10).join([f'{x.device} {x.st}' for x in self.lbs])}>"
+
+ @staticmethod
+ def from_sharded(lb:LazyBuffer, devices:Tuple[str, ...], axis:Optional[int]=None):
+ lbs = [lb.contiguous() if lb.base != lb and not lb.is_unrealized_unmasked_const() else lb] * len(devices)
+ sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis) if axis is not None else lbs, devices)]
+ return MultiLazyBuffer([lb if lb.is_unrealized_unmasked_const() else lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
+
+ def copy_to_device(self, device:str) -> LazyBuffer:
+ if self.axis is None:
+ # if we already have a copy on the device, return that
+ for lb in self.real_lbs:
+ if lb.device == device: return lb
+ return self.lbs[self.real.index(True)].copy_to_device(device)
+ llbs:List[LazyBuffer] = []
+ for lb,real,(start,end) in zip(self.lbs, self.real, self.bounds):
+ if not real: continue
+ pad_arg = tuple((0,0) if a != self.axis else (start, self.bounds[-1][1]-end) for a in range(len(lb.shape)))
+ llbs.append(lb.copy_to_device(device).pad(pad_arg))
+ return functools.reduce(lambda x,y: x.e(BinaryOps.ADD, y), llbs)
+
+ # passthroughs
+ def is_realized(self) -> bool: return all(lb.base.realized is not None for lb, r in zip(self.lbs, self.real) if r is True)
+ def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real)
+ def const(self, val:ConstType) -> MultiLazyBuffer: return MultiLazyBuffer([x.const(val) for x in self.lbs], self.axis, self.real)
+ def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
+ def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
+
+ # elementwise is simple
+ def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:MultiLazyBuffer, arg:Optional[Any]=None) -> MultiLazyBuffer:
+ msrcs = (self,)+in_srcs
+ assert all(isinstance(x, MultiLazyBuffer) for x in msrcs), f"all buffers must be MultiLazyBuffer {msrcs}"
+ assert all_same([x.device for x in msrcs]), f"all buffers must have the same device {[x.device for x in msrcs]}"
+
+ # NOTE: they all have to share an axis, we always choose [-1]
+ axis = axes[-1] if len(axes := dedup([x.axis for x in msrcs if x.axis is not None])) else None
+ srcs = []
+ not_all_real = any(not all(mlb.real) for mlb in msrcs)
+ new_real = [all(transposed) for transposed in zip(*[mlb.real for mlb in msrcs])] if not_all_real else self.real
+ assert any(new_real), "output contains no real lb"
+ for mlb in msrcs:
+ if mlb.axis == axis or not_all_real: srcs.append(mlb.lbs)
+ elif mlb.axis is None and axis is not None: srcs.append(to_sharded(mlb.lbs, axis))
+ else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis))
+ # NOTE: lsrcs[-1].const(0) is correct for where
+ return MultiLazyBuffer([lsrcs[0].e(op, *lsrcs[1:], arg=arg) if r else lsrcs[-1].const(0) for lsrcs,r in zip(zip(*srcs),new_real)], axis, new_real)
+
+ def _shape_to_single_shard(self, shape:Tuple[sint, ...], lb:LazyBuffer) -> Tuple[sint, ...]:
+ return tuple(lb.shape[self.axis] if a == self.axis else s for a,s in enumerate(shape))
+
+ def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> MultiLazyBuffer:
+ if self.axis is not None and self.axis in axis:
+ # all-reduce on sharded axes
+ reduced_parts = [(x if r else x.const(0)).r(op, axis) for x,r in zip(self.lbs, self.real)]
+ if all(self.real): return MultiLazyBuffer(all_reduce(op, reduced_parts), None)
+ return MultiLazyBuffer(reduced_parts, None, self.real)
+ # reduce on non sharded axes, piecewise is fine. if axis is None this is also correct
+ return MultiLazyBuffer([x.r(op, axis) for x in self.lbs], self.axis, self.real)
+
+ # *** movement ops ***
+
+ def reshape(self, arg:Tuple[sint, ...]):
+ if self.axis is None: return MultiLazyBuffer([x.reshape(arg) for x in self.lbs], None, self.real)
+ arg_acc:List[sint] = list(itertools.accumulate(arg, operator.mul, initial=1))
+ # new_axis is the last one that preserves prod(prior to new_axis) and must not move items between shards
+ # todo: what to do about shrinking to self.shape[self.axis]==1 len(self.real_lbs)==1?
+ new_axis = len(arg_acc) - arg_acc[::-1].index(prod(self.shape[:self.axis])) - 1
+ if arg[new_axis] != self.shape[self.axis]:
+ assert self.shape[self.axis] % len(self.real_lbs) == 0, f"cannot reshape on-axis for uneven shard {self.axis} {self.shape} {len(self.real_lbs)}"
+ assert arg[new_axis] % len(self.real_lbs) == 0, f"new on-axis shape must divide evenly between devices {new_axis} {arg} {len(self.real_lbs)}"
+ return MultiLazyBuffer([x.reshape(tuple(s if a != new_axis else
+ x.shape[self.axis] if s == self.shape[self.axis] else
+ s // len(self.real_lbs) for a,s in enumerate(arg))) for x in self.lbs],
+ new_axis, self.real)
+
+ def pad(self, arg:Tuple[Tuple[sint, sint], ...]):
+ assert self.axis is None or arg[self.axis] == (0,0) or not all(self.real), f"padding not supported for {arg=}"
+ # pad on shard axis -> fill others with zeros and set real to all True
+ if self.axis is not None and arg[self.axis] != (0,0):
+ # pad back to whole axis, remove real mask
+ assert all(arg[i] == (0, 0) or i == self.axis for i in range(len(self.shape))), "cannot pad sharded and non-sharded axis at the same time"
+ assert arg[self.axis] == (sum(lb.shape[self.axis] for i,lb in enumerate(self.lbs) if i < self.real.index(True)), \
+ sum(lb.shape[self.axis] for i,lb in enumerate(self.lbs) if i > self.real.index(True))), "can only pad to whole axis"
+ return MultiLazyBuffer([x if r else x.const(0) for x,r in zip(self.lbs, self.real)], self.axis)
+ return MultiLazyBuffer([x.pad(arg) for x in self.lbs], self.axis, self.real)
+ def expand(self, arg:Tuple[sint, ...]):
+ # NOTE: this assert isn't needed, sharded axis can have dim 1
+ assert self.axis is None or arg[self.axis] == self.shape[self.axis], f"expand not supported on sharded axis {arg=}"
+ return MultiLazyBuffer([x.expand(self._shape_to_single_shard(arg, x)) for x in self.lbs], self.axis, self.real)
+ def permute(self, arg:Tuple[int, ...]):
+ # all permutes supported!
+ return MultiLazyBuffer([x.permute(arg) for x in self.lbs], arg.index(self.axis) if self.axis is not None else None, self.real)
+ def shrink(self, arg:Tuple[Tuple[sint, sint], ...]):
+ assert self.axis is None or arg[self.axis] == (0, self.shape[self.axis]) or arg[self.axis] in self.bounds, f"shrinking not supported for {arg=}"
+ if self.axis is not None and arg[self.axis] in self.bounds and arg[self.axis] != (0, self.shape[self.axis]):
+ assert all(arg[i] == (0, s) or i == self.axis for i,s in enumerate(self.shape)), "cannot shrink sharded and non-sharded axis at the same time"
+ idx = self.bounds.index(arg[self.axis])
+ # zero out other lbs to not create lb reference
+ return MultiLazyBuffer([lb if i==idx else lb.const(0) for i,lb in enumerate(self.lbs)], self.axis, [i==idx for i in range(len(self.lbs))])
+ return MultiLazyBuffer([x.shrink(tuple((0, x.shape[self.axis]) if a == self.axis else s for a,s in enumerate(arg))) for x in self.lbs],
+ self.axis, self.real)
+ def stride(self, arg:Tuple[int, ...]):
+ assert self.axis is None or arg[self.axis] == 1, "flipping not supported on sharded axis"
+ return MultiLazyBuffer([x.stride(arg) for x in self.lbs], self.axis, self.real)
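For reference, the ring all_reduce above first carves the flattened tensor into one chunk per shard, sized so every chunk is a multiple of a small power-of-two factor. A standalone sketch of just that chunking math with illustrative sizes (4 shards, 1000 elements); it mirrors the factor/base/left computation in the diff and is not tied to real LazyBuffers:

    def ring_chunks(dim: int, n_lbs: int):
      # largest f in [32, 16, 8, 4, 2, 1] dividing dim, so chunk boundaries stay aligned
      factor = max(f for f in [32, 16, 8, 4, 2, 1] if dim % f == 0)
      base, left = (dim // factor) // n_lbs, (dim // factor) % n_lbs
      c_lens = [(base + 1) * factor if i < left else base * factor for i in range(n_lbs)]
      acc, chunks = 0, []
      for ln in c_lens:
        if ln > 0: chunks.append((acc, acc + ln)); acc += ln
      return chunks

    print(ring_chunks(1000, 4))   # -> [(0, 256), (256, 504), (504, 752), (752, 1000)]

Each shard then accumulates one chunk during the scatter-reduce loop and rebroadcasts it during the allgather loop, which is what the two step loops in all_reduce implement.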