tinygrad-0.7.0-py3-none-any.whl → tinygrad-0.9.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- tinygrad/__init__.py +6 -0
- tinygrad/codegen/kernel.py +572 -83
- tinygrad/codegen/linearizer.py +415 -395
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +183 -0
- tinygrad/dtype.py +113 -0
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +76 -55
- tinygrad/helpers.py +196 -89
- tinygrad/lazy.py +210 -371
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +202 -22
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +112 -32
- tinygrad/nn/state.py +136 -39
- tinygrad/ops.py +119 -202
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +353 -166
- tinygrad/renderer/llvmir.py +150 -138
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +81 -0
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +75 -0
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +24 -77
- tinygrad/runtime/ops_cuda.py +175 -89
- tinygrad/runtime/ops_disk.py +56 -33
- tinygrad/runtime/ops_gpu.py +92 -95
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +39 -60
- tinygrad/runtime/ops_metal.py +92 -74
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +86 -254
- tinygrad/shape/symbolic.py +166 -141
- tinygrad/shape/view.py +296 -0
- tinygrad/tensor.py +2619 -448
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- tinygrad-0.9.0.dist-info/METADATA +227 -0
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/assembly.py +0 -190
- tinygrad/codegen/optimizer.py +0 -379
- tinygrad/codegen/search.py +0 -72
- tinygrad/graph.py +0 -83
- tinygrad/jit.py +0 -57
- tinygrad/nn/image.py +0 -100
- tinygrad/renderer/assembly_arm64.py +0 -169
- tinygrad/renderer/assembly_ptx.py +0 -98
- tinygrad/renderer/wgsl.py +0 -53
- tinygrad/runtime/lib.py +0 -113
- tinygrad/runtime/ops_cpu.py +0 -51
- tinygrad/runtime/ops_hip.py +0 -82
- tinygrad/runtime/ops_shm.py +0 -29
- tinygrad/runtime/ops_torch.py +0 -30
- tinygrad/runtime/ops_webgpu.py +0 -45
- tinygrad-0.7.0.dist-info/METADATA +0 -212
- tinygrad-0.7.0.dist-info/RECORD +0 -40
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
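The listing above already shows the shape of the 0.9.0 reorganization: dtypes are split out of helpers.py into dtype.py, the buffer/device plumbing that used to live in runtime/lib.py becomes device.py, mlops.py is renamed function.py, and scheduling, realization, the JIT, and kernel search move under tinygrad/engine/. As a rough orientation only, here is a minimal sketch of user code against the 0.9.0 layout; the exact top-level re-exports are assumed from this file list rather than verified against the wheel:

# hedged sketch of the 0.9.0 import layout implied by the file list above
from tinygrad import Tensor, dtypes         # dtypes are now defined in tinygrad/dtype.py
from tinygrad.device import Device, Buffer  # device/buffer plumbing that roughly replaces runtime/lib.py
from tinygrad.engine.jit import TinyJit     # the JIT moved from tinygrad/jit.py into tinygrad/engine/

x = Tensor.randn(8, 8, dtype=dtypes.float32)
y = (x @ x.T).relu().sum()
print(Device.DEFAULT, y.item())             # scheduling and realization now run through tinygrad/engine/*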
tinygrad/lazy.py
CHANGED
@@ -1,381 +1,220 @@
 from __future__ import annotations
-P2P = getenv("P2P", 0)
-
-# TODO: movement ops that only change shape are really nops. treat them as such
-REMOVE_MOVEMENT_NOPS, MERGE_ELEMENTWISE_INTO_REDUCE, SHUFFLE_MOVEMENT_OPS, MERGE_ELEMENTWISE_OPS = OPT>=1, OPT>=1, OPT>=1, OPT>=1
-MERGE_ONE_REDUCE_INTO_ELEMENTWISE, SHUFFLE_PAD_OPS, SIMPLIFY_SUM_RESHAPE_EXPAND_SUM = OPT>=2, OPT>=2, OPT>=2 # shuffle pad ops is fine now since we only push to merge binops
-PUSH_PERMUTES, PUSH_CONTIGUOUS = OPT>=3, OPT>=3
-
-def _simplify_sum_reshape_expand_sum(self:LazyBuffer, src: Any, prev_src: Any) -> Optional[LazyOp]:
-  if prev_src.op.op == MovementOps.EXPAND:
-    if src.op.op == ReduceOps.SUM:
-      if src.shape == self.shape:
-        dim_difference = [i for i, (a, b) in enumerate(zip(prev_src.shape, self.shape)) if a != b]
-        # NOTE: we can probably also handle the case where more than one dimension is different with more thought
-        if len(dim_difference) == 1:
-          expansion_index = dim_difference[0]
-          expansion_size = prev_src.shape[expansion_index]
-          return LazyOp(BinaryOps.MUL, (src, LazyBuffer.const(src, expansion_size)))
-  return None
-
-# **** realize functions ****
-def _ast_reduceops(self:LazyBuffer) -> LazyOp:
-  # TODO: this can also corealize a binary op after the reduce, not just before
-  # NOTE: mypy doesn't know that if not src.realized, then src.op must be a LazyOp so we have to ignore a bunch of warnings
-  src = self.op.src[0]
-  if not src.realized:
-    # When a tensor is reduced, reshaped/expanded back and then reduced again along the same axis,
-    # it's equivalent to performing the initial reduction and multiplying the result
-    # by the size of the expanded dimension.
-    if SIMPLIFY_SUM_RESHAPE_EXPAND_SUM and src.op.op == MovementOps.EXPAND: # type: ignore
-      expanded = src.op.src[0] # type: ignore
-      if expanded.op.op == MovementOps.RESHAPE: # type: ignore
-        reshaped = expanded.op.src[0] # type: ignore
-        simplified = _simplify_sum_reshape_expand_sum(self, reshaped, src)
-      else:
-        simplified = _simplify_sum_reshape_expand_sum(self, expanded, src)
-      if simplified: return simplified
-    if MERGE_ELEMENTWISE_INTO_REDUCE and src.optype is BinaryOps and len(src.children) <= 1:
-      # If we did remove an expand above, we might stumble back into a case where the reduction is not necessary
-      if src.shape == self.shape:
-        return src.op # type: ignore
-      src = src.op # type: ignore
-  return LazyOp(self.op.op, (src,), self.op.arg)
-
-# this supports late merging an upstream Reduce op and even an Elementwise op above that
-def _ast_binaryops(self:LazyBuffer) -> LazyOp:
-  real_srcs: Dict[LazyBuffer, Union[None, LazyOp, LazyBuffer]] = {x:None for x in self.op.buffers}
-  # NOTE: contiguous does not always mean the same size with SHRINK. this is still mergeable but requires more thought how
-  # TODO: this can also support late fusion of BinaryOps, required for test_fold_conv_sgd
-  psrcs: List[Tuple[LazyBuffer, LazyBuffer]] = [(k,x) for k,x in zip(real_srcs.keys(), map(get_movementroot_contiguous, real_srcs.keys())) if x.optype == ReduceOps and not x.realized and prod(k.shape) == prod(x.shape) and len(x.children) <= 1 and len(k.children) <= 1]
-  intermediate_shape: Tuple[int, ...] = self.shape
-  if MERGE_ONE_REDUCE_INTO_ELEMENTWISE and psrcs:
-    psrc = psrcs[0] # NOTE: right now we can't handle multiple, as we'd have to check for loop
-    if psrc[1].optype == ReduceOps:
-      top = _ast_reduceops(psrc[1])
-    real_srcs[psrc[0]] = top
-    real_srcs.update({x:x for x in top.buffers}) # the reduce op buffers are not modified
-
-    # if the ReduceOp is followed by a reshape, we push this reshape before all the ElementwiseOp inputs
-    if psrc[0].shape != psrc[1].shape:
-      intermediate_shape = psrc[1].shape
-      assert psrc[0].shape == self.shape, f"shape mismatch {psrc[0].shape} != {self.shape}"
-
-  # reshape all the late ops into the output shape
-  # NOTE: these RESHAPEs will return self if they don't change the shape
-  for x in real_srcs.keys():
-    if not real_srcs[x]: real_srcs[x] = x.reshape(intermediate_shape)
-  ast = self.op.map_buffers(real_srcs)
-  return LazyOp(MovementOps.RESHAPE, (ast, ), self.shape) if intermediate_shape != self.shape else ast
-
-# **** lazy operations ****
-
-def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 else root
-def get_movementroot(root:LazyBuffer, allow_contiguous=False) -> LazyBuffer: return get_movementroot(cast(LazyBuffer, root.op.src[0]), allow_contiguous) if not root.realized and (root.optype == MovementOps or (root.op.op == LoadOps.CONTIGUOUS and allow_contiguous and root.op.src[0].st.contiguous)) else root
-def get_movementroot_contiguous(x:LazyBuffer) -> LazyBuffer: return get_movementroot_contiguous(cast(LazyBuffer, x.op.src[0])) if not x.realized and x.op.op == LoadOps.CONTIGUOUS else (get_movementroot(x, True) if x.optype == MovementOps and x.st.contiguous else x)
-
-lazycache: WeakValueDictionary = WeakValueDictionary()
-def create_lazybuffer(device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
-  # fromcpu aren't cached
-  if not LAZYCACHE or (optype is LoadOps and op.op in {LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST}): return LazyBuffer(device, st, optype, op, dtype)
-
-  # wop is the deduping key. i feel this used to compare more deeply
-  wop = (device, dtype, optype, ref(op))
-  if wop in lazycache:
-    for x in op.buffers: x.children.add(lazycache[wop])
-    return lazycache[wop]
-
-  lazycache[wop] = ret = LazyBuffer(device, st, optype, op, dtype)
+import math
+from typing import Union, Optional, Any, Tuple, List
+from tinygrad.dtype import dtypes, DType, ConstType
+from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG
+from tinygrad.ops import LoadOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, Op, exec_alu, python_alu
+from tinygrad.shape.symbolic import sint, Variable
+from tinygrad.shape.shapetracker import ShapeTracker
+from tinygrad.device import Buffer
+from weakref import ref, ReferenceType, WeakValueDictionary
+
+lazycache: WeakValueDictionary[Any, LazyBuffer] = WeakValueDictionary()
+def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
+                      base:Optional[LazyBuffer]=None, enable_cache=bool(getenv("LAZYCACHE", 1))):
+  if st.size == 0: op, arg, srcs, base = LoadOps.CONST, 0, (), None
+  if op is LoadOps.CONST: arg, enable_cache = dtypes.as_const(arg, dtype) if not isinstance(arg, Variable) else arg, True
+
+  cache_key = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs)) if base is None else (st, ref(base))
+  if enable_cache and (rret := lazycache.get(cache_key, None)): return rret
+
+  ret = LazyBuffer(device, st, dtype, op, arg, srcs, base=base)
+  if enable_cache: lazycache[cache_key] = ret
   return ret
 
+view_supported_devices = {"LLVM", "CLANG", "CUDA", "DISK"}
 class LazyBuffer:
-  def realize(self:LazyBuffer) -> LazyBuffer:
-    if not self.realized:
-      # get real ops first
-      if self.optype is BinaryOps: self.op = _ast_binaryops(self)
-      elif self.optype is ReduceOps:
-        self.op = _ast_reduceops(self)
-        if self.op.op in BinaryOps: self.op = _ast_binaryops(self)
-      elif self.optype is LoadOps: LOAD_OPS_DISPATCHER[cast(LoadOps, self.op.op)](self)
-      # run the ast if we still have to, and log the op
-      if not self.realized:
-        for x in self.op.buffers: x.realize()
-
-        # HACK: image shape can be wrong, hot cast it back to a normal float
-        if isinstance(self.dtype, ImageDType) and self.optype != MovementOps and (prod(self.shape) != prod(self.dtype.shape) or not any(self.shape[x]%4 == 0 for x in self.st.unit_stride_axes())):
-          if self.op.op == MovementOps.RESHAPE:
-            # put CAST before the final RESHAPE
-            self.op = LazyOp(MovementOps.RESHAPE, (LazyOp(UnaryOps.CAST, self.op.src, (dtypes.float32, False)),), self.op.arg)
-          else:
-            self.op = LazyOp(UnaryOps.CAST, (self.op,), (dtypes.float32, False))
-          self.dtype = dtypes.float32
-        self.realized = Device[self.device].exec_ast(self.op, output=self, **self._device_extra_args())
-
-      assert self.realized and isinstance(self.realized, (RawConst, Device[self.device].buffer)), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
-      # HACK: allow hot casting of images
-      assert self.realized.dtype == self.dtype or self.dtype.__class__ is ImageDType, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}"
-      self.dtype = self.realized.dtype
-
-      # log to the graph
-      if (DEBUG or GRAPH) and (self.realized.__class__ is not RawConst or GRAPH >= 2):
-        log_op(self, self.op)
-
-      # no need to keep the op after realization
-      del self.op
-    return self
+  def __init__(self, device:str, st:ShapeTracker, dtype:DType,
+               op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
+               base:Optional[LazyBuffer]=None):
+    self.device, self.st, self.dtype, self.shape, self.size = device, st, dtype, st.shape, st.size
+    self._base: Optional[LazyBuffer] = None
+    if base is None:
+      # properties on base
+      self.op, self.arg, self.srcs = op, arg, srcs # this is a LazyOp, except the src is LazyBuffers and not LazyOps
+      assert self.op is not LoadOps.ASSIGN or srcs[1].base.realized is not None, "assign target must be realized"
+
+      if (self.op is LoadOps.CONTIGUOUS or self.op is UnaryOps.BITCAST) and srcs[0].st.consecutive and \
+        not srcs[0].is_unrealized_const() and device.split(":")[0] in view_supported_devices:
+        # some LazyBuffers can be processed with only a view, no AST required
+        self.buffer: Buffer = srcs[0].base.buffer.view(st.size, dtype, srcs[0].st.views[0].offset * srcs[0].dtype.itemsize)
+        self.op = LoadOps.VIEW
+      else:
+        self.buffer = srcs[1].base.buffer if self.op is LoadOps.ASSIGN else Buffer(device, self.size, dtype)
+      self.buffer.ref(1)
+      self.contiguous_child: Optional[Tuple[ReferenceType[LazyBuffer], ShapeTracker]] = None
+      self.forced_realize = False
+    else:
+      # properties on view
+      assert base.base == base, "base must be a base itself"
+      self._base = base
 
+  def __del__(self):
+    if hasattr(self, 'buffer'): self.buffer.ref(-1)
+
+  def __repr__(self) -> str:
+    return f"<LB {self.device} {self.shape} {str(self.dtype)[7:]} {self.st if self.base != self else (self.op, self.realized)}>"
 
+  @property
+  def realized(self) -> Optional[Buffer]:
+    # NOTE: we check for a lack of srcs instead of an allocated buffer to make unrealized assigns return None here
+    return self.buffer if self._base is None and not hasattr(self, 'srcs') else None
+
+  # NOTE: this has to be a function to prevent self reference
+  @property
+  def base(self) -> LazyBuffer: return self._base if self._base is not None else self
 
+  # same API as multi
+  @property
+  def lbs(self) -> List[LazyBuffer]: return [self]
 
   @staticmethod
-    new_srcs: List[LazyBuffer] = []
-    for x in srcs:
-      if not x.realized and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1:
-        x.op.src[0].children.discard(x)
-        new_srcs.append(cast(LazyBuffer, x.op.src[0]))
-      else:
-        new_srcs.append(x)
-    return new_srcs[0].e(op, *new_srcs[1:], arg=arg).contiguous()
-
-    if MERGE_ELEMENTWISE_OPS:
-      # remove the buffers from any (childless) BinaryOps that feed into this
-      srcs = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs]) # type: ignore
-
-    return create_lazybuffer(out_device, ShapeTracker(out_shape), BinaryOps, LazyOp(op, srcs, arg), out_dtype)
-
-  def shuffle_and_prune_movement_ops(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[Union[Node,int], ...], Tuple[Tuple[int, int], ...]]) -> LazyBuffer:
-    if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and not self.children:
-      return self.op.replace_with_movement_ops([(op, arg)])
-    ret = create_lazybuffer(self.device, st, MovementOps, LazyOp(op, (self,), arg), self.dtype)
-    if REMOVE_MOVEMENT_NOPS and not self.realized and not ret.realized and ret.st.contiguous:
-      # MovementOps aren't stacked any more, they each have one parent, find the root
-      root = get_movementroot(self)
-      if root.st.contiguous and root != self and prod(ret.st.shape) == prod(root.shape):
-        return root.reshape(ret.st.shape)
-    return ret
-
-  def _reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
-    if self.shape == tuple(new_shape): return self
-    srcs = _push_movement_ops((self,)) if SHUFFLE_MOVEMENT_OPS else (self,)
-    return create_lazybuffer(self.device, ShapeTracker(new_shape), ReduceOps, LazyOp(op, srcs, new_shape), self.dtype)
-
-  def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
-    if any(not isinstance(s, int) for s in self.shape) or prod(self.shape) // prod(new_shape) < 32768: return self._reduce_op(op, new_shape) # The amount of work should be big enough to take the benefit of "2 kernels" approach.
-    heuristic, divisor, dim_to_split = max(((divisor := math.gcd(256, old))/(stride or math.inf), divisor, i) for i, (old, new, stride) in enumerate(zip(self.shape, new_shape, self.st.real_strides())) if old != new) # type: ignore
-    if divisor < 16 or heuristic < 0.125: return self._reduce_op(op, new_shape) # Choose largest divisor (>=16) to split on, penalize large strides.
-    def splitted_shape(dim_aft_div): return self.shape[:dim_to_split] + (self.shape[dim_to_split]//divisor,) + dim_aft_div + self.shape[dim_to_split+1:]
-    return self.reshape(splitted_shape((divisor,)))._reduce_op(op, splitted_shape((1,))).reshape(splitted_shape(()))._reduce_op(op, new_shape)
-
-  def reshape(self:LazyBuffer, arg:Tuple[Union[Node, int], ...]) -> LazyBuffer:
-    if self.shape == arg: return self
-    if not self.realized and self.op.op == MovementOps.RESHAPE:
-      self.op.src[0].children.discard(self) # NOTE: this is only required in reshape and when pushing permutes, why??
-      return self.op.src[0].reshape(arg)
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).reshape(arg), MovementOps.RESHAPE, arg)
-
-  def pad(self:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
-    if all(b == 0 and e == 0 for b,e in arg): return self
-    if not self.realized and self.op.op == MovementOps.PAD: return self.op.src[0].pad(tuple([(b1+b2, e1+e2) for (b1,e1),(b2,e2) in zip(self.op.arg, arg)]))
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).pad(arg), MovementOps.PAD, arg)
-
-  def expand(self: LazyBuffer, arg:Tuple[Union[Node,int], ...]) -> LazyBuffer:
-    if self.shape == arg: return self
-    if not self.realized and self.op.op == MovementOps.EXPAND:
-      return self.op.src[0].expand(arg)
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).expand(arg), MovementOps.EXPAND, arg)
-
-  def permute(self: LazyBuffer, arg:Tuple[int, ...]) -> LazyBuffer:
-    if arg == tuple(range(len(self.shape))): return self
-    if not self.realized and self.op.op == MovementOps.PERMUTE: return self.op.src[0].permute(tuple([self.op.arg[i] for i in arg]))
-    if not self.realized:
-      if PUSH_PERMUTES and self.optype == ReduceOps:
-        # reduceops have one buffer input, permute it
-        narg = tuple([self.op.arg[arg[i]] for i in range(len(arg))])
-        src, rop = self.op.src[0], self.op.op
-        src.children.discard(self)
-        del self # TODO: why doesn't this delete remove it from the children
-        return src.permute(arg).reduce_op(cast(ReduceOps, rop), narg)
-
-      # move permutes before expands (always, this is safe)
-      if self.op.op == MovementOps.EXPAND:
-        return self.op.src[0].permute(arg).expand(tuple([self.op.arg[a] for a in arg]))
-
-      # move permutes before reshapes if we can
-      if PUSH_PERMUTES and self.op.op == MovementOps.RESHAPE and self.op.src[0].__class__ is LazyBuffer:
-        if shape_idx_groups := get_contraction(self.op.src[0].shape, self.shape):
-          self.op.src[0].children.discard(self) # NOTE: this is only required in reshape and when pushing permutes, why??
-          return self.op.src[0].permute(tuple(flatten(shape_idx_groups[i] for i in arg))).reshape(ShapeTracker(self.st).permute(arg).shape)
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).permute(arg), MovementOps.PERMUTE, arg)
-
-  def shrink(self:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
-    if all(b - a == s for s, (a, b) in zip(self.shape, arg)): return self
-    if not self.realized and self.op.op == MovementOps.SHRINK: return self.op.src[0].shrink(tuple([(b1+b2, b1+e2) for (b1,_),(b2,e2) in zip(self.op.arg, arg)]))
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).shrink(arg), MovementOps.SHRINK, arg)
-
-  def stride(self:LazyBuffer, arg:Tuple[int, ...]) -> LazyBuffer:
-    local_st = ShapeTracker(self.shape).stride(arg)
-    if self.shape == local_st.shape and local_st.contiguous: return self
-    if not self.realized and self.op.op == MovementOps.STRIDE: return self.op.src[0].stride(tuple(map(operator.mul, arg, self.op.arg)))
-    return self.shuffle_and_prune_movement_ops(ShapeTracker(self.st).stride(arg), MovementOps.STRIDE, arg)
+  def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Tuple[LazyBuffer, ...]=(), enable_cache=False) -> LazyBuffer:
+    assert isinstance(src, tuple)
+    return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, src, enable_cache=enable_cache)
+
+  def const(self, val:ConstType, shape:Optional[Tuple[sint,...]]=None) -> LazyBuffer:
+    shape = self.shape if shape is None else shape
+    return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(shape)).expand(shape)
+
+  def is_realized(self) -> bool: return self.base.realized is not None
+
+  def assign(self, x:LazyBuffer) -> LazyBuffer:
+    assert x.size == self.size, f"assign target must have same size {self.size=} != {x.size=}"
+    return LazyBuffer.loadop(LoadOps.ASSIGN, self.shape, self.dtype, self.device, arg=() if self.st.contiguous else (self.st,), src=(x, self.base))
+
+  def contiguous(self):
+    if not self.st.contiguous or self.size != self.base.size or self.is_unrealized_const():
+      ret = self.e(LoadOps.CONTIGUOUS)
+      if (sti := self.st.invert(self.base.shape)) is not None: self.base.contiguous_child = ref(ret), sti
+      return ret
+    self.base.forced_realize = True
+    return self
 
+  def cast(self, dtype:DType, bitcast:bool=False):
+    if self.dtype == dtype: return self
+    if self.device.startswith("DISK") and not bitcast: raise RuntimeError("attempted to cast disk buffer (bitcast only)")
+    if self.is_unrealized_unmasked_const() and not bitcast:
+      return create_lazybuffer(self.device, self.st, dtype, LoadOps.CONST, dtypes.as_const(self.base.arg, dtype))
+    # TODO: applying this makes gpt2 slower
+    if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+      return self.base.cast(dtype, bitcast)._view(self.st)
+    new_shape = self.shape
+    if bitcast and self.dtype.itemsize != dtype.itemsize:
+      if not self.device.startswith("DISK"): raise RuntimeError("shape changing bitcast only supported on DISK right now")
+      if not all_int(new_shape): raise RuntimeError("shape changing bitcast with symbolic shape isn't supported yet")
+      # https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
+      if not (new_shape[-1]*self.dtype.itemsize) % dtype.itemsize == 0: raise RuntimeError("unsupported size in bitcast")
+      new_shape = new_shape[:-1] + ((new_shape[-1]*self.dtype.itemsize) // dtype.itemsize,)
+    cast_op = UnaryOps.BITCAST if bitcast else UnaryOps.CAST
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), dtype, cast_op, dtype, (self,))
+
+  def is_unrealized_const(self): return self.base.realized is None and self.base.op is LoadOps.CONST and not isinstance(self.base.arg, Variable)
+  def is_unrealized_unmasked_const(self): return self.is_unrealized_const() and all(v.mask is None for v in self.st.views)
+
+  def _copy(self, device:str) -> LazyBuffer:
+    return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, self.buffer.nbytes, (self,), enable_cache=False)
+
+  def copy_to_device(self, device:str, force: bool = False) -> LazyBuffer:
+    # no COPY
+    if self.device == device: return self
+
+    # double COPY = one COPY
+    if not force and self.st.contiguous and self.size == self.base.size and not self.base.realized and self.base.op is LoadOps.COPY:
+      return self.base.srcs[0].copy_to_device(device).reshape(self.st.shape)
+
+    # const doesn't have to be copied (issues with disk tensor)
+    if self.is_unrealized_const():
+      return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, device, arg=self.base.arg)._view(self.st)
+
+    # if it's a shrink, do the shrink before the copy with CONTIGUOUS
+    if prod(self.st.shape) < prod(self.base.st.shape): return self.contiguous()._copy(device)
+
+    # copy the base and apply the shapetracker on the new device
+    return self.base._copy(device)._view(self.st)
+
+  def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
+    srcs: List[LazyBuffer] = []
+    for s in (self,)+in_srcs:
+      if s == s.base and s.base.contiguous_child and (root:=s.base.contiguous_child[0]()) is not None:
+        srcs.append(root._view(s.base.contiguous_child[1]))
+      else:
+        srcs.append(s)
+    assert all_same(dts:=[x.dtype.scalar() for x in (srcs[1:] if op is TernaryOps.WHERE else srcs)]), f"all dtypes must match {dts} on {op}"
+    assert all_same([x.shape for x in srcs]), f"all shapes must be the same {[x.shape for x in srcs]}"
+    if op is TernaryOps.WHERE: assert srcs[0].dtype == dtypes.bool, "TernaryOps.WHERE must have the first arg be bool"
+    if op is UnaryOps.NEG: assert srcs[0].dtype != dtypes.bool, "UnaryOps.NEG does not accept dtype bool"
+
+    out_dtype = dtypes.bool if op in (BinaryOps.CMPLT, BinaryOps.CMPEQ) else srcs[-1].dtype
+
+    # const folding
+    if op in python_alu and all(s.is_unrealized_unmasked_const() for s in srcs):
+      return self.cast(out_dtype).const(exec_alu(op, out_dtype, [s.base.arg for s in srcs]))
+    if op is UnaryOps.NEG and self.base.op is UnaryOps.NEG: return self.base.srcs[0]
+    if op in BinaryOps: x, y = self, in_srcs[0]
+    if op is BinaryOps.ADD:
+      if y.is_unrealized_unmasked_const() and y.base.arg == 0: return x # pylint: disable=possibly-used-before-assignment
+      if x.is_unrealized_unmasked_const() and x.base.arg == 0: return y # pylint: disable=possibly-used-before-assignment
+    if op is BinaryOps.SUB and y.is_unrealized_unmasked_const() and y.base.arg == 0: return x
+    if op is BinaryOps.MUL:
+      if x.is_unrealized_unmasked_const() and (val := x.base.arg) in (1, 0, -1):
+        return y if val == 1 else y.const(0) if val == 0 else y.e(UnaryOps.NEG)
+      if y.is_unrealized_unmasked_const() and (val := float(y.base.arg)) in (1, 0, -1):
+        return x if val == 1 else x.const(0) if val == 0 else x.e(UnaryOps.NEG)
+    if op is BinaryOps.DIV and dtypes.is_float(x.dtype) and y.is_unrealized_unmasked_const() and y.base.arg != 0:
+      return x.e(BinaryOps.MUL, x.const(1 / y.base.arg))
+
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, arg, tuple(srcs))
+
+  # *** reduce ops ***
+
+  def _reduce_op(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+    assert all(0 <= x < len(self.shape) for x in axis), f"axis args {axis} out of range for shape {self.shape}"
+    axis = tuple(x for x in axis if self.shape[x] != 1)
+    if len(axis) == 0: return self
+    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, axis, (self,))
+
+  def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+    # TODO: this logic should move to the scheduler
+    if self.size == 0 and 0 not in new_shape: return self.const({ReduceOps.SUM: 0.0, ReduceOps.MAX: -math.inf}[op], new_shape)
+
+    # const folding
+    if self.is_unrealized_unmasked_const():
+      return self.const(self.base.arg * {ReduceOps.SUM: prod(self.shape[i] for i in axis), ReduceOps.MAX: 1}[op], new_shape)
+
+    # TODO: can we split symbolic shape if the reduce axis is not symbolic?
+    if not getenv("SPLIT_REDUCEOP", 1) or not all_int(self.shape) or (0 in self.shape) or \
+      prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768):
+      return self._reduce_op(op, axis)
+
+    # if there are few globals, make some reduces into globals by splitting into two kernels
+    # cap output buffer to 2**22: heuristic number of global outputs to achieve max occupancy with enough locals+upcasts for gemm
+    #   ~2**10 should be enough if GROUP is used
+    # 256 split maximum should be "negligible reduce" for low prod(new_shape), 8 split minimum.
+    # split is moved to the end to provide maximum locality for the second phase reduce.
+    self_real_strides = self.st.real_strides(ignore_valid=True)
+    split_candidates = [(i, x) for i in axis for x in range(min(256,2**getenv("REDUCEOP_SPLIT_SIZE",22)//prod(new_shape)),8-1,-1)
+                        if self.shape[i] % x == 0 and self_real_strides[i] != 0]
+    if not split_candidates: return self._reduce_op(op, axis)
+    dim_to_split, divisor = split_candidates[0]
+    splitted_shape = self.shape[:dim_to_split] + (divisor,) + (self.shape[dim_to_split]//divisor,) + self.shape[dim_to_split+1:]
+    splitted = self.reshape(splitted_shape).permute(tuple([x for x in range(len(splitted_shape)) if x != dim_to_split]+[dim_to_split]))
+    if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
+    return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split
+
+  # *** movement ops ***
+
+  def _view(self, new_st:ShapeTracker) -> LazyBuffer:
+    if self.st.size == 0 or (new_st.views[-1].mask is not None and any((x[1]-x[0]) == 0 for x in new_st.views[-1].mask)):
+      return self.const(0, new_st.shape)
+    if new_st.contiguous and self.base.shape == new_st.shape: return self.base
+    return create_lazybuffer(self.device, new_st, self.dtype, base=self.base)
+
+  def reshape(self, arg:Tuple[sint, ...]): return self._view(self.st.reshape(arg))
+  def pad(self, arg:Tuple[Tuple[sint, sint], ...]): return self._view(self.st.pad(arg))
+  def expand(self, arg:Tuple[sint, ...]): return self._view(self.st.expand(arg))
+  def permute(self, arg:Tuple[int, ...]): return self._view(self.st.permute(arg))
+  def shrink(self, arg:Tuple[Tuple[sint, sint], ...]): return self._view(self.st.shrink(arg))
+  def stride(self, arg:Tuple[int, ...]): return self._view(self.st.stride(arg))