tinygrad-0.8.0-py3-none-any.whl → tinygrad-0.9.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- tinygrad/__init__.py +6 -6
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +253 -225
- tinygrad/codegen/linearizer.py +398 -436
- tinygrad/codegen/uops.py +451 -0
- tinygrad/device.py +268 -274
- tinygrad/dtype.py +56 -40
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +198 -0
- tinygrad/engine/realize.py +192 -0
- tinygrad/engine/schedule.py +370 -0
- tinygrad/engine/search.py +199 -0
- tinygrad/{mlops.py → function.py} +40 -32
- tinygrad/helpers.py +144 -46
- tinygrad/lazy.py +143 -242
- tinygrad/multi.py +173 -0
- tinygrad/nn/__init__.py +180 -9
- tinygrad/nn/datasets.py +8 -0
- tinygrad/nn/optim.py +106 -28
- tinygrad/nn/state.py +87 -19
- tinygrad/ops.py +104 -45
- tinygrad/renderer/__init__.py +65 -0
- tinygrad/renderer/assembly.py +269 -0
- tinygrad/renderer/cstyle.py +308 -210
- tinygrad/renderer/llvmir.py +119 -124
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +13403 -0
- tinygrad/runtime/autogen/comgr.py +891 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5893 -0
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33597 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +56 -0
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +39 -0
- tinygrad/runtime/graph/cuda.py +59 -54
- tinygrad/runtime/graph/hcq.py +187 -0
- tinygrad/runtime/graph/metal.py +37 -41
- tinygrad/runtime/ops_amd.py +550 -0
- tinygrad/runtime/ops_clang.py +16 -14
- tinygrad/runtime/ops_cuda.py +129 -37
- tinygrad/runtime/ops_disk.py +111 -43
- tinygrad/runtime/ops_gpu.py +52 -50
- tinygrad/runtime/ops_llvm.py +36 -56
- tinygrad/runtime/ops_metal.py +41 -24
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +625 -0
- tinygrad/runtime/ops_python.py +208 -0
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +46 -107
- tinygrad/shape/symbolic.py +99 -98
- tinygrad/shape/view.py +162 -45
- tinygrad/tensor.py +2492 -483
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/features/image.py +0 -93
- tinygrad/features/multi.py +0 -103
- tinygrad/features/search.py +0 -160
- tinygrad/graph.py +0 -106
- tinygrad/jit.py +0 -152
- tinygrad/realize.py +0 -50
- tinygrad/runtime/graph/hip.py +0 -24
- tinygrad/runtime/ops_cpu.py +0 -45
- tinygrad/runtime/ops_hip.py +0 -97
- tinygrad/runtime/ops_torch.py +0 -49
- tinygrad-0.8.0.dist-info/RECORD +0 -41
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
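The listing above shows a broad reorganization: tinygrad/jit.py, tinygrad/realize.py, tinygrad/graph.py, and tinygrad/features/search.py move under tinygrad/engine/, tinygrad/features/multi.py becomes tinygrad/multi.py, and tinygrad/mlops.py is renamed to tinygrad/function.py. As a rough orientation (not part of the diff itself), here is a sketch of how 0.8.0-era imports map onto the 0.9.1 layout; it assumes the public symbols kept their names across the moves, which should be verified against the installed 0.9.1 package.

```python
# Hypothetical migration notes inferred from the file renames above; symbol names
# are assumed unchanged and should be verified against tinygrad 0.9.1.
#   from tinygrad.jit import TinyJit          ->  from tinygrad.engine.jit import TinyJit
#   from tinygrad.features.search import ...  ->  from tinygrad.engine.search import ...
#   from tinygrad.features.multi import ...   ->  from tinygrad.multi import ...
#   from tinygrad.mlops import ...            ->  from tinygrad.function import ...
# The top-level re-exports remain the most stable entry point:
from tinygrad import Tensor, TinyJit

@TinyJit
def step(x: Tensor) -> Tensor:
  # double the input and reduce it; realize() forces execution inside the jit
  return (x * 2).sum().realize()

print(step(Tensor.ones(4)).item())  # 8.0
```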
@@ -0,0 +1,208 @@
+# pylint: disable=cell-var-from-loop
+# a python uops emulator
+# works to test the tensor cores, and all the uops in general
+# this is the (living) definition of uops
+from typing import Tuple, List, Optional, Any, Dict
+import pickle, base64, itertools, time, struct
+from tinygrad.dtype import DType, dtypes, ImageDType
+from tinygrad.helpers import all_same, getenv, flatten
+from tinygrad.device import Compiled, Compiler, Allocator
+from tinygrad.codegen.uops import UOpGraph, UOps
+from tinygrad.ops import BinaryOps, TernaryOps, exec_alu, truncate
+from tinygrad.renderer import Renderer
+from tinygrad.renderer.cstyle import CUDARenderer, MetalRenderer, AMDRenderer
+
+def _load(m, i):
+  if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
+  return m[i]
+
+def load(inp, j=0):
+  if len(inp) == 4: return [_load(m, x+j) if gate else default for m,x,gate,default in zip(*inp)]
+  return [_load(m, x+j) for m,x in zip(inp[0], inp[1])]
+
+def _store(m, i, v):
+  if i < 0 or i >= len(m): raise IndexError(f"store out of bounds, size is {len(m)}, access is {i}, value is {v}")
+  m[i] = v
+
+class PythonProgram:
+  def __init__(self, name:str, lib:bytes):
+    self.uops: List[Tuple[UOps, Optional[DType], List[int], Any]] = pickle.loads(lib)
+  def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    st = time.perf_counter()
+    warp = list(itertools.product(*[range(x) for x in local_size[::-1]]))
+    warp_size = len(warp)
+    for idxs in itertools.product(*[range(x) for x in global_size[::-1]]):
+      ul: Dict[int, Any] = {}
+      dl: Dict[int, DType] = {}
+      pbufs: List[memoryview] = list(bufs)
+      pvals: List[int] = list(vals)
+      i = 0
+      loop_ends: Dict[int, int] = {}
+      while i < len(self.uops):
+        uop, dtype, idp, arg = self.uops[i]
+        void_ops = {UOps.STORE, UOps.ENDRANGE, UOps.BARRIER, UOps.IF, UOps.ENDIF}
+        if uop is UOps.DEFINE_ACC: idp = [idp[0]]
+        inp = [ul[v] for v in idp if self.uops[v][0] not in void_ops]
+        dtp = [dl[v] for v in idp if self.uops[v][0] not in void_ops]
+        if getenv("TRACE"): print(i, uop, dtype, arg, inp, dtp)
+        if uop is UOps.STORE:
+          if len(inp) == 3: inp.append([True] * len(inp[0])) # set the gate to True
+          if isinstance(dtp[0], ImageDType):
+            # image store
+            assert dtp[2].count == 4
+            for j,val in enumerate(inp[2]):
+              for m,ox,oy,v,g in zip(inp[0], inp[1][0], inp[1][1], val, inp[3]):
+                assert ox >= 0 and ox < dtp[0].shape[1] and oy >= 0 and oy < dtp[0].shape[0]
+                if g: _store(m, ox*4 + oy*dtp[0].shape[1]*4 + j, v)
+          elif dtp[2].count > 1:
+            for j,val in enumerate(inp[2]):
+              for m,o,v,g in zip(inp[0], inp[1], val, inp[3]):
+                if g: _store(m, o+j, v)
+          else:
+            for m,o,v,g in zip(*inp):
+              if g: _store(m, o, v)
+          i += 1
+          continue
+        if uop is UOps.ENDRANGE:
+          loop_ends[idp[0]] = i
+          i = idp[0]
+          continue
+        if uop in (UOps.BARRIER, UOps.IF, UOps.ENDIF):
+          # in the python emulator, the warp is always in sync
+          i += 1
+          continue
+        assert dtype is not None, f"{uop} is missing a dtype"
+        dl[i] = dtype
+        if uop is UOps.DEFINE_GLOBAL:
+          assert dtype.fmt is not None
+          ul[i] = [pbufs.pop(0).cast(dtype.fmt)] * warp_size
+        elif uop is UOps.DEFINE_LOCAL:
+          assert dtype.fmt is not None
+          lbuf = memoryview(bytearray(arg[1]*dtype.itemsize))
+          ul[i] = [lbuf.cast(dtype.fmt)] * warp_size
+        elif uop is UOps.DEFINE_VAR:
+          ul[i] = [pvals.pop(0)] * warp_size
+        elif uop is UOps.SPECIAL:
+          if arg[1][0] == 'g':
+            ul[i] = [idxs[2-arg[0]]] * warp_size
+          elif arg[1][0] == 'l':
+            ul[i] = [x[2-arg[0]] for x in warp]
+        elif uop is UOps.CONST:
+          ul[i] = [[arg] * warp_size for _ in range(dtype.count)] if dtype.count > 1 else [arg] * warp_size
+        elif uop is UOps.DEFINE_ACC:
+          ul[i] = [[inp[0][0]] * warp_size for _ in range(dtype.count)] if dtype.count > 1 else [inp[0][0]] * warp_size
+        elif uop is UOps.RANGE:
+          if i not in ul: ul[i] = [inp[0][0]] * warp_size
+          else:
+            for j in range(len(ul[i])):
+              ul[i][j] += 1
+            if ul[i][0] == inp[1][0]:
+              del ul[i]
+              i = loop_ends[i] + 1
+              continue
+        elif uop in (UOps.CAST, UOps.BITCAST):
+          if dtype.count > 1: ul[i] = inp
+          else:
+            assert dtp[0].fmt and dtype.fmt
+            pack_format, unpack_format = str(warp_size) + dtp[0].fmt, str(warp_size) + dtype.fmt
+            if uop is UOps.BITCAST: ul[i] = list(struct.unpack(unpack_format, struct.pack(pack_format, *inp[0])))
+            else:
+              casted = [dtypes.as_const(x, dtype) for x in inp[0]]
+              if dtypes.is_int(dtype):
+                overflow_adjust = 2**(dtype.itemsize*8 - 1) if not dtypes.is_unsigned(dtype) else 0
+                casted = [((x + overflow_adjust) % 2**(dtype.itemsize*8) - overflow_adjust) for x in casted]
+              elif dtypes.is_float(dtype):
+                casted = [truncate.get(dtype, lambda dt: dt)(x) for x in casted]
+              ul[i] = list(struct.unpack(unpack_format, struct.pack(unpack_format, *casted)))
+        elif uop is UOps.LOAD:
+          if isinstance(dtp[0], ImageDType):
+            assert dtype.count == 4
+            ul[i] = []
+            for j in range(dtype.count):
+              ret = []
+              for m,ox,oy in zip(inp[0], inp[1][0], inp[1][1]):
+                if ox < 0 or ox >= dtp[0].shape[1] or oy < 0 or oy >= dtp[0].shape[0]: ret.append(0)
+                else: ret.append(_load(m, ox*4 + oy*dtp[0].shape[1]*4 + j))
+              ul[i].append(ret)
+          elif dtype.count > 1:
+            ul[i] = [load([inp[i][j] if dtp[i].count > 1 else inp[i] for i in range(len(inp))], j) for j in range(dtype.count)]
+          else:
+            ul[i] = load(inp)
+        elif uop is UOps.PHI:
+          for j in range(len(inp[0])): inp[0][j] = inp[1][j]
+          ul[i] = inp[0]
+        elif uop is UOps.GEP:
+          ul[i] = inp[0][arg]
+        elif uop is UOps.WMMA:
+          # here are the models for the WMMA instruction on the different hardware
+          def wmma_helper(WARP_THREADS, K, NUM_A, NUM_B, NUM_C, a_elem, b_elem, c_map):
+            assert len(inp[0]) == NUM_A, f"A must have {NUM_A} elements per thread"
+            assert len(inp[1]) == NUM_B, f"B must have {NUM_B} elements per thread"
+            assert len(inp[2]) == NUM_C, f"C must have {NUM_C} elements per thread"
+            assert len(flatten(inp[0])) == NUM_A * warp_size, f"WMMA must have {NUM_A * warp_size} total elements for A in WMMA"
+            assert len(flatten(inp[1])) == NUM_B * warp_size, f"WMMA must have {NUM_B * warp_size} total elements for B in WMMA"
+            assert len(flatten(inp[2])) == NUM_C * warp_size, f"WMMA must have {NUM_C * warp_size} total elements for C in WMMA"
+            assert warp_size > 0 and warp_size % WARP_THREADS == 0, f"must have multiples of {WARP_THREADS} warp threads"
+            out = [inp[2][elem_idx][:] for elem_idx in range(NUM_C)]
+            for goff in range(0, warp_size, WARP_THREADS):
+              for lane_id in range(WARP_THREADS):
+                for elem_idx in range(NUM_C): # calculate new muls and add to acc
+                  (c_i, c_j) = c_map(lane_id, elem_idx)
+                  out[elem_idx][goff+lane_id] += sum(a_elem(inp[0], _k, c_j, goff) * b_elem(inp[1], c_i, _k, goff) for _k in range(K))
+            return out
+
+          # TODO: refactor these to a shared TensorCoreLayout in kernel.py
+          if arg[5] == "METAL":
+            # A (2 elements on 32 threads): row major
+            def a_b_elem(x, i, j, goff): return x[(i%2)][goff+(i//2)%2+(j%4)*2+(i//4)*8+(j//4)*16]
+            # (i, j), C, D (2 elements on 32 threads): row major same as A/B
+            def c_map(lane, elem): return (elem + ((lane%2)*2) + ((lane//8)%2)*4, ((lane//2)%4) + (lane//16)*4)
+            ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map)
+          elif arg[5] == "AMD":
+            # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15
+            def a_elem(x, i, j, goff):
+              assert x[i][goff+j] == x[i][goff+j+16], "warp elements not duplicated properly across lanes"
+              return x[i][goff+j]
+            # B (16 elements on 32 threads): row major, lane 16-32 == lane 0-15
+            def b_elem(x, i, j, goff): return a_elem(x, j, i, goff) # pylint: disable=arguments-out-of-order
+            def c_map(lane, elem): return (lane%16, lane//16+elem*2) # (i, j), C, D (8 elements on 32 threads): row major
+            ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
+          elif arg[5] == "CUDA":
+            # A (8 elements on 32 threads)
+            def a_elem(x, i, j, goff): return x[(i%2)+(j//8)*2+(i//8)*4][goff+((i//2)%4)+(j%8)*4]
+            # B (4 elements on 32 threads)
+            def b_elem(x, i, j, goff): return x[(j%2)+(j//8)*2][goff+(j//2)%4+(i)*4]
+            # (i, j), C, D (4 elements on 32 threads)
+            def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
+            ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+          else: raise NotImplementedError(f"unimplemented tensor core {arg}")
+        elif uop is UOps.ALU:
+          assert all_same([len(x) for x in inp]), f"{[len(x) for x in inp]} doesn't match on {arg}"
+          assert all_same([dtype] + dtp) or arg in {BinaryOps.CMPNE, BinaryOps.CMPLT, TernaryOps.WHERE}, f"dtype mismatch on {arg}"
+          ul[i] = [exec_alu(arg, dtype, p) for p in zip(*inp)]
+        assert i in ul, (uop, dtype, idp, arg)
+        i += 1
+    return time.perf_counter() - st
+
+class PythonRenderer(Renderer):
+  device = "PYTHON"
+  def __init__(self):
+    if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
+    if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
+    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tensor_cores
+
+  def render(self, name:str, uops:UOpGraph) -> str:
+    lops = [(u.op, u.dtype, [uops.uops.index(v) for v in u.src], u.arg) for u in uops]
+    return base64.b64encode(pickle.dumps(lops)).decode()
+
+class PythonCompiler(Compiler):
+  def compile(self, src:str) -> bytes: return base64.b64decode(src)
+
+class PythonAllocator(Allocator):
+  def _alloc(self, size, options): return memoryview(bytearray(size))
+  def copyin(self, dest, src:memoryview): dest[:] = src
+  def copyout(self, dest:memoryview, src): dest[:] = src
+
+class PythonDevice(Compiled):
+  def __init__(self, device:str):
+    super().__init__(device, PythonAllocator(), PythonRenderer(), PythonCompiler(), PythonProgram)
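The new tinygrad/runtime/ops_python.py above is a pure-Python interpreter for the uop IR: PythonRenderer pickles the uop list, PythonCompiler merely base64-decodes it, and PythonProgram walks the uops over plain memoryview buffers, including software models of the METAL/AMD/CUDA tensor-core (WMMA) layouts. A minimal sketch of driving it, assuming the device string "PYTHON" resolves through tinygrad's usual ops_*.py naming convention; this backend is an emulator for testing, not for performance:

```python
# Sketch: run a tiny computation on the Python uop emulator (slow, test-only).
# Assumes device="PYTHON" resolves to PythonDevice from ops_python.py above.
from tinygrad import Tensor

a = Tensor([[1.0, 2.0], [3.0, 4.0]], device="PYTHON")
b = Tensor([[5.0, 6.0], [7.0, 8.0]], device="PYTHON")
print((a @ b).numpy())  # [[19. 22.] [43. 50.]]

# Environment knobs visible in the source above:
#   EMULATE_METAL=1 / EMULATE_AMD=1 / EMULATE_CUDA=1 -> advertise that backend's tensor cores
#   TRACE=1                                          -> print each interpreted uop
```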
tinygrad/shape/__init__.py
File without changes
tinygrad/shape/shapetracker.py
CHANGED
@@ -1,70 +1,35 @@
 # ShapeTracker allows movement operations to a buffer that don't require a copy to be made.
 from __future__ import annotations
-import functools, itertools, operator
 from dataclasses import dataclass
-from typing import Tuple, List, Optional, Dict, Set,
-from tinygrad.
-from tinygrad.
-from tinygrad.shape.
-
-
-def expr_node_mask(view:View, idx:Node, valid:Optional[Node]=None) -> Node:
-  expr = [valid] if valid is not None else []
-  if view.mask is not None:
-    acc = 1
-    for d,(x,y) in zip(reversed(view.shape), reversed(view.mask)):
-      if (x,y) != (0,d):
-        base = ((idx//acc)%d)
-        expr += [base >= x, base < y]
-      acc *= d
-  return Node.ands(expr)
-
-# generate an expression if you have a single idx variable
-def expr_node(view:View, idx:Optional[Node]=None) -> Node:
-  if idx is None: idx = Variable('idx', 0, prod(view.shape)-1)
-  ret: List[Node] = [NumNode(view.offset) if isinstance(view.offset, int) else view.offset] if view.offset else []
-  acc = 1
-  for d,s,_ in reversed(_merge_dims(view.shape, view.strides)):
-    ret.append(((idx//acc)%d)*s)
-    acc *= d
-  return Node.sum(ret)
-
-# generate an expression if you have a variable or expression for each index
-def expr_idxs(view:View, idxs:Tuple[Node, ...]) -> Node:
+from typing import Tuple, List, Optional, Dict, Set, Iterable, cast
+from tinygrad.helpers import merge_dicts, getenv
+from tinygrad.shape.symbolic import Variable, MulNode, Node, SumNode, NumNode, create_lt_node, create_ge_node, sint
+from tinygrad.shape.view import View, strides_for_shape
+
+def _expr_view(view:View, idxs:List[Node], valid:Optional[Node]=None) -> Tuple[Node, Node]:
   assert len(idxs) == len(view.shape), f"need an idx for all dimensions {idxs} vs {view.shape}"
-
-
-
-
-
-
-  if vm2.mask or vm1.offset != 0: return None # this isn't supported yet
-  if None in (strides := ShapeTracker((vm2, vm1)).real_strides()): return None
-  return View.create(vm1.shape, cast(Tuple[sint, ...], strides), vm2.offset, vm1.mask)
-
-@functools.lru_cache(maxsize=None)
-def idxs_to_idx(shape:Tuple[int, ...], idxs:Tuple[Node, ...]) -> Node:
-  assert len(idxs) == len(shape), "need an idx for all dimensions"
-  acc, ret = 1, []
-  for tidx,d in zip(reversed(idxs), reversed(shape)):
-    ret.append(tidx * acc)
-    acc *= d
-  return Node.sum(ret)
+  iexpr: List[Node] = [NumNode(view.offset) if isinstance(view.offset, int) else view.offset]
+  vexpr: List[Node] = [valid] if valid is not None else []
+  for idx,sh,st,m in zip(idxs, view.shape, view.strides, view.mask if view.mask is not None else [None]*len(view.shape)):
+    if sh != 1 and st != 0: iexpr.append(idx*st)
+    if m is not None: vexpr += [create_ge_node(idx, m[0]), create_lt_node(idx, m[1])] # idx >= m[0], idx < m[1]
+  return Node.sum(iexpr), Node.ands(vexpr)
 
 @dataclass(frozen=True)
 class ShapeTracker:
   views: Tuple[View, ...]
-  def __post_init__(self):
-    assert isinstance(self.views, tuple) and all(isinstance(v, View) for v in self.views), "ShapeTracker must be created with a tuple of Views"
 
   def __add__(self, st:ShapeTracker) -> ShapeTracker:
-
-    for v in st.views:
-    return
+    ret = self
+    for v in st.views: ret = ShapeTracker(ret.views + (v,)).simplify() # one view at a time = better simplification
+    return ret
 
   def invert(self, out_shape:Tuple[sint, ...]) -> Optional[ShapeTracker]:
-
-
+    inverted_views:List[View] = []
+    for v,s in zip(self.views[::-1], [x.shape for x in self.views[::-1][1:]]+[out_shape]):
+      if (inverted:= v.invert(s)) is None: return None
+      inverted_views.append(inverted)
+    return ShapeTracker(tuple(inverted_views)).reshape(out_shape)
 
   @staticmethod
   def from_shape(shape:Tuple[sint, ...]): return ShapeTracker((View.create(shape),))
@@ -72,16 +37,22 @@ class ShapeTracker:
   @property
   def contiguous(self) -> bool: return len(self.views) == 1 and self.views[0].contiguous
 
+  @property
+  def consecutive(self) -> bool: return len(self.views) == 1 and (v:=self.views[0]).mask is None and v.strides == strides_for_shape(v.shape)
+
   @property
   def shape(self) -> Tuple[sint, ...]: return self.views[-1].shape
 
   @property
-  def size(self) -> int: return
+  def size(self) -> int: return self.views[-1].size()
 
   def real_size(self) -> int:
     if 0 in self.shape: return 0
-
-
+    idx, valid = self.expr_idxs()
+    if not valid: return 0
+    # TODO: it's possible that the real_size is smaller condition on valid being true
+    ret = idx.max
+    if not isinstance(ret, int): ret = ret.max # might be represent by symbolic shape, take one more max for int max
     assert isinstance(ret, int), f"ret must be integer, {ret=} isn't"
     return ret+1
 
@@ -90,30 +61,9 @@ class ShapeTracker:
   @property
   def var_vals(self) -> Dict[Variable, int]: return merge_dicts([dict([v.unbind()]) for v in self.vars()])
 
-  def unbind(self) -> ShapeTracker
-
-
-    to_apply:List[Tuple[MovementOps, Tuple]] = []
-    for v in self.views:
-      real_shape = tuple(y-x for x,y in v.mask) if v.mask else v.shape
-      real_offset = 0 if 0 in real_shape else (v.offset + (sum(x*st for (x,_),st in zip(v.mask, v.strides)) if v.mask else 0))
-      # first, we apply the offset
-      # then, we make it the correct shape
-      # then, we apply permutations
-      to_apply.append((MovementOps.AS_STRIDED, (tuple([s if st != 0 else 1 for s,st in zip(real_shape, v.strides)]), v.strides, real_offset)))
-      # then, we apply pre expand pads
-      if v.mask is not None:
-        pre_expand_pads = tuple((x,s-y) if st != 0 else (0,0) for (x,y),s,st in zip(v.mask, v.shape, v.strides))
-        post_expand_pads = tuple((x,s-y) if st == 0 else (0,0) for (x,y),s,st in zip(v.mask, v.shape, v.strides))
-        if any(x != (0,0) for x in pre_expand_pads):
-          to_apply.append((MovementOps.PAD, pre_expand_pads))
-          real_shape = tuple(x+s[0]+s[1] for x,s in zip(real_shape, pre_expand_pads))
-      # then, we do any expands
-      # NOTE: this is a good idea even without masks, since torch doesn't support negative strides and has to make a copy
-      if any(s != 1 and st == 0 for s,st in zip(real_shape, v.strides)): to_apply.append((MovementOps.EXPAND, real_shape))
-      # lastly, we apply post expand pads
-      if v.mask is not None and any(x != (0,0) for x in post_expand_pads): to_apply.append((MovementOps.PAD, post_expand_pads))
-    return to_apply
+  def unbind(self) -> Tuple[ShapeTracker, Dict[Variable, int]]:
+    unbound_views, var_vals = zip(*[v.unbind() for v in self.views])
+    return ShapeTracker(tuple(unbound_views)), merge_dicts(var_vals)
 
   # NOTE: if a stride is not always valid, it will be None
   def real_strides(self, ignore_valid=False) -> Tuple[Optional[sint], ...]:
@@ -124,7 +74,7 @@ class ShapeTracker:
     bad_idx_vars: Set[Variable] = set()
     for this_dim in (idx.nodes if isinstance(idx, SumNode) else [idx]):
       idx_maybe, stride_maybe = (this_dim.a, this_dim.b) if isinstance(this_dim, MulNode) else (this_dim, 1)
-      try: ret[idxs.index(idx_maybe)] = stride_maybe
+      try: ret[idxs.index(idx_maybe)] = cast(sint, stride_maybe)
       except ValueError: bad_idx_vars = bad_idx_vars.union(idx_maybe.vars())
     idx_vars, valid_vars = idx.vars(), valid.vars()
     for i,tidx in enumerate(idxs):
@@ -134,30 +84,27 @@ class ShapeTracker:
 
   def unit_stride_axes(self, ignore_valid=False) -> List[int]: return [i for i,st in enumerate(self.real_strides(ignore_valid)) if st == 1]
 
-  def
-    for
+  def expr_idxs(self, idxs:Optional[Iterable[Node]]=None) -> Tuple[Node, Node]:
+    idxs = [Variable(f"idx{i}", 0, s-1) for i,s in enumerate(self.shape)] if idxs is None else list(idxs)
+    idx, valid = _expr_view(self.views[-1], idxs)
+    for view in reversed(self.views[0:-1]):
       if valid.max == 0: return NumNode(-1), valid
-
-
+      view = view.minify()
+      acc, idxs = 1, []
+      for d in reversed(view.shape):
+        idxs.append((idx//acc)%d)
+        acc *= d
+      idx, valid = _expr_view(view, idxs[::-1], valid)
+    assert not isinstance(idx.min, int) or idx.min >= -2**31, f"idx.min too small. {idx=}, {idx.min=}"
+    assert not isinstance(idx.max, int) or idx.max < 2**31, f"idx.max too big. {idx=}, {idx.max=}"
     return idx, valid
 
-  def expr_idxs(self, idxs:Optional[Iterable[Node]]=None):
-    if idxs is None: idxs = [Variable(f"idx{i}", 0, s-1) for i,s in enumerate(self.shape)]
-    idx = expr_idxs(self.views[-1], tuple(idxs))
-    valid = expr_node_mask(self.views[-1], idxs_to_idx(self.views[-1].shape, tuple(idxs)))
-    return self._expr_idx(idx, valid)
-
-  def expr_node(self, idx:Union[Node,str]='idx'):
-    if isinstance(idx, str): idx = Variable(idx, 0, prod(self.shape)-1)
-    return self._expr_idx(expr_node(self.views[-1], idx), expr_node_mask(self.views[-1], idx))
-
   def axis_is_masked(self, axis:int) -> bool:
     _, valid = self.expr_idxs()
     return f'idx{axis}' in [v.expr for v in valid.vars()]
 
   def simplify(self) -> ShapeTracker:
-    if len(self.views) >= 2 and (new_view :=
-      if DEBUG >= 5: print(f"st simplify : {self.views[-2]} + {self.views[-1]} = {new_view}")
+    if len(self.views) >= 2 and (new_view := self.views[-2] + self.views[-1]) is not None:
       return ShapeTracker(self.views[:-2] + (new_view,)).simplify()
     return self
 
@@ -172,11 +119,3 @@
   def reshape(self, new_shape: Tuple[sint, ...]) -> ShapeTracker:
     if getenv("MERGE_VIEW", 1) and (new_view := self.views[-1].reshape(new_shape)) is not None: return ShapeTracker(self.views[0:-1] + (new_view,))
     return ShapeTracker(self.views + (View.create(new_shape), ))
-
-# returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
-# TODO: if we remove movementops from lazy.py we can delete this
-def get_contraction(old_shape:Tuple[sint, ...], new_shape:Tuple[sint, ...]) -> Optional[List[List[int]]]:
-  acc_old, acc_new = list(itertools.accumulate(old_shape, operator.mul)), list(itertools.accumulate(new_shape, operator.mul))
-  try: split = [acc_old.index(acc)+1 if acc != 1 else 0 for acc in acc_new]
-  except ValueError: return None
-  return [list(range(st,ed)) for st,ed in zip([0]+split[:-1], split[:-1]+[len(old_shape)])]