tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/kernel.py +114 -172
- tinygrad/codegen/linearize.py +211 -81
- tinygrad/codegen/lowerer.py +30 -35
- tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
- tinygrad/codegen/transcendental.py +12 -13
- tinygrad/device.py +170 -47
- tinygrad/dtype.py +28 -26
- tinygrad/engine/jit.py +80 -63
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +162 -0
- tinygrad/engine/realize.py +58 -107
- tinygrad/engine/schedule.py +381 -314
- tinygrad/engine/search.py +40 -44
- tinygrad/gradient.py +70 -0
- tinygrad/helpers.py +77 -58
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +89 -64
- tinygrad/ops.py +562 -446
- tinygrad/renderer/__init__.py +79 -36
- tinygrad/renderer/cstyle.py +70 -84
- tinygrad/renderer/llvmir.py +32 -20
- tinygrad/renderer/ptx.py +79 -99
- tinygrad/renderer/wgsl.py +87 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libpciaccess.py +2023 -0
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +19 -21
- tinygrad/runtime/ops_amd.py +488 -327
- tinygrad/runtime/ops_clang.py +15 -28
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +129 -38
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +45 -40
- tinygrad/runtime/ops_metal.py +93 -73
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +232 -270
- tinygrad/runtime/ops_python.py +51 -46
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +63 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +384 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +26 -4
- tinygrad/runtime/support/hcq.py +254 -324
- tinygrad/runtime/support/llvm.py +32 -0
- tinygrad/shape/shapetracker.py +84 -53
- tinygrad/shape/view.py +103 -138
- tinygrad/spec.py +154 -0
- tinygrad/tensor.py +744 -496
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
- tinygrad-0.10.1.dist-info/RECORD +86 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
tinygrad/shape/view.py
CHANGED
@@ -1,111 +1,101 @@
 from __future__ import annotations
-import functools, operator, itertools
+import functools, operator, itertools
 from dataclasses import dataclass
-from typing import
+from typing import Optional, cast, Sequence
 from tinygrad.dtype import dtypes
-from tinygrad.ops import resolve, UOp, Variable, sint, sym_infer, smax, smin
-from tinygrad.helpers import prod, all_int, argsort, flatten
+from tinygrad.ops import resolve, UOp, Variable, sint, sym_infer, smax, smin, sint_to_uop
+from tinygrad.helpers import prod, all_int, argsort, flatten, ceildiv

 @functools.lru_cache(maxsize=None)
-def canonicalize_strides(shape:
+def canonicalize_strides(shape:tuple[sint, ...], strides:tuple[sint, ...]) -> tuple[sint, ...]:
   return tuple(0 if s == 1 else st for s, st in zip(shape, strides))

 @functools.lru_cache(maxsize=None)
-def strides_for_shape(shape:
+def strides_for_shape(shape:tuple[sint, ...]) -> tuple[sint, ...]:
   if not shape: return ()
   strides = tuple(itertools.accumulate(reversed(shape[1:]), operator.mul, initial=1))[::-1]
   return canonicalize_strides(shape, strides)

 @functools.lru_cache(maxsize=None)
-def
-  # merge contiguous sub-parts or zero strided dims
+def merge_dims(shape:tuple[int, ...], strides:tuple[int, ...], mask:Optional[tuple[tuple[int, int], ...]]=None) -> tuple[tuple[int, int, int], ...]:
+  # merge contiguous sub-parts or zero strided dims
+  # any stride 0, masked from dim=1, or contiguous part is merged into next dim.
+  # stride != 0 to stride == 0 starts a new merging block
+  # ret = tuple[(merged_size, stride, merged size w/o zero stride), ...]
   if not shape: return ()
   assert len(shape) == len(strides) and (mask is None or len(shape) == len(mask))
   ret = [(shape[0], strides[0], shape[0] if strides[0] != 0 else 0)]
   # merge this dim to next dim if size is 1
   merging = (mask[0][1] - mask[0][0] == 1) if mask is not None else shape[0] == 1
   for i, (s, st) in enumerate(zip(shape[1:], strides[1:]), start=1):
-    last_s, last_st, last_pre_expand_s = ret[-1]
     # always merge 1
     if s == 1: continue
+    last_s, last_st, last_pre_expand_s = ret[-1]
     # merge last dim with this dim if merging or strides matched
-    if merging or last_st == s * st: ret[-1] = (last_s * s, st, (s if merging else last_pre_expand_s * s)
-    else: ret.append((s, st, s
+    if merging or last_st == s * st: ret[-1] = (last_s * s, st, (s if merging else last_pre_expand_s * s))
+    else: ret.append((s, st, s))
     # merge this dim to next dim if size is 1
     merging = (mask[i][1] - mask[i][0] == 1) if mask is not None else s == 1
   return tuple(ret)

 @functools.lru_cache(maxsize=None)
-def _reshape_mask(_mask:Optional[
-    -> Optional[
+def _reshape_mask(_mask:Optional[tuple[tuple[sint, sint], ...]], old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) \
+    -> Optional[tuple[tuple[sint, sint], ...]]:
   """Returns the new mask if reshape is possible, and None if not possible."""
   if _mask is None: return tuple((0, s) for s in new_shape)
-  if
-  if any(m[1] - m[0] < 1 for m in _mask): return ((0, 0),) * len(new_shape) # zero mask
+  if not all_int(flatten(_mask)): return None

-  new_mask:
+  new_mask: list[tuple[int, int]] = []
   # _mask is all int here
-  r_masks, r_shape, r_new_shape = reversed(cast(
+  r_masks, r_shape, r_new_shape = reversed(cast(tuple[tuple[int, int], ...], _mask)), reversed(old_shape), reversed(new_shape)
   curr_stride, old_dim, new_dim, mask = 1, next(r_shape, 1), next(r_new_shape, 1), next(r_masks, (0,1))

   while len(new_mask) < len(new_shape):
     (l, r), next_stride = mask, new_dim * curr_stride

-
-
-
-
-
-
-
-
-
-      curr_stride, new_dim = next_stride, next(r_new_shape, 1) # need to get mask for next dimension
-
+    # need to split mask
+    if old_dim == next_stride: # simply copy the mask and get next batch for merging
+      new_mask.append((l // curr_stride, (r - 1) // curr_stride + 1))
+      curr_stride, old_dim, new_dim, mask = 1, next(r_shape, 1), next(r_new_shape, 1), next(r_masks, (0,1))
+    elif old_dim > next_stride: # mask can only be splitted if reshape doesn't cut across the mask.
+      if old_dim % next_stride != 0: return None
+      if (l % next_stride != 0 or r % next_stride != 0) and l // next_stride != (r - 1) // next_stride: return None
+      new_mask.append((l % next_stride // curr_stride, (r - 1) % next_stride // curr_stride + 1))
+      curr_stride, new_dim = next_stride, next(r_new_shape, 1) # need to get mask for next dimension
     else:
       next_mask = next(r_masks, (0, 1))
       # combine if the mask can unfold continuously
-      if mask != (0, old_dim) and next_mask[1] - next_mask[0] != 1: return None
+      if mask != (0, old_dim) and l != r and next_mask[1] - next_mask[0] != 1: return None
       mask, old_dim = (next_mask[0] * old_dim + l, (next_mask[1] - 1) * old_dim + r), old_dim * next(r_shape, 1)

-  for mask in r_masks: # if the old shape has leading 1s, need to make sure their mask is (0,1)
-    if mask != (0, 1): return ((0, 0),) * len(new_shape) # invalid mask
-
   return tuple(reversed(new_mask))

-def
-
-
-
-
-
-
-  return
-
-def variable_to_uop(x, ctx=None) -> UOp: return UOp.const(dtypes.int, x) if isinstance(x, int) else x
+def unravel(shape:tuple[sint, ...], offset:sint) -> list[sint]:
+  # find the position of offset on each dimension based on shape
+  # similar to unravel_index in numpy/torch
+  acc, idxs = 1, []
+  for d in reversed(shape):
+    idxs.append((offset//acc)%d)
+    acc *= d
+  return idxs[::-1]

 @dataclass(frozen=True)
 class View:
-  shape:
-  strides:
+  shape:tuple[sint, ...]
+  strides:tuple[sint, ...]
   offset:sint
-  mask:Optional[
+  mask:Optional[tuple[tuple[sint, sint], ...]]
   contiguous:bool

-
-
-
-
-
-
-  def to_indexed_uops(self:View, _idxs:Optional[List[UOp]]=None, vexpr:UOp=UOp.const(dtypes.bool, True)) -> Tuple[UOp, UOp]:
-    idxs = [UOp.range(dtypes.int, 0, s, i) for i,s in enumerate(self.shape)] if _idxs is None else _idxs
-    iexpr = variable_to_uop(self.offset)
-    for idx,sh,st,m in zip(idxs, self.shape, self.strides, self.mask if self.mask is not None else [None]*len(self.shape)):
+  def to_indexed_uops(self:View, idxs:Optional[Sequence[UOp]]=None, vexpr:UOp=UOp.const(dtypes.bool, True)) -> tuple[UOp, UOp]:
+    """(idx, valid)"""
+    if idxs is None: idxs = [UOp.range(dtypes.int, 0, s, i) for i,s in enumerate(self.shape)]
+    iexpr = sint_to_uop(self.offset)
+    for idx,sh,st,m in zip(idxs, self.shape, self.strides, self.mask if self.mask is not None else itertools.repeat(None)):
       if resolve(sh != 1) and resolve(st != 0): iexpr = iexpr + idx*st
       if m is not None:
-        if resolve(m[0] != 0): vexpr = vexpr * idx
-        if resolve(m[1] != sh): vexpr = vexpr * idx
+        if resolve(m[0] != 0): vexpr = vexpr * (idx >= m[0])
+        if resolve(m[1] != sh): vexpr = vexpr * (idx < m[1])
     return iexpr, vexpr

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
@@ -116,13 +106,13 @@ class View:

   @staticmethod
   @functools.lru_cache(maxsize=None)
-  def create(shape:
+  def create(shape:tuple[sint, ...], strides:Optional[tuple[sint, ...]]=None, offset:sint=0, mask:Optional[tuple[tuple[sint, sint], ...]]=None):
     # TODO: this resolve shouldn't be needed
     if not all(resolve(s >= 0) for s in shape): raise ValueError(f"Trying to create View with negative dimension: {shape=}")
     strides = canonicalize_strides(shape, strides) if strides else strides_for_shape(shape)
     # canonicalize 0 in shape
     if 0 in shape: return View(shape, (0,) * len(shape), offset=0, mask=None, contiguous=True)
-    # canonicalize
+    # canonicalize no-op mask
     if mask is not None and all(m == (0,s) for m,s in zip(mask, shape)): mask = None
     # if any dimension has size >1, but is masked such that only one index in the dimension is unmasked
     # then its stride can also be set to 0, albeit with a corresponding adjustment required to the offset
@@ -134,7 +124,7 @@ class View:
     # simplify as we go
     if isinstance(offset, UOp): offset = cast(sint, offset.ssimplify())
     shape = tuple(cast(sint, x.ssimplify()) if isinstance(x, UOp) else x for x in shape)
-    # TODO: enabling stride simplification breaks
+    # TODO: enabling stride simplification breaks symbolic jit
     """
     strides = tuple(x.ssimplify() if isinstance(x, UOp) else x for x in strides)
     if mask: mask = tuple((s.ssimplify() if isinstance(s, UOp) else s, e.ssimplify() if isinstance(e, UOp) else e) for s,e in mask)
@@ -143,15 +133,15 @@ class View:
     return View(shape, strides, offset, mask, contiguous)

   @functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
-  def vars(self) ->
+  def vars(self) -> set[Variable]:
     flatten_mask = tuple(x for m in self.mask for x in m) if self.mask is not None else tuple()
     return functools.reduce(operator.or_, [x.vars() for x in self.shape+self.strides+(self.offset,)+flatten_mask if isinstance(x, UOp)], set())

   @functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
-  def unbind(self) ->
+  def unbind(self) -> tuple[View, dict[Variable, int]]:
     var_unboundvar_val = [(v, v.unbind()) for v in self.vars()]
     unbound_vars = {v:uv for v,(uv,_) in var_unboundvar_val}
-    def substitute(x): return x if isinstance(x, int) else x.substitute(unbound_vars)
+    def substitute(x:sint): return x if isinstance(x, int) else x.substitute(unbound_vars)
     new_shape = tuple(map(substitute, self.shape))
     new_strides = tuple(map(substitute, self.strides))
     new_offset = substitute(self.offset)
@@ -165,27 +155,26 @@ class View:
     if vm1.contiguous and vm1.shape == vm2.shape: return vm2
     if vm1.contiguous and vm1.size() == vm2.size() and (ret := vm2.reshape(vm1.shape)) is not None: return ret
     if vm1.mask:
-
-
-
+      if (new_vm1 := vm1.shrink(vm1.mask)) == vm1 or (merged := vm2 + new_vm1) is None: return None
+      return merged.pad(tuple((b,s-e) for (b,e),s in zip(vm1.mask, vm1.shape)))
+    if not all_int(vm1.shape): return None

     # Project vm1's offset and strides on to vm2.
-    origin =
-    terms:
-    strides:
+    origin = unravel(vm2.shape, vm1.offset)
+    terms: list[list[tuple[int, sint]]] = [[] for _ in vm2.shape]
+    strides: list[sint] = [0] * len(vm1.shape)
     for d1, st in enumerate(vm1.strides):
       if st == 0: continue
-      for d2, (o, s1) in enumerate(zip(origin,
+      for d2, (o, s1) in enumerate(zip(origin, unravel(vm2.shape, vm1.offset + st))):
         if (s1 := s1 - o) == 0: continue
         terms[d2].append((d1, s1))
         strides[d1] += s1 * vm2.strides[d2]

     # Merge dimensions in vm2 if required.
     # NB: Merging too many dimensions can make it difficult to project vm2's mask, hence only combining when required.
-
-    idxs: List[UOp] = [UOp.variable(f"idx{i}", 0, s-1) for i,s in enumerate(vm1.shape)]
+    idxs: list[UOp] = [UOp.variable(f"idx{i}", 0, s-1) for i,s in enumerate(vm1.shape)]
     merged_size, merged_term = 1, UOp.const(dtypes.int, 0)
-    extents:
+    extents: list[tuple[sint, UOp]] = []
     for term, s, o in zip(reversed(terms), reversed(vm2.shape), reversed(origin)):
       merged_term += (sum([idxs[d1] * s1 for d1, s1 in term]) + o) * merged_size
       merged_size *= s
@@ -194,8 +183,8 @@ class View:
         merged_size, merged_term = 1, UOp.const(dtypes.int, 0)
     if resolve(merged_term != 0): return None
     if (vm2_shape := tuple(s for s,_ in reversed(extents))) != vm2.shape:
-      reshaped_vm2
-
+      if (reshaped_vm2 := vm2.reshape(vm2_shape)) is None: return None
+      # NOTE: this != to prevent infinite loop
       if reshaped_vm2.shape != vm2.shape: return reshaped_vm2 + vm1

     if vm2.mask:
@@ -203,54 +192,45 @@ class View:
       newb, newe, bad = [0] * len(vm1.shape), list(vm1.shape), False
       for (b, e), o, term, (_, t) in zip(vm2.mask, origin, terms, reversed(extents)):
         if resolve(b <= t.vmin and t.vmax < e, False): continue
-        if not all_int([o, b, e]):
-          bad = True
-          continue
         if len(term) != 1:
           if not term and newe: newe[0] = 0
           else: bad = True
           continue
         d1, s1 = term[0]
-
-          bad = True
-          continue
-        newb[d1] = max(newb[d1], math.ceil((b - o if s1 > 0 else e - o - 1) / s1))
+        newb[d1] = max(newb[d1], ceildiv(b - o if s1 > 0 else e - o - 1, s1))
         newe[d1] = min(newe[d1], (b - o if s1 < 0 else e - o - 1) // s1 + 1)

       # If any of vm1 was masked off, try again with that mask in place.
-      for b, e, s in zip(newb, newe, vm1.shape):
-
-          return vm2 + View.create(vm1.shape, vm1.strides, vm1.offset, tuple(zip(newb, newe)))
+      if any((b, e) != (0, s) for b, e, s in zip(newb, newe, vm1.shape)):
+        return vm2 + View.create(vm1.shape, vm1.strides, vm1.offset, tuple(zip(newb, newe)))
       # Otherwise if vm2's mask was violated, then cannot merge.
       if bad: return None

     return View.create(vm1.shape, tuple(strides), sum(o * s for o, s in zip(origin, vm2.strides)) + vm2.offset)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def invert(self, out_shape:
+  def invert(self, out_shape:tuple[sint, ...]) -> Optional[View]:
     ret = View.create(self.shape)
     if self.mask: ret = ret.shrink(self.mask)
-    ret = ret.
+    ret = ret.flip(tuple(x < 0 for x in self.strides)).permute(argsort(tuple(-x if x > 0 else x for x in self.strides)))
     return ret if prod(ret.shape) == prod(out_shape) else None # don't support shrink, expand, or stride != (-1, 1)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
   def minify(self):
-    min_shape = tuple(x[0] for x in
+    min_shape = tuple(x[0] for x in merge_dims(self.shape, self.strides, self.mask))
     return nv if (nv := self.reshape(min_shape)) else self

-  def __unsafe_resize(self, arg:
+  def __unsafe_resize(self, arg: tuple[tuple[sint, sint], ...], mask=None) -> View:
     offset = sum([s * x[0] for s, x in zip(self.strides,arg)])
     if self.mask:
       # move the old mask
       nmask = tuple([(smax(0, smin(mx-ax,ay-ax)), smax(0, smin(my-ax,ay-ax))) for (mx,my),(ax,ay) in zip(self.mask, arg)])
       # merge the masks if we have two
       mask = tuple([(smax(mx1, mx2), smin(my1, my2)) for (mx1, my1), (mx2, my2) in zip(nmask, mask)]) if mask is not None else nmask
-
-    if mask is not None and all(m[0] == 0 and m[1] == s for m,s in zip(mask, shape)): mask = None
-    return View.create(tuple(s.ssimplify() if isinstance(s, UOp) else s for s in shape), self.strides, self.offset+offset, mask)
+    return View.create(tuple([y-x for x,y in arg]), self.strides, self.offset+offset, mask)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def pad(self, arg:
+  def pad(self, arg: tuple[tuple[sint, sint], ...]) -> View:
     assert len(arg) == len(self.shape), f"invalid pad {arg} for {self.shape}"
     # NOTE: not checking for symbolic arg
     for b,e in arg: assert not all_int([b,e]) or b>=0 and e>=0, f"invalid pad {arg} for {self.shape}"
@@ -261,58 +241,46 @@ class View:
     return self

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def shrink(self, arg:
+  def shrink(self, arg: tuple[tuple[sint, sint], ...]) -> View:
     assert len(arg) == len(self.shape), f"invalid shrink {arg} for {self.shape}"
     # NOTE: not checking for symbolic arg
     for s,(b,e) in zip(self.shape,arg): assert not all_int([s,b,e]) or (0<=b<=e<=s), f"invalid shrink {arg} for {self.shape}"
     return self.__unsafe_resize(arg)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def expand(self, new_shape:
+  def expand(self, new_shape: tuple[sint, ...]) -> View:
     if len(new_shape) != len(self.shape): raise ValueError(f"expand arg {new_shape=} must have same number of dimensions as shape {self.shape=}")
-
-
-
-    # TODO: this resolve might be wrong
-    assert all((not resolve(s != x, False) or s == 1) for s,x in zip(self.shape, new_shape)), f"can't expand {self.shape} into {new_shape}"
-    # NOTE: can the mask ever be (0,0)?
+    # NOTE: does not check multiple of symbolic shape
+    assert all(resolve(s == ns) or s == 1 for s,ns in zip(self.shape, new_shape)), f"can't expand {self.shape} into {new_shape}"
+    if 0 in self.shape: return View.create(new_shape)
     # TODO: this resolve may not be needed, but it's hard because vars need to be sorted
     mask = tuple([(((0,0) if m != (0,1) else (0,ns)) if resolve(s != ns, False) else m) \
                   for m,s,ns in zip(self.mask, self.shape, new_shape)]) if self.mask else None
     return View.create(new_shape, self.strides, self.offset, mask)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def permute(self, axis:
+  def permute(self, axis: tuple[int, ...]) -> View:
     assert sorted(axis) == list(range(len(self.shape))), f"invalid permutation {axis} of len {len(self.shape)}"
     return View.create(tuple(self.shape[a] for a in axis), tuple(self.strides[a] for a in axis), self.offset,
                        tuple(self.mask[a] for a in axis) if self.mask is not None else None)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def
-
-
-
-    new_shape = tuple([(s+(abs(m)-1))//abs(m) for s,m in zip(self.shape, mul)])
-    offset = sum([(s-1)*z for s,z,m in zip(self.shape, self.strides, mul) if m < 0])
-    mask = tuple([(((mx if m > 0 else s-my)+(abs(m)-1))//abs(m), ((my if m > 0 else s-mx)+(abs(m)-1))//abs(m)) \
-      for (mx,my),s,m in zip(self.mask, self.shape, mul)]) if self.mask is not None else None
-    return View.create(new_shape, strides, self.offset + offset, mask)
+  def flip(self, arg: tuple[bool, ...]) -> View:
+    offset = sum((s-1)*z for s,z,f in zip(self.shape, self.strides, arg) if f)
+    mask = tuple((s-my,s-mx) if f else (mx,my) for (mx,my),s,f in zip(self.mask, self.shape, arg)) if self.mask is not None else None
+    return View.create(self.shape, tuple(-z if f else z for z,f in zip(self.strides, arg)), self.offset+offset, mask)

   @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
-  def reshape(self, new_shape:
+  def reshape(self, new_shape: tuple[sint, ...]) -> Optional[View]:
     if self.shape == new_shape: return self

-
-    assert all(resolve(x >= 0) for x in new_shape), f"shape can't contain negative numbers {new_shape}"
-    if 0 in self.shape:
-      assert 0 in new_shape, f"cannot reshape 0 size to {new_shape}"
-      return View.create(new_shape)
+    assert all(x >= 0 for x in new_shape), f"shape can't contain negative numbers {new_shape}"
     # check for the same size
     if (self_all_int := all_int(self.shape)):
       assert all(isinstance(s, (int, UOp)) for s in new_shape), f"{self.shape=} -> {new_shape=} contains non (int, Variable) dim"
-      if resolve(prod(self.shape) != prod(new_shape), False):
-        raise ValueError(f"size mismatched, can't reshape {self.shape=} -> {new_shape=}")
+      if resolve(prod(self.shape) != prod(new_shape), False): raise ValueError(f"size mismatched, can't reshape {self.shape=} -> {new_shape=}")

+    if 0 in self.shape: return View.create(new_shape)
     if new_shape == () and self.mask and any(mx==my for (mx,my) in self.mask): return None

     # after the asserts, it's okay to check contiguous
@@ -322,29 +290,26 @@ class View:
     if self_all_int and not all_int(new_shape):
       if len(self.shape) != len(new_shape): raise ValueError(f"cannot symbolic reshape non-contiguous {self} -> {new_shape}")
       for si, so in zip(self.shape, new_shape):
-        if isinstance(so, int):
-
-        else:
-          var_vals = dict([v.unbind() for v in so.vars()])
-          if si != sym_infer(so, var_vals): raise ValueError(f"cannot symbolic reshape non-contiguous {self} -> {new_shape}")
+        if not isinstance(so, int): so = sym_infer(so, dict([v.unbind() for v in so.vars()]))
+        if si != so: raise ValueError(f"cannot symbolic reshape non-contiguous {self} -> {new_shape}")
       # all dimensions matched, return the new view directly
       return View(new_shape, self.strides, self.offset, self.mask, self.contiguous)

-
-    for
+    r_strides, r_new_shape = [], reversed(new_shape)
+    for merged_size, new_stride, real_size in reversed(merge_dims(self.shape, self.strides, self.mask)):
+      # TODO: write with get_contraction
       acc = 1
       # TODO: third resolve shouldn't be needed
-      while resolve(acc <=
-
-
-
-
-
-
-
-
-
-
-    return View.create(new_shape, new_strides, self.offset + extra_offset, new_mask)
+      while resolve(acc <= merged_size) and resolve(acc != merged_size) and resolve((new_dim := next(r_new_shape, 0)) > 0):
+        r_strides.append(new_stride * acc)
+        acc = acc * new_dim
+        if not resolve(acc < real_size): new_stride = 0
+      if resolve(acc != merged_size): return None
+    new_strides = (0,) * (len(new_shape) - len(r_strides)) + tuple(r_strides[::-1])
+
+    if (new_mask:=_reshape_mask(self.mask, self.shape, new_shape)) is not None:
+      extra_offset = (sum(m[0] * s for m,s in zip(self.mask, self.strides)) if self.mask else 0) - \
+                     (sum(m[0] * s for m,s in zip(new_mask, new_strides)))
+      return View.create(new_shape, new_strides, self.offset + extra_offset, new_mask)

     return None
tinygrad/spec.py
ADDED
@@ -0,0 +1,154 @@
+from typing import cast
+from tinygrad.ops import PatternMatcher, UPat, GroupOp, Ops, UOp, print_uops
+from tinygrad.dtype import DType, ImageDType, dtypes, PtrDType
+from tinygrad.helpers import all_int, all_same, dedup, prod
+
+# *** this is the spec of a Tensor in UOp ***
+
+tensor_uop_spec = PatternMatcher([
+  (UPat(Ops.DEVICE, dtypes.void, (), name="device"), lambda device: isinstance(device.arg, str)),
+  (UPat(Ops.BUFFER, src=(UPat(Ops.DEVICE),), name="buf"),
+   lambda buf: isinstance(buf.arg, tuple) and len(buf.arg) == 2 and all_int(buf.arg) and isinstance(buf.dtype, (DType, ImageDType))),
+
+  (UPat(GroupOp.Movement, name="mv", src=(UPat.var("x"),)),
+   # naturally correct
+   lambda mv,x: (isinstance(mv.arg, tuple) and mv.dtype == x.dtype) or
+   # "make things that can't be images not images" can change the buffer dtype
+   # this is fine as long as it's a realized buffer and base dtypes match.
+   ((isinstance(mv.dtype, ImageDType) or isinstance(x.dtype, ImageDType)) and x.dtype.base == mv.dtype.base and x.is_realized)),
+
+  # Tensor variable bindings
+  (UPat(Ops.BIND, dtypes.int, (UPat(Ops.DEFINE_VAR), UPat.cvar(dtype=dtypes.int)), arg=None), lambda: True),
+
+  # Tensor const has a device and an unmasked ShapeTracker of stride 0
+  (UPat(Ops.CONST, src=(UPat(Ops.VIEW, name="st", src=(UPat(Ops.DEVICE),)),)),
+   lambda st: st.st.views[0].mask is None and len(st.st.views) == 1 and all(s == 0 for s in st.st.views[0].strides)),
+
+  # DETACH and CONTIGUOUS change how we interpret the source UOp
+  # CONTIGUOUS ensures the source UOp realizes
+  (UPat((Ops.DETACH, Ops.CONTIGUOUS, Ops.CONTIGUOUS_BACKWARD), name="root", src=(UPat.var("x"),), arg=None), lambda root,x: root.dtype == x.dtype),
+
+  # COPY
+  # NOTE: the arg here specifies clone=True, which prevents folding same device copy
+  (UPat(Ops.COPY, name="copy", src=(UPat(Ops.DEVICE), UPat.var("x"))), lambda copy,x: isinstance(copy.arg, bool) and copy.dtype == x.dtype),
+
+  # VIEW(BUFFER) applies a ShapeTracker on top of the underlying device buffer
+  # NOTE: VIEW size exactly matches the underlying BUFFER, tensor doesn't apply movement ops to the VIEW
+  (UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf"),)),
+   lambda view,buf: view.dtype == buf.dtype and view.size == buf.size and view.st.contiguous),
+
+  # ASSIGN changes the value of a realized buffer
+  (UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
+   lambda assign,target,new_val: target.is_realized and (assign.dtype == target.dtype == new_val.dtype)),
+])
+
+# ***** uop type spec *****
+
+# this is the matcher for the final rendered UOps
+# matcher functions returns True or False (or None to not match)
+spec = PatternMatcher([
+  (UPat(Ops.DEFINE_GLOBAL, name="x"), lambda x: isinstance(x.dtype, (PtrDType, ImageDType)) and not x.dtype.local),
+  (UPat(Ops.DEFINE_LOCAL, name="x"), lambda x: isinstance(x.dtype, PtrDType) and x.dtype.local),
+  (UPat(Ops.DEFINE_ACC, src=(UPat.var("c"),), name="x", allow_any_len=True),
+   lambda x,c: all(y.op is Ops.RANGE for y in x.src[1:]) and c.dtype == x.dtype),
+  (UPat(Ops.DEFINE_VAR, name="x"), lambda x: isinstance(x.arg[1], int) and isinstance(x.arg[2], int)),
+
+  (UPat(Ops.RANGE, src=(UPat.var("x"), UPat.var("y")), name="rng"), lambda rng,x,y: rng.dtype == x.dtype == y.dtype and isinstance(rng.arg, int)),
+  (UPat(Ops.SPECIAL, src=()), lambda: True),
+
+  # TODO: confirm the args of both of these are shapetrackers
+  (UPat(Ops.VIEW, dtypes.void, src=()), lambda: True),
+  (UPat(Ops.VIEW, src=(UPat.var("src"),), name="x"), lambda x,src: src.op is not Ops.STORE and x.dtype == src.dtype),
+
+  (UPat(Ops.VALID, dtypes.bool, (UPat(Ops.VIEW),)), lambda: True),
+  (UPat(Ops.CONST, name="x"), lambda x: type(x.arg) is type(dtypes.as_const(x.arg, x.dtype))),
+
+  # early LOAD has a <buf, shapetracker, store?>
+  (UPat(Ops.LOAD, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW))), lambda: True),
+  (UPat(Ops.LOAD, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW), UPat(Ops.STORE))), lambda: True),
+
+  # early STORE has a <buf, shapetracker, val>
+  (UPat(Ops.STORE, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW), UPat())), lambda: True),
+
+  # **** new style load/store ****
+
+  # INDEX is used in new style load/store
+  (UPat(Ops.INDEX, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat())), lambda: True),
+
+  # LOAD takes a <bufidx, alt?, gate?, barrier?>
+  (UPat(Ops.LOAD, src=(UPat((Ops.INDEX, Ops.CAST)),)), lambda: True),
+  (UPat(Ops.LOAD, src=(UPat((Ops.INDEX, Ops.CAST)), UPat((Ops.IF, Ops.BARRIER)))), lambda: True),
+  (UPat(Ops.LOAD, src=(UPat((Ops.INDEX, Ops.CAST)), UPat.var("alt"), UPat(dtype=dtypes.bool)), name="ld"), lambda ld,alt: ld.dtype == alt.dtype),
+
+  # STORE takes a <bufidx, val, gate?>
+  (UPat(Ops.STORE, dtype=dtypes.void, src=(UPat((Ops.INDEX, Ops.CAST)), UPat())), lambda: True),
+  (UPat(Ops.STORE, dtype=dtypes.void, src=(UPat((Ops.INDEX, Ops.CAST)), UPat(), UPat(dtype=dtypes.bool))), lambda: True),
+  (UPat(Ops.STORE, dtype=dtypes.void, src=(UPat((Ops.INDEX, Ops.CAST)), UPat(), UPat(Ops.IF))), lambda: True),
+
+  # most ALUs have all matching dtypes, except CMPLT, CMPNE, and WHERE
+  (UPat(Ops.WHERE, name="w", src=(UPat(dtype=dtypes.bool), UPat.var("x"), UPat.var("y"))), lambda w,x,y: w.dtype == x.dtype == y.dtype),
+  (UPat((Ops.CMPLT, Ops.CMPNE), dtype=dtypes.bool, src=(UPat.var("x"), UPat.var("y"))), lambda x,y: x.dtype.base == y.dtype.base),
+  # and SHL/SHR, the shift distance can be an int
+  (UPat((Ops.SHL, Ops.SHR), src=(UPat.var("x"), UPat.var("y")), name="a"), lambda a,x,y: a.dtype == x.dtype and y.dtype in (x.dtype, dtypes.uint)),
+  (UPat(Ops.IDIV, name="x"), lambda x: None if dtypes.is_int(x.dtype) else False),
+  (UPat(GroupOp.ALU, name="x"), lambda x: all(x.dtype.base == y.dtype.base for y in x.src)),
+
+  (UPat(Ops.ASSIGN, src=(UPat((Ops.DEFINE_ACC, Ops.DEFINE_GLOBAL)), UPat())), lambda: True),
+  (UPat(Ops.ENDRANGE, dtype=dtypes.void, src=(UPat(Ops.RANGE),)), lambda: True),
+
+  # WMMA has a <a, b, acc>
+  (UPat(Ops.WMMA, src=(UPat(), UPat(), UPat()), name="x"), lambda x: isinstance(x.arg, tuple) and len(x.arg) == 8),
+  (UPat(Ops.CONTRACT, name="x"), lambda x: x.dtype.count == prod(y[1] for y in x.arg)),
+  (UPat(Ops.UNROLL, name="x"), lambda x: x.src[0].dtype.count == prod(y[1] for y in x.arg)),
+
+  # if has a <gate, barrier?>
+  (UPat(Ops.IF, dtype=dtypes.void, src=(UPat(),)), lambda: True),
+  (UPat(Ops.IF, dtype=dtypes.void, src=(UPat(), UPat(Ops.BARRIER))), lambda: True),
+  (UPat(Ops.ENDIF, dtype=dtypes.void, src=(UPat(Ops.IF),)), lambda: True),
+
+  (UPat(Ops.REDUCE_AXIS, name="x"), lambda x: isinstance(x.arg, tuple) and len(x.arg) == 2 and x.arg[0] in {Ops.ADD, Ops.MUL, Ops.MAX}),
+  (UPat(Ops.GEP, src=(UPat.var("src"),), name="gep"), lambda gep,src: gep.dtype == src.dtype.scalar()),
+  (UPat(Ops.VECTORIZE, name="x"), lambda x: len(x.src)>1 and len(x.src) == x.dtype.count and all(x.dtype == y.dtype.vec(len(x.src)) for y in x.src)),
+  (UPat((Ops.BITCAST, Ops.CAST), src=(UPat(),), name="x"), lambda x: x.arg is None),
+  (UPat(Ops.BARRIER, dtypes.void, src=UPat(Ops.STORE, allow_any_len=True)), lambda: True), # NOTE: all pointers must be local
+
+  # NOTE: for testing, we let sinks be anything
+  #(UPat(UOps.SINK, src=UPat(UOps.STORE)), lambda: True),
+  (UPat(Ops.SINK, dtypes.void), lambda: True),
+  (UPat(Ops.NOOP), lambda: True),
+
+  # PTX LOAD/STORE
+  (UPat((Ops.LOAD, Ops.STORE), src=(UPat(dtype=dtypes.int64),), allow_any_len=True), lambda: True),
+])
+
+# *** this is the spec of a Kernel in UOp ***
+
+kernel_spec = PatternMatcher([
+  (UPat(Ops.DEVICE, src=()), lambda: True),
+  (UPat(Ops.BUFFER, src=(UPat(Ops.DEVICE),)), lambda: True),
+  # TODO: currently kernel only has buffer parents, this is incomplete. it should be BUFFER and ASSIGN
+  (UPat(Ops.KERNEL, src=UPat(Ops.BUFFER)), lambda: True),
+])
+
+# *** this is the UOp shape spec ***
+
+def verify_sink_dims(sink:UOp):
+  shape_dims = [sorted(dedup(dims)) for dims in zip(*[x.shape for x in sink.toposort if x.op is not Ops.SINK and x.st is not None])]
+  return all_same([x.st_arg.size for x in sink.src]) and all(len(x) == 1 or (len(x) == 2 and x[0] == 1) for x in shape_dims)
+
+shape_spec = PatternMatcher([
+  # shapes must have either 1 or n in each dimension
+  (UPat(Ops.SINK, src=UPat(Ops.STORE), allow_any_len=True, name="sink"), verify_sink_dims),
+  # all parent UOps must have the same shape
+  (UPat(GroupOp.All-{Ops.SINK}, name="root"), lambda root: all_same([x.shape for x in root.src if x.st is not None])),
+])
+
+# ***** uop helpers *****
+
+def type_verify(uops:list[UOp], *extra_specs:PatternMatcher):
+  specs = [spec, *extra_specs]
+  for i,u in enumerate(uops):
+    spec_ret = [cast(bool|None, s.rewrite(u)) for s in specs]
+    if any(ret is False for ret in spec_ret) or all(ret is None for ret in spec_ret):
+      print_uops(uops)
+      raise RuntimeError(f"UOp verification failed at {i} on {u.op} {u.dtype} {len(u.src)} {[x.op for x in u.src]} {u.arg}")