PyPI - tinygrad - Versions diffs - 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

tinygrad 0.8.0py3-none-any.whl → 0.9.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

tinygrad/__init__.py +6 -6
tinygrad/codegen/kernel.py +230 -190
tinygrad/codegen/linearizer.py +278 -384
tinygrad/codegen/uops.py +415 -0
tinygrad/device.py +132 -275
tinygrad/dtype.py +53 -37
tinygrad/engine/__init__.py +0 -0
tinygrad/engine/graph.py +100 -0
tinygrad/engine/jit.py +195 -0
tinygrad/engine/realize.py +191 -0
tinygrad/engine/schedule.py +362 -0
tinygrad/engine/search.py +196 -0
tinygrad/{mlops.py → function.py} +28 -14
tinygrad/helpers.py +72 -43
tinygrad/lazy.py +141 -240
tinygrad/multi.py +169 -0
tinygrad/nn/__init__.py +179 -8
tinygrad/nn/datasets.py +7 -0
tinygrad/nn/optim.py +106 -28
tinygrad/nn/state.py +86 -17
tinygrad/ops.py +70 -44
tinygrad/renderer/__init__.py +61 -0
tinygrad/renderer/assembly.py +276 -0
tinygrad/renderer/cstyle.py +299 -206
tinygrad/renderer/llvmir.py +118 -123
tinygrad/runtime/autogen/amd_gpu.py +1900 -0
tinygrad/runtime/autogen/comgr.py +865 -0
tinygrad/runtime/autogen/cuda.py +5923 -0
tinygrad/runtime/autogen/hip.py +5909 -0
tinygrad/runtime/autogen/hsa.py +5761 -0
tinygrad/runtime/autogen/kfd.py +812 -0
tinygrad/runtime/autogen/nv_gpu.py +33328 -0
tinygrad/runtime/autogen/opencl.py +1795 -0
tinygrad/runtime/driver/hip_comgr.py +47 -0
tinygrad/runtime/driver/hsa.py +143 -0
tinygrad/runtime/graph/clang.py +38 -0
tinygrad/runtime/graph/cuda.py +59 -54
tinygrad/runtime/graph/hcq.py +143 -0
tinygrad/runtime/graph/hsa.py +171 -0
tinygrad/runtime/graph/metal.py +37 -41
tinygrad/runtime/ops_amd.py +564 -0
tinygrad/runtime/ops_clang.py +16 -14
tinygrad/runtime/ops_cuda.py +130 -38
tinygrad/runtime/ops_disk.py +45 -42
tinygrad/runtime/ops_gpu.py +52 -50
tinygrad/runtime/ops_hsa.py +278 -0
tinygrad/runtime/ops_llvm.py +36 -56
tinygrad/runtime/ops_metal.py +42 -24
tinygrad/runtime/ops_npy.py +9 -0
tinygrad/runtime/ops_nv.py +630 -0
tinygrad/runtime/ops_python.py +204 -0
tinygrad/shape/shapetracker.py +41 -105
tinygrad/shape/symbolic.py +98 -95
tinygrad/shape/view.py +137 -35
tinygrad/tensor.py +2367 -442
{tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
{tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/METADATA +19 -9
tinygrad-0.9.0.dist-info/RECORD +60 -0
{tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
tinygrad/features/image.py +0 -93
tinygrad/features/multi.py +0 -103
tinygrad/features/search.py +0 -160
tinygrad/graph.py +0 -106
tinygrad/jit.py +0 -152
tinygrad/realize.py +0 -50
tinygrad/runtime/graph/hip.py +0 -24
tinygrad/runtime/ops_cpu.py +0 -45
tinygrad/runtime/ops_hip.py +0 -97
tinygrad/runtime/ops_torch.py +0 -49
tinygrad-0.8.0.dist-info/RECORD +0 -41
{tinygrad-0.8.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0

tinygrad/shape/symbolic.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import functools
 from math import gcd
 from tinygrad.helpers import partition
-from typing import List, Dict, Callable, Tuple, Type, Union, Optional, Any, Set
+from typing import List, Dict, Callable, Tuple, Type, Union, Optional, Any, Set, Mapping
 # NOTE: Python has different behavior for negative mod and floor div than c
 # symbolic matches the Python behavior, but the code output is agnostic, and will never have negative numbers in div or mod
@@ -10,14 +10,14 @@ from typing import List, Dict, Callable, Tuple, Type, Union, Optional, Any, Set
 class Node:
   b: Union[Node, int]
   min: int
-  max: int
+  max: sint
   def render(self, ops=None, ctx=None) -> Any:
     if ops is None: ops = render_python
     assert self.__class__ in (Variable, NumNode) or self.min != self.max
     return ops[type(self)](self, ops, ctx)
   def vars(self) -> Set[Variable]: return set()
   # substitute Variables with the values in var_vals
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: raise RuntimeError(self.__class__.__name__)
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: raise RuntimeError(self.__class__.__name__)
   def unbind(self) -> Tuple[Node, Optional[int]]: return self.substitute({v: v.unbind()[0] for v in self.vars() if v.val is not None}), None
   @functools.cached_property
@@ -32,7 +32,7 @@ class Node:
     if not isinstance(other, Node): return NotImplemented
     return self.key == other.key
   def __neg__(self): return self*-1
-  def __add__(self, b:Union[Node,int]): return Node.sum([self, b if isinstance(b, Node) else NumNode(b)])
+  def __add__(self, b:Union[Node,int]): return Node.sum([self, NumNode(b) if isinstance(b, int) else b])
   def __radd__(self, b:int): return self+b
   def __sub__(self, b:Union[Node,int]): return self+-b
   def __rsub__(self, b:int): return -self+b
@@ -43,24 +43,20 @@ class Node:
   def __mul__(self, b:Union[Node, int]):
     if b == 0: return NumNode(0)
     if b == 1: return self
-    if self.__class__ is NumNode: return NumNode(self.b*b) if isinstance(b, int) else b*self.b
     return create_node(MulNode(self, b.b)) if isinstance(b, NumNode) else create_node(MulNode(self, b))
   def __rmul__(self, b:int): return self*b
   # *** complex ops ***
-  def __rfloordiv__(self, b:int):
-    if self.min > b >= 0: return NumNode(0)
-    if isinstance(self, NumNode): return NumNode(b // self.b)
-    raise RuntimeError(f"not supported: {b} // {self}")
+  def __rfloordiv__(self, b:int): return NumNode(b) // self
   def __floordiv__(self, b:Union[Node,int], factoring_allowed=True):
     if isinstance(b, Node):
-      if b.__class__ is NumNode: return self // b.b
+      if b.__class__ is NumNode: return self.__floordiv__(b.b, factoring_allowed)
       if self == b: return NumNode(1)
       if (b - self).min > 0 and self.min >= 0: return NumNode(0) # b - self simplifies the node
       raise RuntimeError(f"not supported: {self} // {b}")
     assert b != 0
-    if b < 0: return (self//-b)*-1
+    if b < 0: return (self*-1).__floordiv__(-b, factoring_allowed)
     if b == 1: return self
     # the numerator of div is not allowed to be negative
@@ -70,10 +66,7 @@ class Node:
       return (self + -offset*b).__floordiv__(b, factoring_allowed=False) + offset
     return create_node(DivNode(self, b))
-  def __rmod__(self, b:int):
-    if self.min > b >= 0: return NumNode(b)
-    if isinstance(self, NumNode): return NumNode(b % self.b)
-    raise RuntimeError(f"not supported: {b} % {self}")
+  def __rmod__(self, b:int): return NumNode(b) % self
   def __mod__(self, b:Union[Node,int]):
     if isinstance(b, Node):
       if b.__class__ is NumNode: return self % b.b
@@ -102,7 +95,7 @@ class Node:
       else: mul_groups[node] = mul_groups.get(node, 0) + 1
     new_nodes = [MulNode(a, b_sum) if b_sum != 1 else a for a, b_sum in mul_groups.items() if b_sum != 0]
     if num_node_sum: new_nodes.append(NumNode(num_node_sum))
-    return create_rednode(SumNode, new_nodes) if len(new_nodes) > 1 else new_nodes[0] if len(new_nodes) == 1 else NumNode(0)
+    return create_node(SumNode(new_nodes)) if len(new_nodes) > 1 else new_nodes[0] if len(new_nodes) == 1 else NumNode(0)
   @staticmethod
   def ands(nodes:List[Node]) -> Node:
@@ -112,19 +105,20 @@ class Node:
     # filter 1s
     nodes = [x for x in nodes if x.min != x.max]
-    return create_rednode(AndNode, nodes) if len(nodes) > 1 else (nodes[0] if len(nodes) == 1 else NumNode(1))
+    return create_node(AndNode(nodes)) if len(nodes) > 1 else (nodes[0] if len(nodes) == 1 else NumNode(1))
 # 4 basic node types
 class Variable(Node):
   def __new__(cls, *args):
-    if len(args) == 0: return super().__new__(cls)   # fix pickle
     expr, nmin, nmax = args
     assert nmin >= 0 and nmin <= nmax, f"invalid Variable {expr=} {nmin=} {nmax=}"
     if nmin == nmax: return NumNode(nmin)
     return super().__new__(cls)
-  def __init__(self, expr:Optional[str], nmin:int, nmax:int):
+  def __getnewargs__(self): return (self.expr, self.min, self.max)  # args passed to __new__ when unpickling
+  def __init__(self, expr:str, nmin:int, nmax:sint):
     self.expr, self.min, self.max = expr, nmin, nmax
     self._val: Optional[int] = None
   @property
@@ -139,7 +133,7 @@ class Variable(Node):
     assert self.val is not None, f"cannot unbind {self}"
     return Variable(self.expr, self.min, self.max), self.val
   def vars(self): return {self}
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: return var_vals.get(self, self)
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return var_vals.get(self, self)
 class NumNode(Node):
   def __init__(self, num:int):
@@ -149,97 +143,129 @@ class NumNode(Node):
   def bind(self, val):
     assert self.b == val, f"cannot bind {val} to {self}"
     return self
+  def __mul__(self, b:Union[Node,int]): return NumNode(self.b*b) if isinstance(b, int) else b*self.b
   def __eq__(self, other): return self.b == other
-  def __hash__(self): return self.hash  # needed with __eq__ override
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: return self
+  def __hash__(self): return hash(self.b)  # needed with __eq__ override
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self
 def create_node(ret:Node):
   assert ret.min <= ret.max, f"min greater than max! {ret.min} {ret.max} when creating {type(ret)} {ret}"
   if ret.min == ret.max: return NumNode(ret.min)
   return ret
+def create_lt_node(lhs:Node, b:Union[Node, int]):
+  if isinstance(lhs, SumNode):
+    if isinstance(b, int):
+      new_sum = []
+      for x in lhs.nodes:
+        # TODO: should we just force the last one to always be the number
+        if isinstance(x, NumNode): b -= x.b
+        else: new_sum.append(x)
+      lhs = Node.sum(new_sum)
+      nodes = lhs.nodes if isinstance(lhs, SumNode) else [lhs]
+      assert all(not isinstance(node, MulNode) or isinstance(node.b, int) for node in nodes), "not supported"
+      muls, others = partition(nodes, lambda x: isinstance(x, MulNode) and x.b > 0 and x.max >= b)
+      if muls:
+        # NOTE: gcd in python 3.8 takes exactly 2 args
+        mul_gcd = b
+        for x in muls: mul_gcd = gcd(mul_gcd, x.b)  # type: ignore  # mypy cannot tell that x.b is int here due to assert above
+        all_others = Node.sum(others)
+        if all_others.min >= 0 and all_others.max < mul_gcd:
+          lhs, b = Node.sum([mul//mul_gcd for mul in muls]), b//mul_gcd
+    return create_node(LtNode(lhs, b)) if isinstance(lhs, SumNode) else create_lt_node(lhs, b)
+  if isinstance(lhs, MulNode):
+    if isinstance(b, Node) or isinstance(lhs.b, Node) or lhs.b == -1: return create_node(LtNode(lhs, b))
+    sgn = 1 if lhs.b > 0 else -1
+    return create_node(LtNode(lhs.a*sgn, (b + abs(lhs.b) - 1)//abs(lhs.b)))
+  return create_node(LtNode(lhs, b))
+def create_ge_node(lhs:Node, b:Union[Node, int]): return create_lt_node(-lhs, -b+1)
 class OpNode(Node):
   def __init__(self, a:Node, b:Union[Node, int]):
     self.a, self.b = a, b
     self.min, self.max = self.get_bounds()
   def vars(self): return self.a.vars() | (self.b.vars() if isinstance(self.b, Node) else set())
-  def get_bounds(self) -> Tuple[int, int]: raise NotImplementedError("must be implemented")
+  def get_bounds(self) -> Tuple[int, sint]: raise NotImplementedError("must be implemented")
 class LtNode(OpNode):
-  def __floordiv__(self, b: Union[Node, int], _=False): return (self.a//b) < (self.b//b)
   def get_bounds(self) -> Tuple[int, int]:
+    if self.a == self.b: return (0, 0)
     if isinstance(self.b, int): return (1, 1) if self.a.max < self.b else (0, 0) if self.a.min >= self.b else (0, 1)
     return (1, 1) if self.a.max < self.b.min else (0, 0) if self.a.min >= self.b.max else (0, 1)
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node:
-    return self.a.substitute(var_vals) < (self.b if isinstance(self.b, int) else self.b.substitute(var_vals))
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
+    return create_lt_node(self.a.substitute(var_vals), (self.b if isinstance(self.b, int) else self.b.substitute(var_vals)))
 class MulNode(OpNode):
-  def __lt__(self, b: Union[Node, int]):
-    if isinstance(b, Node) or isinstance(self.b, Node) or self.b == -1: return Node.__lt__(self, b)
-    sgn = 1 if self.b > 0 else -1
-    return Node.__lt__(self.a*sgn, (b + abs(self.b) - 1)//abs(self.b))
   def __mul__(self, b: Union[Node, int]): return self.a*(self.b*b) # two muls in one mul
   def __floordiv__(self, b: Union[Node, int], factoring_allowed=False): # NOTE: mod negative isn't handled right
     if self.b % b == 0: return self.a*(self.b//b)
     if b % self.b == 0 and self.b > 0: return self.a//(b//self.b)
     return Node.__floordiv__(self, b, factoring_allowed)
   def __mod__(self, b: Union[Node, int]): return Node.__mod__(self.a * (self.b%b), b)
-  def get_bounds(self) -> Tuple[int, int]: return (self.a.min*self.b, self.a.max*self.b) if self.b >= 0 else (self.a.max*self.b, self.a.min*self.b)
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node:
+  def get_bounds(self) -> Tuple[int, sint]:
+    assert self.a.min >= 0
+    if isinstance(self.b, int): return (self.a.min*self.b, self.a.max*self.b) if self.b >= 0 else (self.a.max*self.b, self.a.min*self.b)
+    return (self.a.min*self.b.min, self.a.max*self.b.max) if self.b.min >= 0 else (self.a.max*self.b.min, self.a.min*self.b.max)
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
     return self.a.substitute(var_vals) * (self.b if isinstance(self.b, int) else self.b.substitute(var_vals))
 class DivNode(OpNode):
   def __floordiv__(self, b: Union[Node, int], _=False): return self.a//(self.b*b) # two divs is one div
-  def get_bounds(self) -> Tuple[int, int]:
+  def get_bounds(self) -> Tuple[int, sint]:
     assert self.a.min >= 0 and isinstance(self.b, int)
     return self.a.min//self.b, self.a.max//self.b
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: return self.a.substitute(var_vals) // self.b
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self.a.substitute(var_vals) // self.b
 class ModNode(OpNode):
   def __mod__(self, b: Union[Node, int]):
-    if isinstance(b, Node) or isinstance(self.b, Node): return Node.__mod__(self, b)
-    return self.a % b if self.b % b == 0 else Node.__mod__(self, b)
+    if isinstance(b, int) and isinstance(self.b, int) and self.b % b == 0: return self.a % b
+    return Node.__mod__(self, b)
   def __floordiv__(self, b: Union[Node, int], factoring_allowed=True):
     return (self.a//b) % (self.b//b) if self.b % b == 0 else Node.__floordiv__(self, b, factoring_allowed)
-  def get_bounds(self) -> Tuple[int, int]:
+  def get_bounds(self) -> Tuple[int, sint]:
     assert self.a.min >= 0 and isinstance(self.b, int)
-    return (0, self.b-1) if self.a.max - self.a.min >= self.b or (self.a.min != self.a.max and self.a.min%self.b >= self.a.max%self.b) else (self.a.min%self.b, self.a.max%self.b)  # noqa: E501
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: return self.a.substitute(var_vals) % self.b
+    if self.a.max - self.a.min >= self.b or (self.a.min != self.a.max and self.a.min%self.b >= self.a.max%self.b): return (0, self.b-1)
+    return (self.a.min%self.b, self.a.max%self.b)
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self.a.substitute(var_vals) % self.b
 class RedNode(Node):
-  def __init__(self, nodes:List[Node]): self.nodes = nodes
+  def __init__(self, nodes:List[Node]):
+    self.nodes = nodes
+    self.min, self.max = self.get_bounds()
   def vars(self) -> Set[Variable]: return set.union(*[x.vars() for x in self.nodes], set())
+  def get_bounds(self) -> Tuple[int, sint]: raise NotImplementedError("must be implemented")
 class SumNode(RedNode):
+  def get_bounds(self) -> Tuple[int, sint]: return sum([x.min for x in self.nodes]), sum([x.max for x in self.nodes])
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
   def __mul__(self, b: Union[Node, int]): return Node.sum([x*b for x in self.nodes]) # distribute mul into sum
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def __floordiv__(self, b: Union[Node, int], factoring_allowed=True):
+  def __floordiv__(self, b: Union[Node, sint], factoring_allowed=True):
+    if self == b: return NumNode(1)
     fully_divided: List[Node] = []
     rest: List[Node] = []
-    if isinstance(b, SumNode):
-      nu_num = sum(node.b for node in self.flat_components if node.__class__ is NumNode)
-      de_num = sum(node.b for node in b.flat_components if node.__class__ is NumNode)
-      if nu_num > 0 and de_num and (d:=nu_num//de_num) > 0: return NumNode(d) + (self-b*d) // b
     if isinstance(b, Node):
       for x in self.flat_components:
         if x % b == 0: fully_divided.append(x // b)
         else: rest.append(x)
-      if (sum_fully_divided:=create_rednode(SumNode, fully_divided)) != 0: return sum_fully_divided + create_rednode(SumNode, rest) // b
+      if (sum_fully_divided:=create_node(SumNode(fully_divided))) != 0: return sum_fully_divided + create_node(SumNode(rest)) // b
       return Node.__floordiv__(self, b, False)
     if b == 1: return self
     if not factoring_allowed: return Node.__floordiv__(self, b, factoring_allowed)
-    fully_divided, rest = [], []
     _gcd = b
     divisor = 1
     for x in self.flat_components:
       if x.__class__ in (NumNode, MulNode):
-        if x.b%b == 0: fully_divided.append(x//b)
+        if x.b % b == 0: fully_divided.append(x // b)
         else:
+          if x.__class__ is NumNode and (div := x.b // b):
+            fully_divided.append(NumNode(div))
+            x = NumNode(x.b - b * div)
           rest.append(x)
           if isinstance(x.b, int):
             _gcd = gcd(_gcd, x.b)
-            if x.__class__ == MulNode and divisor == 1 and b%x.b == 0: divisor = x.b
+            if x.__class__ == MulNode and divisor == 1 and b % x.b == 0: divisor = x.b
           else:
             _gcd = 1
       else:
@@ -251,39 +277,13 @@ class SumNode(RedNode):
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
   def __mod__(self, b: Union[Node, int]):
-    if isinstance(b, SumNode):
-      nu_num = sum(node.b for node in self.flat_components if node.__class__ is NumNode)
-      de_num = sum(node.b for node in b.flat_components if node.__class__ is NumNode)
-      if nu_num > 0 and de_num and (d:=nu_num//de_num) > 0: return (self-b*d) % b
+    if self == b: return NumNode(0)
     if isinstance(b, Node) and (b - self).min > 0: return self # b - self simplifies the node
-    new_nodes: List[Node] = []
-    for x in self.nodes:
-      if x.__class__ in (NumNode, MulNode): new_nodes.append(x%b) # might simplify
-      else: new_nodes.append(x)
-    return Node.__mod__(Node.sum(new_nodes), b)
-  def __lt__(self, b:Union[Node,int]):
-    lhs: Node = self
-    if isinstance(b, int):
-      new_sum = []
-      for x in self.nodes:
-        # TODO: should we just force the last one to always be the number
-        if isinstance(x, NumNode): b -= x.b
-        else: new_sum.append(x)
-      lhs = Node.sum(new_sum)
-      nodes = lhs.nodes if isinstance(lhs, SumNode) else [lhs]
-      assert all(not isinstance(node, MulNode) or isinstance(node.b, int) for node in nodes), "not supported"
-      muls, others = partition(nodes, lambda x: isinstance(x, MulNode) and x.b > 0 and x.max >= b)
-      if muls:
-        # NOTE: gcd in python 3.8 takes exactly 2 args
-        mul_gcd = b
-        for x in muls: mul_gcd = gcd(mul_gcd, x.b)  # type: ignore  # mypy cannot tell that x.b is int here due to assert above
-        all_others = Node.sum(others)
-        if all_others.min >= 0 and all_others.max < mul_gcd:
-          lhs, b = Node.sum([mul//mul_gcd for mul in muls]), b//mul_gcd
-    return Node.__lt__(lhs, b) if isinstance(lhs, SumNode) else lhs < b
+    new_sum = Node.sum([node%b if node.__class__ in (NumNode, MulNode) else node for node in self.nodes])
+    return Node.__mod__(new_sum, b)
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node: return Node.sum([node.substitute(var_vals) for node in self.nodes])
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
+    return Node.sum([node.substitute(var_vals) for node in self.nodes])
   # recursively expand sumnode components
   # TODO: can remove this if there's no SumNode inside SumNode
@@ -291,36 +291,39 @@ class SumNode(RedNode):
   def flat_components(self): return [y for x in self.nodes for y in (x.flat_components if isinstance(x, SumNode) else [x])]
 class AndNode(RedNode):
-  def substitute(self, var_vals: Dict[Variable, Node]) -> Node:
+  def get_bounds(self) -> Tuple[int, sint]: return min([x.min for x in self.nodes]), max([x.max for x in self.nodes])
+  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
     subed = []
     for node in self.nodes:
       if not (sub:=node.substitute(var_vals)): return NumNode(0)
       subed.append(sub)
     return Node.ands(subed)
-def create_rednode(typ:Type[RedNode], nodes:List[Node]):
-  ret = typ(nodes)
-  if typ == SumNode: ret.min, ret.max = (sum([x.min for x in nodes]), sum([x.max for x in nodes]))
-  elif typ == AndNode: ret.min, ret.max = (min([x.min for x in nodes]), max([x.max for x in nodes]))
-  return create_node(ret)
 def sym_render(a: Union[Node, int], ops=None, ctx=None) -> str: return str(a) if isinstance(a, int) else a.render(ops, ctx)
-def sym_infer(a: Union[Node, int], var_vals: Dict[Variable, int]) -> int:
+def sym_infer(a: Union[Node, int], var_vals: Optional[Dict[Variable, int]]) -> int:
   if isinstance(a, (int, float)): return a
-  ret = a.substitute({k:NumNode(v) for k, v in var_vals.items()})
+  ret = a.substitute({k:NumNode(v) for k, v in var_vals.items()}) if var_vals is not None else a
   assert isinstance(ret, NumNode), f"sym_infer didn't produce NumNode from {a} with {var_vals}"
   return ret.b
-# symbolic int
-sint = Union[Node, int]
+# symbolic int, these are allowed in a Tensor shape
+sint = Union[int, Variable, MulNode, SumNode]
+def render_mulnode(node:MulNode, ops, ctx):
+  # TODO: add ProdNode and remove this case
+  if isinstance(node.a,Variable) and isinstance(node.b,Variable) and node.a.expr and node.b.expr and node.b.expr < node.a.expr:
+    return f"({sym_render(node.b,ops,ctx)}*{node.a.render(ops,ctx)})"
+  return f"({node.a.render(ops,ctx)}*{sym_render(node.b,ops,ctx)})"
-render_python: Dict[Type, Callable] = {
-  Variable: lambda self,ops,ctx: f"{self.expr}[{self.min}-{self.max}{'='+str(self.val) if self._val is not None else ''}]" if ctx == "DEBUG" else (f"Variable('{self.expr}', {self.min}, {self.max})"+(f".bind({self.val})" if self._val is not None else '') if ctx == "REPR" else f"{self.expr}"),  # noqa: E501
+render_python: Dict[Type, Callable[..., str]] = {
+  Variable: lambda self,ops,ctx: f"{self.expr}[{self.min}-{self.max}{'='+str(self.val) if self._val is not None else ''}]" if ctx == "DEBUG" \
+    else (f"Variable('{self.expr}', {self.min}, {self.max})"+(f".bind({self.val})" if self._val is not None else '') if ctx == "REPR" \
+    else f"{self.expr}"),
   NumNode: lambda self,ops,ctx: f"NumNode({self.b})" if ctx == "REPR" else f"{self.b}",
-  MulNode: lambda self,ops,ctx: f"({sym_render(self.b,ops,ctx)}*{self.a.render(ops,ctx)})" if isinstance(self.a,Variable) and isinstance(self.b,Variable) and self.a.expr and self.b.expr and self.b.expr < self.a.expr else f"({self.a.render(ops,ctx)}*{sym_render(self.b,ops,ctx)})",  # noqa: E501
+  MulNode: render_mulnode,
   DivNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}//{self.b})",
   ModNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}%{self.b})",
   LtNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}<{sym_render(self.b,ops,ctx)})",
   SumNode: lambda self,ops,ctx: f"({'+'.join(sorted([x.render(ops,ctx) for x in self.nodes]))})",
-  AndNode: lambda self,ops,ctx: f"({' and '.join(sorted([x.render(ops,ctx) for x in self.nodes]))})"
+  AndNode: lambda self,ops,ctx: f"({' and '.join(sorted([x.render(ops,ctx) for x in self.nodes]))})",
 }

tinygrad/shape/view.py CHANGED Viewed

@@ -1,35 +1,35 @@
 from __future__ import annotations
-import functools, operator
+import functools, operator, itertools, math
 from dataclasses import dataclass
-from typing import Tuple, List, Optional, Dict, cast
+from typing import Tuple, List, Optional, Dict, Set, cast
 from tinygrad.helpers import prod, all_int, argsort
-from tinygrad.shape.symbolic import Node, NumNode, Variable, Set, sint
+from tinygrad.shape.symbolic import Node, NumNode, Variable, sint
 @functools.lru_cache(maxsize=None)
-def filter_strides(shape:Tuple[int, ...], strides:Tuple[int, ...]) -> Tuple[int, ...]:
-  return tuple(stride if shp != 1 else 0 for stride, shp in zip(strides, shape))
+def canonicalize_strides(shape:Tuple[sint, ...], strides:Tuple[sint, ...]) -> Tuple[sint, ...]:
+  return tuple(0 if s == 1 else st for s, st in zip(shape, strides))
 @functools.lru_cache(maxsize=None)
-def strides_for_shape(shape:Tuple[int, ...]) -> Tuple[int, ...]:
-  strides = [1] if shape else []
-  for d in reversed(shape[1:]): strides.append(d*strides[-1])
-  return filter_strides(shape, tuple(reversed(strides)))
+def strides_for_shape(shape:Tuple[sint, ...]) -> Tuple[sint, ...]:
+  if not shape: return ()
+  strides = tuple(itertools.accumulate(reversed(shape[1:]), operator.mul, initial=1))
+  return canonicalize_strides(shape, strides[::-1])
 @functools.lru_cache(maxsize=None)
-def _merge_dims(shape:Tuple[int, ...], strides:Tuple[int, ...], mask:Optional[Tuple[Tuple[int, int], ...]] = None) -> Tuple[Tuple[int, int, int], ...]:  # noqa: E501
+def _merge_dims(shape:Tuple[int, ...], strides:Tuple[int, ...], mask:Optional[Tuple[Tuple[int, int], ...]]=None) -> Tuple[Tuple[int, int, int], ...]:
   # merge contiguous subparts or zero strided dims. ret = List[(merged_dims, stride, merged dims w/o zero stride), ...]
   if not shape: return tuple()
   assert len(shape) == len(strides)
   ret = [(shape[0], strides[0], shape[0] if strides[0] else 0)]
-  # state (0, 1, 2) -> (none, in-progress, done). wrt merging zero strided dimensions
-  state = 1 if mask and strides[0] == 0 and shape[0] != 1 and mask[0][1] - mask[0][0] == 1 else 0
+  # wrt merging zero strided dimensions
+  merging = strides[0] == 0 and (mask[0][1] - mask[0][0] == 1 if mask else shape[0] == 1)
   for i, (sh, st) in enumerate(zip(shape[1:], strides[1:]), start=1):
     if sh == 1: continue
-    if state == 1 or ret[-1][1] == sh * st: # mergeable
-      ret[-1] = (ret[-1][0] * sh, st, (sh if state == 1 else ret[-1][2] * sh) if st else 0)
+    if merging or ret[-1][1] == sh * st: # mergeable
+      ret[-1] = (ret[-1][0] * sh, st, (sh if merging else ret[-1][2] * sh) if st else 0)
     else: ret.append((sh, st, sh if st else 0)) # begin new
     # merging ends with either non-zero strided dim or zero strided dim with mask range > 1
-    state = 1 if (st == 0 and mask and mask[i][1] - mask[i][0] == 1) else (2 if state != 0 else 0)
+    merging = st == 0 and (mask[i][1] - mask[i][0] == 1 if mask else sh == 1)
   return tuple(ret)
 @functools.lru_cache(maxsize=None)
@@ -52,7 +52,8 @@ def _reshape_mask(view: View, new_shape:Tuple[sint, ...]) -> Tuple[Optional[Tupl
         if mask[1] - mask[0] < 1: return ((0, 0),) * len(new_shape), False # invalid mask
       else: # mask can only be splitted if reshape doesn't cut across the mask.
-        if ((l % next_stride != 0 or r % next_stride != 0) and l // next_stride != (r - 1) // next_stride): return view.mask, True
+        if (((l % next_stride != 0 or r % next_stride != 0) and l // next_stride != (r - 1) // next_stride)
+            or old_dim % next_stride != 0): return view.mask, True
         new_mask.append((l % next_stride // curr_stride, (r - 1) % next_stride // curr_stride + 1))
         curr_stride, new_dim = next_stride,  next(r_new_shape, 1) # need to get mask for next dimension
@@ -67,6 +68,15 @@ def _reshape_mask(view: View, new_shape:Tuple[sint, ...]) -> Tuple[Optional[Tupl
   return tuple(reversed(new_mask)), False
+def un1d(shape:Tuple[sint, ...], offs:sint) -> List[sint]:
+  strides = strides_for_shape(shape)
+  result = []
+  for stride in strides:
+    here = offs // stride if stride else 0
+    result.append(here)
+    offs -= here * stride
+  return result
 @dataclass(frozen=True)
 class View:
   shape:Tuple[sint, ...]
@@ -75,11 +85,28 @@ class View:
   mask:Optional[Tuple[Tuple[sint, sint], ...]]
   contiguous:bool
+  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
+  def size(self) -> int:
+    # NOTE: Variable and the Node derived from it in symbolic shapes can only have int as max.
+    ret = prod([x.max if isinstance(x, Node) else x for x in self.shape])
+    assert isinstance(ret, int), f"{ret=} is not int"
+    return ret
   @staticmethod
   @functools.lru_cache(maxsize=None)
   def create(shape:Tuple[sint, ...], strides:Optional[Tuple[sint, ...]]=None, offset:sint=0, mask:Optional[Tuple[Tuple[sint, sint], ...]]=None):
-    strides = filter_strides(shape, strides) if strides else strides_for_shape(shape)
+    strides = canonicalize_strides(shape, strides) if strides else strides_for_shape(shape)
+    # canonicalize empty mask
+    if mask is not None and all(m == (0,s) for m,s in zip(mask, shape)): mask = None
     contiguous = offset == 0 and mask is None and strides == strides_for_shape(shape)
+    # if any dimension has size >1, but is masked such that only one index in the dimension is unmasked
+    # then its stride can also be set to 0, albeit with a corresponding adjustment required to the offset
+    # TODO: assert comparison with LtNode to avoid mis-using symbolic
+    if mask and any(elim := [not (b+1 < e) for b,e in mask]):
+      if any(not (b < e) for b,e in mask):
+        strides, offset, mask = (0,) * len(shape), 0, ((0,0),) * len(shape)
+      offset += sum((strides[i] * mask[i][0]) if e else 0 for i, e in enumerate(elim))
+      strides = tuple(0 if e else st for st,e in zip(strides, elim))
     return View(shape, strides, offset, mask, contiguous)
   @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
@@ -88,13 +115,81 @@ class View:
     return functools.reduce(operator.or_, [x.vars() for x in self.shape+self.strides+(self.offset,)+flatten_mask if isinstance(x, Node)], set())
   @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
-  def unbind(self) -> View:
-    unbound_vars:Dict[Variable,Node] = {v: v.unbind()[0] for v in self.vars() if v.val is not None}
+  def unbind(self) -> Tuple[View, Dict[Variable, int]]:
+    var_unboundvar_val = [(v, v.unbind()) for v in self.vars() if v.val is not None]
+    unbound_vars = {v:uv for v,(uv,_) in var_unboundvar_val}
     new_shape = tuple([s if isinstance(s, int) else s.substitute(unbound_vars) for s in self.shape])
     new_strides = tuple([s if isinstance(s, int) else s.substitute(unbound_vars) for s in self.strides])
     new_offset = self.offset if isinstance(self.offset, int) else self.offset.substitute(unbound_vars)
-    new_mask = tuple((a if isinstance(a, int) else a.substitute(unbound_vars), b if isinstance(b, int) else b.substitute(unbound_vars)) for (a, b) in self.mask) if self.mask is not None else None  # noqa: E501
-    return View.create(new_shape, new_strides, new_offset, new_mask)
+    new_mask = tuple((a if isinstance(a, int) else a.substitute(unbound_vars),
+                      b if isinstance(b, int) else b.substitute(unbound_vars)) for (a, b) in self.mask) if self.mask is not None else None
+    return View.create(new_shape, new_strides, new_offset, new_mask), dict(x[1] for x in var_unboundvar_val)
+  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
+  def __add__(self, vm1:View) -> Optional[View]:
+    vm2 = self
+    if vm2.contiguous: return vm1
+    if vm1.contiguous and vm1.shape == vm2.shape: return vm2
+    if vm1.contiguous and vm1.size() == vm2.size() and (ret := vm2.reshape(vm1.shape)) is not None: return ret
+    if vm1.mask:
+      for b,e in vm1.mask:
+        if not (b < e): return View.create(vm1.shape, (0,) * len(vm1.shape), 0, ((0,0),) * len(vm1.shape))
+      return (merged := vm2 + vm1.shrink(vm1.mask)) and merged.pad(tuple((b,s-e) for (b,e),s in zip(vm1.mask, vm1.shape)))
+    # Project vm1's offset and strides on to vm2.
+    origin = un1d(vm2.shape, vm1.offset)
+    terms: List[List[Tuple[int, sint]]] = [[] for _ in origin]
+    strides: List[sint] = [0] * len(vm1.shape)
+    for d1, st in enumerate(vm1.strides):
+      if st == 0: continue
+      for d2, (o, s1) in enumerate(zip(origin, un1d(vm2.shape, vm1.offset + st))):
+        if (s1 := s1 - o) == 0: continue
+        terms[d2].append((d1, s1))
+        strides[d1] += s1 * vm2.strides[d2]
+    # Merge dimensions in vm2 if required.
+    # NB: Merging too many dimensions can make it difficult to project vm2's mask, hence only combining when required.
+    idxs: List[Node] = [Variable(f"idx{i}", 0, s-1) for i,s in enumerate(vm1.shape)]
+    merged_size, merged_term = 1, NumNode(0)
+    extents: List[Tuple[sint, Node]] = []
+    for term, s, o in zip(reversed(terms), reversed(vm2.shape), reversed(origin)):
+      merged_term += Variable.sum([idxs[d1] * (s1 * merged_size) for d1, s1 in term]) + o * merged_size
+      merged_size *= s
+      if not (merged_term >= merged_size) and not (merged_term < 0):
+        extents.append((merged_size, merged_term))
+        merged_size, merged_term = 1, NumNode(0)
+    if merged_term: return None
+    if (vm2_shape := tuple(s for s,_ in reversed(extents))) != vm2.shape:
+      return (reshaped_vm2 := vm2.reshape(vm2_shape)) and reshaped_vm2 + vm1
+    if vm2.mask:
+      # Try to project vm2's mask on to vm1.
+      newb, newe, bad = [0] * len(vm1.shape), list(vm1.shape), False
+      for d2, ((b, e), o, (_, t)) in enumerate(zip(vm2.mask, origin, reversed(extents))):
+        if not (t.min < b or t.max >= e): continue
+        if not isinstance(o, int) or not isinstance(b, int) or not isinstance(e, int):
+          bad = True
+          continue
+        term = terms[d2]
+        if len(term) != 1:
+          if not term and newe: newe[0] = 0
+          else: bad = True
+          continue
+        d1, s1 = term[0]
+        if not isinstance(s1, int) or not isinstance(newe[d1], int):
+          bad = True
+          continue
+        newb[d1] = max(newb[d1], math.ceil((b - o if s1 > 0 else e - o - 1) / s1))
+        newe[d1] = min(newe[d1], (b - o if s1 < 0 else e - o - 1) // s1 + 1)
+      # If any of vm1 was masked off, try again with that mask in place.
+      for b, e, s in zip(newb, newe, vm1.shape):
+        if b != 0 or e != s:
+          return vm2 + View.create(vm1.shape, vm1.strides, vm1.offset, tuple(zip(newb, newe)))
+      # Otherwise if vm2's mask was violated, then cannot merge.
+      if bad: return None
+    return View.create(vm1.shape, tuple(strides), sum(o * s for o, s in zip(origin, vm2.strides)) + vm2.offset)
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
   def invert(self, out_shape:Tuple[sint, ...]) -> Optional[View]:
@@ -103,7 +198,10 @@ class View:
     ret = ret.stride(tuple(-1 if x < 0 else 1 for x in self.strides)).permute(argsort(tuple(-x if x > 0 else x for x in self.strides)))
     return ret if prod(ret.shape) == prod(out_shape) else None   # don't support shrink, expand, or stride != (-1, 1)
-  # MovementOps live here now
+  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
+  def minify(self):
+    min_shape = tuple(x[0] for x in _merge_dims(self.shape, self.strides, self.mask))
+    return nv if (nv := self.reshape(min_shape)) else self
   def __unsafe_resize(self, arg: Tuple[Tuple[sint, sint], ...], mask=None) -> View:
     offset = sum([s * x[0] for s, x in zip(self.strides,arg)])
@@ -117,8 +215,8 @@ class View:
     return View.create(tuple(s.b if isinstance(s, NumNode) else s for s in shape), self.strides, self.offset+offset, mask)
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def pad(self, arg: Tuple[Tuple[int, int], ...]) -> View:
-    assert all((b>=0 and e>=0) for b,e in arg) and len(arg) == len(self.shape)
+  def pad(self, arg: Tuple[Tuple[sint, sint], ...]) -> View:
+    assert all((b>=0 and e>=0) for b,e in arg) and len(arg) == len(self.shape), f"{self.shape=}, {arg=}"
     if any(b or e for b, e in arg):
       zvarg = tuple([(-b,s+e) for s,(b,e) in zip(self.shape, arg)])
       mask = tuple([(b,s+b) for s,(b,_) in zip(self.shape, arg)])
@@ -145,7 +243,8 @@ class View:
   def permute(self, axis: Tuple[int, ...]) -> View:
     assert all(isinstance(x, int) and x >= 0 and x < len(self.shape) for x in axis), f"invalid permute {axis} for {self.shape}"
     assert len(set(axis)) == len(axis) and len(axis) == len(self.shape), f"can't permute {self.shape} with {axis}"
-    return View.create(tuple([self.shape[a] for a in axis]), tuple([self.strides[a] for a in axis]), self.offset, tuple([self.mask[a] for a in axis]) if self.mask is not None else None)  # noqa: E501
+    return View.create(tuple(self.shape[a] for a in axis), tuple(self.strides[a] for a in axis), self.offset,
+                       tuple(self.mask[a] for a in axis) if self.mask is not None else None)
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
   def stride(self, mul: Tuple[int, ...]) -> View:
@@ -154,7 +253,8 @@ class View:
     strides = tuple([z*m for z,m in zip(self.strides, mul)])
     new_shape = tuple([(s+(abs(m)-1))//abs(m) for s,m in zip(self.shape, mul)])
     offset = sum([(s-1)*z for s,z,m in zip(self.shape, self.strides, mul) if m < 0])
-    mask = tuple([(((mx if m > 0 else s-my)+(abs(m)-1))//abs(m), ((my if m > 0 else s-mx)+(abs(m)-1))//abs(m)) for (mx,my),s,m in zip(self.mask, self.shape, mul)]) if self.mask is not None else None  # noqa: E501
+    mask = tuple([(((mx if m > 0 else s-my)+(abs(m)-1))//abs(m), ((my if m > 0 else s-mx)+(abs(m)-1))//abs(m)) \
+                  for (mx,my),s,m in zip(self.mask, self.shape, mul)]) if self.mask is not None else None
     return View.create(new_shape, strides, self.offset + offset, mask)
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
@@ -177,18 +277,20 @@ class View:
     if self.contiguous: return View.create(new_shape)
     strides, r_new_shape = [], reversed(new_shape)
-    for merged_dim, s, real_dim in reversed(_merge_dims(self.shape, self.strides, self.mask)):
-      acc, new_stride = 1, s
+    for merged_dim, new_stride, real_dim in reversed(_merge_dims(self.shape, self.strides, self.mask)):
+      acc = 1
+      # TODO: this <= and != is for symbolic!?
       while acc <= merged_dim and acc != merged_dim and (new_dim := next(r_new_shape, None)):
-        strides.append(new_stride if new_dim != 1 else 0)
-        if new_dim == 1: continue
-        new_stride *= (new_dim if (acc :=  acc * new_dim) < real_dim else 0)
+        strides.append(new_stride)
+        if new_dim != 1: new_stride *= (new_dim if (acc :=  acc * new_dim) < real_dim else 0)
       if acc != merged_dim: break
     else:
       strides += [0,] * (len(new_shape) - len(strides))
-      mask, extra = _reshape_mask(self, new_shape)
-      fstrides = filter_strides(tuple(e-b for b,e in mask) if mask else new_shape, tuple(reversed(strides)))
-      extra_offset = (sum(m[0] * s for m,s in zip(self.mask, self.strides)) if self.mask else 0) - (sum(m[0] * s for m,s in zip(mask, fstrides)) if mask else 0) # noqa: E501
-      if not extra: return View.create(new_shape, fstrides, self.offset + extra_offset, mask)
+      new_mask, extra = _reshape_mask(self, new_shape)
+      if not extra:
+        new_strides = canonicalize_strides(tuple(e-b for b,e in new_mask) if new_mask else new_shape, tuple(reversed(strides)))
+        extra_offset = (sum(m[0] * s for m,s in zip(self.mask, self.strides)) if self.mask else 0) - \
+                       (sum(m[0] * s for m,s in zip(new_mask, new_strides)) if new_mask else 0)
+        return View.create(new_shape, new_strides, self.offset + extra_offset, new_mask)
     return None

tinygrad 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

tinygrad 0.8.0py3-none-any.whl → 0.9.0py3-none-any.whl