PyPI - tinygrad - Versions diffs - 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

tinygrad 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

tinygrad/__init__.py +6 -0
tinygrad/codegen/kernel.py +572 -83
tinygrad/codegen/linearizer.py +415 -395
tinygrad/codegen/uops.py +415 -0
tinygrad/device.py +183 -0
tinygrad/dtype.py +113 -0
tinygrad/engine/__init__.py +0 -0
tinygrad/engine/graph.py +100 -0
tinygrad/engine/jit.py +195 -0
tinygrad/engine/realize.py +191 -0
tinygrad/engine/schedule.py +362 -0
tinygrad/engine/search.py +196 -0
tinygrad/{mlops.py → function.py} +76 -55
tinygrad/helpers.py +196 -89
tinygrad/lazy.py +210 -371
tinygrad/multi.py +169 -0
tinygrad/nn/__init__.py +202 -22
tinygrad/nn/datasets.py +7 -0
tinygrad/nn/optim.py +112 -32
tinygrad/nn/state.py +136 -39
tinygrad/ops.py +119 -202
tinygrad/renderer/__init__.py +61 -0
tinygrad/renderer/assembly.py +276 -0
tinygrad/renderer/cstyle.py +353 -166
tinygrad/renderer/llvmir.py +150 -138
tinygrad/runtime/autogen/amd_gpu.py +1900 -0
tinygrad/runtime/autogen/comgr.py +865 -0
tinygrad/runtime/autogen/cuda.py +5923 -0
tinygrad/runtime/autogen/hip.py +5909 -0
tinygrad/runtime/autogen/hsa.py +5761 -0
tinygrad/runtime/autogen/kfd.py +812 -0
tinygrad/runtime/autogen/nv_gpu.py +33328 -0
tinygrad/runtime/autogen/opencl.py +1795 -0
tinygrad/runtime/driver/hip_comgr.py +47 -0
tinygrad/runtime/driver/hsa.py +143 -0
tinygrad/runtime/graph/clang.py +38 -0
tinygrad/runtime/graph/cuda.py +81 -0
tinygrad/runtime/graph/hcq.py +143 -0
tinygrad/runtime/graph/hsa.py +171 -0
tinygrad/runtime/graph/metal.py +75 -0
tinygrad/runtime/ops_amd.py +564 -0
tinygrad/runtime/ops_clang.py +24 -77
tinygrad/runtime/ops_cuda.py +175 -89
tinygrad/runtime/ops_disk.py +56 -33
tinygrad/runtime/ops_gpu.py +92 -95
tinygrad/runtime/ops_hsa.py +278 -0
tinygrad/runtime/ops_llvm.py +39 -60
tinygrad/runtime/ops_metal.py +92 -74
tinygrad/runtime/ops_npy.py +9 -0
tinygrad/runtime/ops_nv.py +630 -0
tinygrad/runtime/ops_python.py +204 -0
tinygrad/shape/shapetracker.py +86 -254
tinygrad/shape/symbolic.py +166 -141
tinygrad/shape/view.py +296 -0
tinygrad/tensor.py +2619 -448
{tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
tinygrad-0.9.0.dist-info/METADATA +227 -0
tinygrad-0.9.0.dist-info/RECORD +60 -0
{tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
tinygrad/codegen/assembly.py +0 -190
tinygrad/codegen/optimizer.py +0 -379
tinygrad/codegen/search.py +0 -72
tinygrad/graph.py +0 -83
tinygrad/jit.py +0 -57
tinygrad/nn/image.py +0 -100
tinygrad/renderer/assembly_arm64.py +0 -169
tinygrad/renderer/assembly_ptx.py +0 -98
tinygrad/renderer/wgsl.py +0 -53
tinygrad/runtime/lib.py +0 -113
tinygrad/runtime/ops_cpu.py +0 -51
tinygrad/runtime/ops_hip.py +0 -82
tinygrad/runtime/ops_shm.py +0 -29
tinygrad/runtime/ops_torch.py +0 -30
tinygrad/runtime/ops_webgpu.py +0 -45
tinygrad-0.7.0.dist-info/METADATA +0 -212
tinygrad-0.7.0.dist-info/RECORD +0 -40
{tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0

tinygrad/codegen/uops.py ADDED Viewed

@@ -0,0 +1,415 @@
+from __future__ import annotations
+from typing import Iterator, Optional, Tuple, Any, Dict, List, DefaultDict, Set, Callable
+import functools, itertools, heapq
+from collections import defaultdict
+from enum import Enum, auto
+from dataclasses import dataclass
+from tinygrad.dtype import dtypes, DType
+from tinygrad.shape.symbolic import sint, Variable
+from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, exec_alu
+from tinygrad.helpers import prod, DEBUG, getenv
+# the order of these UOps controls the order of the toposort
+# (UOp.cmp_tuple leads with `self.uop.value`, and UOp.__lt__ compares cmp_tuple; the linearizer's heap holds
+#  (priority, uop) tuples, so equal priorities fall back to that comparison)
+class UOps(Enum):
+  # ops that aren't rendered
+  SINK = auto()
+  DEFINE_GLOBAL = auto(); DEFINE_VAR = auto(); DEFINE_LOCAL = auto(); DEFINE_ACC = auto() # noqa: E702
+  CONST = auto(); SPECIAL = auto() # noqa: E702
+  NOOP = auto(); UNMUL = auto(); GEP = auto() # noqa: E702
+  # math ops
+  CAST = auto(); BITCAST = auto() # noqa: E702
+  ALU = auto(); WMMA = auto() # noqa: E702
+  # memory/assignment ops
+  LOAD = auto(); STORE = auto(); PHI = auto() # noqa: E702
+  # control flow ops
+  BARRIER = auto(); IF = auto(); RANGE = auto() # noqa: E702
+  # these two are not graph nodes
+  # (UOpGraph.linearize appends them straight onto the linearized list to close a RANGE / IF)
+  ENDRANGE = auto(); ENDIF = auto() # noqa: E702
+# One node of the uop graph: an op kind, an optional result dtype, a tuple of input nodes and an op-specific arg.
+# eq=False: the dataclass does not generate __eq__, so nodes compare (and hash) by identity.
+@dataclass(eq=False)
+class UOp:
+  uop: UOps
+  dtype: Optional[DType] = None
+  vin: Tuple[UOp, ...] = tuple()
+  arg: Any = None
+  # structural key; UOpGraph uses it to deduplicate nodes in its `nodes` dict
+  def tuple(self): return (self.uop, self.dtype, self.vin, self.arg)
+  # ordering key used by __lt__. cached: UOpGraph.graph_rewrite deletes this cache after it reassigns `vin`
+  @functools.cached_property
+  def cmp_tuple(self):
+    # NOTE: this sort of DEFINE_VAR shouldn't have to be here. only for PTX
+    # NOTE(review): for ALU nodes the second element is built from self.uop, not self.arg, so the ALU op
+    # itself does not take part in the ordering -- confirm this is intended
+    return (self.uop.value, (self.arg if self.uop is not UOps.DEFINE_VAR else self.arg.expr) if self.uop is not UOps.ALU else \
+            (type(self.uop), self.uop.value), self.dtype, self.vin)
+  def __lt__(self, x:UOp): return self.cmp_tuple < x.cmp_tuple
+  def __repr__(self):
+    return f"{str(self.uop):20s}: {str(self.dtype) if self.dtype is not None else '':25s} {str([x.uop for x in self.vin]):32s} {self.arg}"
+  # the helpers below build new, unregistered UOp nodes (they do not go through UOpGraph.add)
+  def cast(self, dtype): return UOp(UOps.CAST, dtype, (self,))
+  def __neg__(self): return UOp.alu(UnaryOps.NEG, self)
+  def __add__(self, x): return UOp.alu(BinaryOps.ADD, self, x)
+  def __sub__(self, x): return UOp.alu(BinaryOps.SUB, self, x)
+  def __mul__(self, x): return UOp.alu(BinaryOps.MUL, self, x)
+  @staticmethod
+  def max(x, y): return UOp.alu(BinaryOps.MAX, x, y)
+  # min(x, y) is expressed as -max(-x, -y)
+  @staticmethod
+  def min(x, y): return -UOp.alu(BinaryOps.MAX, -x, -y)
+  # the value is passed through dtypes.as_const before being stored as the CONST arg
+  @staticmethod
+  def const(dtype, val): return UOp(UOps.CONST, dtype, arg=dtypes.as_const(val, dtype))
+  # CMPLT/CMPEQ produce bool; every other ALU op takes the dtype of its last input
+  @staticmethod
+  def alu(arg, *vin:UOp): return UOp(UOps.ALU, dtypes.bool if arg in {BinaryOps.CMPLT, BinaryOps.CMPEQ} else vin[-1].dtype, vin, arg)
+  # every node reachable through vin, transitively (the node itself is not included). cached like cmp_tuple
+  @functools.cached_property
+  def parents(self) -> Set[UOp]: return set.union(set(self.vin), *[x.parents for x in self.vin])
+# Resolve a uop expression to a value. Only CONST, DEFINE_VAR, SPECIAL and ALU ADD/MUL are supported;
+# anything else raises RuntimeError. UOpGraph.flops_mem uses this on the second input of a RANGE.
+def uop_alu_resolve(u:UOp) -> sint:
+  if u.uop is UOps.CONST: return u.arg
+  if u.uop is UOps.DEFINE_VAR: return u.arg
+  # presumably arg[2] is the size of the special index, making this its largest value -- TODO confirm
+  if u.uop is UOps.SPECIAL: return u.arg[2]-1
+  if u.uop is UOps.ALU and u.arg is BinaryOps.MUL: return uop_alu_resolve(u.vin[0]) * uop_alu_resolve(u.vin[1])
+  if u.uop is UOps.ALU and u.arg is BinaryOps.ADD: return uop_alu_resolve(u.vin[0]) + uop_alu_resolve(u.vin[1])
+  raise RuntimeError(f"ALU resolve fail @ {u.uop}")
+# *** simplification logic ***
+# Match `uop` against a pattern dict, recording named captures in `store`. Returns True on a match.
+# Recognized keys: "__name__" (capture name), "arg", "dtype", "uop", "vin". Other keys (e.g. "__allow_len__")
+# are skipped by the loop; "__allow_len__" is only consulted while handling "vin".
+# Keys are checked in the pattern's own order, so a failed match can leave bindings in `store`;
+# the recursive calls below therefore work on a copy.
+def _match(uop:UOp, pattern:Dict[str, Any], store:Dict[str, UOp]) -> bool:
+  for k,v in pattern.items():
+    if k == "__name__":
+      # a name that is already bound must refer to the very same node (UOp has no __eq__, so != is identity)
+      if v in store and store[v] != uop: return False
+      store[v] = uop
+    elif k == "arg":
+      if uop.arg != v: return False
+    elif k == "dtype":
+      # a set means "any of these"
+      if isinstance(v, set):
+        if uop.dtype not in v: return False
+      elif uop.dtype != v: return False
+    elif k == "uop":
+      # a set means "any of these"
+      if isinstance(v, set):
+        if uop.uop not in v: return False
+      elif uop.uop != v: return False
+    elif k == "vin":
+      # only one if it's a tuple
+      # try all permutations if it's a list
+      # repeat if it's a dict
+      for vp in itertools.permutations(v) if isinstance(v, list) else ([v] if isinstance(v, tuple) else [(v,)*len(uop.vin)]):
+        # a different input count fails unless that count is listed in the pattern's "__allow_len__"
+        if len(uop.vin) != len(vp) and (len(uop.vin) not in pattern.get('__allow_len__', [])): return False
+        new_store = store.copy()
+        if all(_match(uu, vv, new_store) for uu, vv in zip(uop.vin, vp)):
+          # commit the captures of the arrangement that matched
+          for k,v in new_store.items(): store[k] = v
+          return True
+      return False
+  return True
+# Holds (pattern, function) rewrite rules and indexes them by (uop, arg) for lookup in rewrite().
+class PatternMatcher:
+  def __init__(self, patterns:List[Tuple[Dict[str, Any], Callable]]):
+    self.patterns = patterns
+    self.pdict: DefaultDict[Tuple[UOps, Any], List[Tuple[Dict[str, Any], Callable]]] = defaultdict(list)
+    # uop is required, arg is optional
+    for p,fxn in self.patterns:
+      uops = p["uop"]
+      # a set of uops registers the rule once per member; a missing "arg" is indexed under None
+      if isinstance(uops, set):
+        for uop in uops: self.pdict[(uop, p.get("arg", None))].append((p, fxn))
+      else:
+        self.pdict[(uops, p.get("arg", None))].append((p, fxn))
+  # Try the rules indexed under this node's exact (uop, arg) first, then the ones indexed under (uop, None).
+  # The first rule whose pattern matches decides the result: its function is called with the captures as
+  # keyword arguments and whatever it returns (possibly None) is returned without trying later rules.
+  def rewrite(self, uop:UOp) -> Optional[UOp]:
+    for p,fxn in itertools.chain(self.pdict[(uop.uop, uop.arg)], self.pdict[(uop.uop, None)]):
+      store: Dict[str, UOp] = {}
+      if _match(uop, p, store): return fxn(**store)
+    return None
+# Rewrite function for the "sum collapse" rules in constant_folder (PHI of an accumulator and val1+val2).
+# If one addend does not have `loop` among its parents, it is taken out of the PHI and added back as
+# addend * (loop.vin[1] - loop.vin[0]); the other addend stays in the PHI. Returns None if both depend on the loop.
+def sum_collapse(phi_input, loop, val1, val2):
+  # try both orders, val1 first
+  for v1,v2 in [(val1, val2), (val2, val1)]:
+    if loop not in v1.parents:
+      loop_range = loop.vin[1]-loop.vin[0]
+      ret = v1*loop_range.cast(v1.dtype)
+      return UOp(UOps.PHI, phi_input.dtype, (phi_input, v2))+ret
+  return None
+# Rewrite function for the "arange loop folding" rule in constant_folder.
+# Only handles a negative mval together with loop_start == 0; any other combination returns None (no rewrite).
+# On success returns an UNMUL node whose inputs are (comprange cast to multconst's dtype) * multconst
+# and loop_end - loop_start.
+def loop_collapse(loop_start, loop_end, compval, idx, mval, multconst):
+  if mval.arg >= 0 or loop_start.arg != 0:
+    # TODO: support and test this with other mvals and loop_starts
+    if DEBUG >= 1: print(f"WARNING, NOT FOLDING: mval:{mval.arg} loop_start:{loop_start.arg}")
+    return None
+  comprange = UOp.min(loop_end, UOp.max(UOp.alu(BinaryOps.DIV, idx-compval-mval, mval) + (loop_end-loop_start), loop_start))
+  return UOp(UOps.UNMUL, multconst.dtype, (comprange.cast(multconst.dtype) * multconst, loop_end-loop_start))
+# this is symbolic 2.0
+# Each entry is (pattern, function). Patterns are the nested dicts understood by _match:
+#   "uop" / "arg" / "dtype" constrain the node, "__name__" captures it under that name,
+#   "vin" as a tuple matches the inputs in order, as a list in any order, as a dict against every input.
+# The function receives the captures as keyword arguments and returns the replacement UOp, or None for no rewrite.
+# UOpGraph.linearize applies these rules through UOpGraph.graph_rewrite.
+constant_folder = PatternMatcher([
+  # arange loop folding (early)
+  ({"uop": UOps.ALU, "arg": TernaryOps.WHERE, "vin": ({"uop": UOps.ALU, "arg": BinaryOps.CMPLT, "vin": (
+    {"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin":
+      [{"__name__": "idx"}, {"uop": UOps.ALU, "arg": BinaryOps.MUL,
+        "vin": [{"__name__": "mval", "uop": UOps.CONST}, {"uop": UOps.RANGE, "vin": ({"__name__": "loop_start"}, {"__name__": "loop_end"})}]}]},
+      {"__name__": "compval", "uop": UOps.CONST})}, {"__name__": "multconst", "uop": UOps.CONST}, {"uop": UOps.CONST, "arg": 0})}, loop_collapse),
+  # sum collapse to mul (with possible GEP)
+  ({"uop": UOps.PHI, "vin": ({"__name__": "phi_input", "uop": UOps.DEFINE_ACC, "vin": ({"uop": UOps.RANGE, "__name__": "loop"},)},
+      {"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": ({"__name__": "val1"}, {"__name__": "val2"})})}, sum_collapse),
+  ({"uop": UOps.PHI, "vin": ({"__name__": "phi_input", "uop": UOps.GEP,
+                              "vin": ({"uop": UOps.DEFINE_ACC, "vin":({"uop": UOps.RANGE, "__name__": "loop"},)},)},
+      {"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": ({"__name__": "val1"}, {"__name__": "val2"})})}, sum_collapse),
+  # deal with UNMUL
+  ({"uop": UOps.ALU, "arg": BinaryOps.MUL, "vin": [{"uop": UOps.CONST, "__name__": "c1"},
+                                                   {"uop": UOps.UNMUL, "vin": [{"uop": UOps.CONST, "__name__": "c2"}, {"__name__": "v"}]}]},
+                                                   lambda c1,c2,v: v if c1.arg == c2.arg else None),
+  ({"uop": UOps.UNMUL, "vin": ({"uop": UOps.CONST, "__name__": "zero", "arg": 0}, {})}, lambda zero: zero),
+  ({"__name__": "root", "uop": UOps.CAST, "vin": ({"uop": UOps.UNMUL, "__name__": "unmul"},)},
+    lambda root,unmul: UOp(UOps.UNMUL, root.dtype, (unmul.vin[0].cast(root.dtype), unmul.vin[1]))),
+  # max on special can go away (TODO: special should be variable, same thing applies)
+  ({"uop": UOps.ALU, "arg": BinaryOps.MAX, "vin": [{"__name__": "c", "uop": UOps.CONST}, {"__name__": "s", "uop": UOps.SPECIAL}]},
+    lambda c,s: c if (s.arg[2]-1) <= c.arg else None),
+  # const rules
+  ({"__name__": "root", "uop": UOps.GEP, "vin": ({"__name__": "c", "uop": UOps.CONST},)}, lambda root, c: UOp.const(root.dtype, c.arg)),
+  ({"__name__": "root", "uop": UOps.CAST, "vin": {"__name__": "c", "uop": UOps.CONST}}, lambda root, c: UOp.const(root.dtype, c.arg)),
+  # a phi on a DEFINE_ACC without loops or a CONST is a noop. this is for correctness, not just speed
+  ({"uop": UOps.PHI, "vin": ({"uop": UOps.DEFINE_ACC, "__name__": "acc"}, {"__name__": "acc"})}, lambda acc: UOp.const(acc.dtype, acc.arg[0])),
+  ({"uop": UOps.PHI, "vin": ({"uop": UOps.DEFINE_ACC, "vin": tuple()}, {"__name__": "x"})}, lambda x: x),
+  ({"uop": UOps.PHI, "vin": ({"uop": UOps.CONST}, {"__name__": "x"})}, lambda x: x),
+  # a DEFINE_ACC without inputs is a const + GEP on a const is the const
+  ({"__name__": "root", "uop": UOps.DEFINE_ACC, "vin": tuple()}, lambda root: UOp.const(root.dtype, root.arg[0])),
+  ({"__name__": "root", "uop": UOps.GEP, "vin": ({"__name__": "x", "uop": UOps.CONST},)}, lambda root,x: UOp.const(root.dtype, x.arg)),
+  # max -2147483648
+  ({"uop": UOps.ALU, "arg": BinaryOps.MAX, "dtype": dtypes.int, "vin": [{"__name__": "x"}, {"uop": UOps.CONST, "arg": -2147483648}]}, lambda x: x),
+  # -(-x) -> x
+  ({"uop": UOps.ALU, "arg": UnaryOps.NEG, "vin": ({"uop": UOps.ALU, "arg": UnaryOps.NEG, "vin": ({"__name__": "x"},)})}, lambda x: x),
+  # x+-y -> x-y
+  ({"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": ({"__name__": "x"}, {"__name__": "my", "uop": UOps.ALU, "arg": UnaryOps.NEG})},
+    lambda x, my: x-my.vin[0]),
+  # -1*x -> -x
+  ({"uop": UOps.ALU, "arg": BinaryOps.MUL, "vin": [{"__name__": "x"}, {"uop": UOps.CONST, "arg": -1}]}, lambda x: -x),
+  # bool < False is always false, True < bool is always false
+  ({"uop": UOps.ALU, "arg": BinaryOps.CMPLT, "vin": ({}, {"__name__": "x", "uop": UOps.CONST, "dtype": dtypes.bool, "arg": False})}, lambda x: x),
+  ({"uop": UOps.ALU, "arg": BinaryOps.CMPLT, "vin": ({"__name__": "x", "uop": UOps.CONST, "dtype": dtypes.bool, "arg": True}, {})},
+    lambda x: UOp.const(dtypes.bool, False)),
+  # a conditional with the same results either way is a noop, also fold const conditionals
+  ({"uop": UOps.ALU, "arg": TernaryOps.WHERE, "vin": ({}, {"__name__": "val"}, {"__name__": "val"})}, lambda val: val),
+  ({"uop": UOps.ALU, "arg": TernaryOps.WHERE, "vin": ({"__name__": "gate", "uop": UOps.CONST}, {"__name__": "c0"}, {"__name__": "c1"})},
+    lambda gate, c0, c1: c0 if gate.arg else c1),
+  # ** constant folding **
+  ({"__name__": "root", "uop": UOps.ALU, "vin": {"uop": UOps.CONST}},
+    lambda root: UOp.const(root.dtype, exec_alu(root.arg, root.dtype, [x.arg for x in root.vin]))),
+  # ** self folding **
+  ({"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": [{"__name__": "x"}, {"uop": UOps.CONST, "arg": 0}]}, lambda x: x),   # x+0 -> x or 0+x -> x
+  ({"uop": UOps.ALU, "arg": BinaryOps.MUL, "vin": [{"__name__": "x"}, {"uop": UOps.CONST, "arg": 1}]}, lambda x: x),   # x*1 -> x or 1*x -> x
+  ({"uop": UOps.ALU, "arg": BinaryOps.SUB, "vin": ({"__name__": "x"}, {"uop": UOps.CONST, "arg": 0})}, lambda x: x),   # x-0 -> x
+  ({"uop": UOps.ALU, "arg": BinaryOps.DIV, "vin": ({"__name__": "x"}, {"uop": UOps.CONST, "arg": 1})}, lambda x: x),   # x/1 -> x
+  ({"uop": UOps.ALU, "arg": BinaryOps.DIV, "vin": ({"__name__": "x"}, {"uop": UOps.CONST, "arg": -1})}, lambda x: -x), # x/-1 -> -x
+  # ** zero folding **
+  ({"uop": UOps.ALU, "arg": BinaryOps.MUL, "vin": [{}, {"__name__": "c", "uop": UOps.CONST, "arg": 0}]}, lambda c: c), # x*0 -> 0 or 0*x -> 0
+  ({"uop": UOps.ALU, "arg": BinaryOps.SUB, "vin": ({"__name__": "x"}, {"__name__": "x"})}, lambda x: UOp.const(x.dtype, 0)),   # x-x -> 0
+  # ** load/store folding **
+  ({"uop": UOps.STORE, "vin": ({"__name__": "buf"}, {"__name__": "idx"},
+                               {"uop": UOps.LOAD, "vin": ({"__name__": "buf"}, {"__name__": "idx"})})}, lambda buf, idx: UOp(UOps.NOOP)),
+  # ** two stage add/sub folding **
+  ({"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": [{"uop": UOps.ALU, "arg": BinaryOps.ADD,
+                     "vin": [{"__name__": "x"}, {"__name__": "c1", "uop": UOps.CONST}]}, {"__name__": "c2", "uop": UOps.CONST}]},
+     lambda x,c1,c2: x+UOp.const(x.dtype, exec_alu(BinaryOps.ADD, x.dtype, [c1.arg, c2.arg]))),
+  ({"uop": UOps.ALU, "arg": BinaryOps.ADD, "vin": [{"uop": UOps.ALU, "arg": BinaryOps.SUB,
+                     "vin": ({"__name__": "x"}, {"__name__": "c1", "uop": UOps.CONST})}, {"__name__": "c2", "uop": UOps.CONST}]},
+     lambda x,c1,c2: x+UOp.const(x.dtype, exec_alu(BinaryOps.SUB, x.dtype, [c2.arg, c1.arg]))),
+  # TODO: can do the invert of this (flip alt/load) when we fix double ops
+  ({"uop": UOps.STORE, "vin": ({"__name__": "buf"}, {"__name__": "idx"}, {"uop": UOps.ALU, "arg": TernaryOps.WHERE,
+                       "vin": ({"__name__": "gate"}, {"__name__": "alt"}, {"uop": UOps.LOAD, "vin": ({"__name__": "buf"}, {"__name__": "idx"})})})},
+    lambda buf, idx, gate, alt: UOp(UOps.STORE, None, (buf, idx, alt, gate))),
+  # store float4/float2 directly (remove CAST/GEP)
+  ({"uop": UOps.STORE, "vin": ({"__name__": "buf"}, {"__name__": "idx"}, {"uop": UOps.CAST, "vin":
+                                tuple({"uop": UOps.GEP, "vin": ({"__name__": "val"},), "arg": i} for i in range(4))})},
+   lambda buf,idx,val: UOp(UOps.STORE, None, (buf, idx, val))),
+  ({"uop": UOps.STORE, "vin": ({"__name__": "buf"}, {"__name__": "idx"}, {"uop": UOps.CAST, "vin":
+                                tuple({"uop": UOps.GEP, "vin": ({"__name__": "val"},), "arg": i} for i in range(2))})},
+   lambda buf,idx,val: UOp(UOps.STORE, None, (buf, idx, val))),
+  # CAST-PHI-GEP -> PHI-CAST
+  ({"__name__": "root", "uop": UOps.CAST, "vin":
+    tuple({"uop": UOps.PHI, "vin": ({"uop": UOps.GEP, "vin": ({"__name__": "val"},), "arg": i}, {"__name__": f"v{i}"})} for i in range(4))},
+    lambda root, val, v0, v1, v2, v3: UOp(UOps.PHI, root.dtype, (val, UOp(UOps.CAST, val.dtype, (v0, v1, v2, v3))))),
+  ({"__name__": "root", "uop": UOps.CAST, "vin":
+    tuple({"uop": UOps.PHI, "vin": ({"uop": UOps.GEP, "vin": ({"__name__": "val"},), "arg": i}, {"__name__": f"v{i}"})} for i in range(2))},
+    lambda root, val, v0, v1: UOp(UOps.PHI, root.dtype, (val, UOp(UOps.CAST, val.dtype, (v0, v1))))),
+  # NEG/CMPLT -> CMPLT
+  ({"uop": UOps.ALU, "arg": BinaryOps.CMPLT, "vin": ({"uop": UOps.ALU, "arg": UnaryOps.NEG, "vin": ({"__name__": "x"},)},
+                                                     {"__name__": "c", "uop": UOps.CONST, "dtype": dtypes.int})},
+    lambda c,x: UOp(UOps.ALU, dtypes.bool, (UOp.const(c.dtype, -c.arg), x), BinaryOps.CMPLT)),
+  # cast NOOP (NOTE: it's str to deal with PtrDType)
+  ({"__name__": "root", "uop": UOps.CAST}, lambda root: root.vin[0] if str(root.dtype) == str(root.vin[0].dtype) else None),
+])
+# *** uop graph ***
+# Container for a deduplicated set of UOp nodes plus a lazily built linear ordering of them.
+# Nodes are registered with add(); the ordering is produced by linearize() the first time `uops` is read.
+class UOpGraph:
+  def __init__(self):
+    # structural key (uop, dtype, vin, arg) -> node
+    self.nodes: Dict[Tuple, UOp] = {}
+    # linearized list, None until linearize() has run
+    self._uops: Optional[List[UOp]] = None
+  def __iter__(self) -> Iterator[UOp]: return iter(self.uops)
+  def __getitem__(self, index) -> UOp: return self.uops[index]
+  def vars(self) -> List[Variable]: return [x.arg for x in self.uops if x.uop is UOps.DEFINE_VAR]
+  def globals(self) -> List[Tuple[int, bool]]: return [x.arg for x in self.uops if x.uop is UOps.DEFINE_GLOBAL]
+  # linearizes on first access (with linearize()'s default arguments)
+  @property
+  def uops(self):
+    if self._uops is None: self.linearize()
+    return self._uops
+  def graph(self):
+    from tinygrad.engine.graph import graph_uops
+    graph_uops(self.uops)
+  # prints one line per linearized uop; inputs are shown as their positions in the linearized list
+  def print(self):
+    for i,u in enumerate(self):
+      print(f"{i:4d} {str(u.uop):20s}: {str(u.dtype) if u.dtype is not None else '':25s} " f"{str([self.uops.index(x) for x in u.vin]):32s} {u.arg}")
+  # Rewrite everything reachable from `sink` with the pattern matcher `pm`, repeating whole passes until a
+  # pass performs no replacement, and return the (possibly replaced) sink.
+  def graph_rewrite(self, sink, pm):
+    # recursive rewrite
+    # the env value doubles as the initial loop condition, so a value of 0 means the loop never runs
+    # (presumably getenv returns the second argument when the variable is unset -- TODO confirm)
+    changed = getenv("UOPS_REWRITE", 1)
+    run_cnt = 0
+    while changed:
+      changed = 0
+      # redefined every pass, so each pass starts with an empty memo (bare lru_cache: default maxsize)
+      @functools.lru_cache
+      def rewrite(u:UOp) -> UOp:
+        nonlocal changed
+        recurse_cnt = 0
+        up = u
+        # locally recursively rewrite
+        while (rewritten := pm.rewrite(up)):
+          assert recurse_cnt < 100, f"recursive_rewrite looped {up} <--> {rewritten}"
+          up = rewritten
+          recurse_cnt += 1
+        # only pattern replacements count as a change; the input rewiring below does not
+        changed += recurse_cnt
+        # NOTE: this changes UOp, so we have to delete caches
+        up.vin = tuple(rewrite(x) for x in up.vin)
+        if hasattr(up, "parents"): del up.parents
+        if hasattr(up, "cmp_tuple"): del up.cmp_tuple
+        # replace with cached nodes
+        if found:=self.nodes.get(key:=up.tuple()): return found
+        else: self.nodes[key] = up
+        return up
+      sink = rewrite(sink)
+      run_cnt += 1
+      assert run_cnt < 100, "exceeded 100 rewrite loops!"
+    return sink
+  # Build self._uops: rewrite the graph with constant_folder (and then with constant_folder + extra_pm if given),
+  # topologically sort what is reachable from the sink, insert ENDRANGE/ENDIF markers, optionally type-check.
+  def linearize(self, extra_pm:Optional[PatternMatcher]=None, type_verify=True):
+    # NOTE: relinearizing should be okay
+    #assert self._uops is None, "already linearized"
+    # get sink
+    # every registered STORE, plus the inputs of any registered SINK, becomes an input of a fresh SINK
+    _sinks: List[UOp] = []
+    for u in self.nodes.values():
+      if u.uop is UOps.STORE: _sinks.append(u)
+      if u.uop is UOps.SINK: _sinks.extend(u.vin)
+    sink = UOp(UOps.SINK, None, tuple(_sinks))
+    del _sinks
+    sink = self.graph_rewrite(sink, constant_folder)
+    if extra_pm: sink = self.graph_rewrite(sink, PatternMatcher(constant_folder.patterns+extra_pm.patterns))
+    # filter nodes that don't link to a sink
+    # BFS toposort
+    # graph: node -> nodes that consume it; in_degree: node -> number of inputs not yet emitted
+    graph: DefaultDict[UOp, List[UOp]] = defaultdict(list)
+    in_degree: DefaultDict[UOp, int] = defaultdict(int)
+    loops = []
+    ifs = []
+    # dict used as an insertion-ordered set of the nodes reachable from the sink
+    nodes: Dict[UOp, None] = {}
+    def add_parents(u:UOp):
+      if u in nodes: return
+      nodes[u] = None
+      for x in u.vin:
+        add_parents(x)
+        in_degree[u] += 1
+        graph[x].append(u)
+      if u.uop is UOps.RANGE: loops.append(u)
+      if u.uop is UOps.IF: ifs.append(u)
+    # sink inputs that were rewritten to NOOP are dropped here
+    sink = UOp(UOps.SINK, None, tuple(x for x in sink.vin if x.uop is not UOps.NOOP))
+    add_parents(sink)
+    # all transitive consumers of x; the walk stops at SINK and does not continue past a PHI
+    @functools.lru_cache(None)
+    def get_recursive_children(x:UOp, include_self=False) -> Set[UOp]:
+      if x.uop is UOps.SINK: return set()
+      return set.union(set((x,)) if include_self else set(), *([get_recursive_children(u, True) for u in graph[x]] if x.uop is not UOps.PHI else []))
+    loops_children = {l:get_recursive_children(l) for l in loops[::-1]}
+    # min-heap of (priority, uop); equal priorities are ordered by UOp.__lt__
+    queue: List = []
+    def push(u):
+      priority = 0
+      # prefer uops that are loop children
+      for l, ss in loops_children.items():
+        if u in ss: priority -= l.arg[0]*1000 + l.arg[1]
+      heapq.heappush(queue, (priority, u))
+    for u in nodes:
+      if in_degree[u] == 0: push(u)
+    if getenv("FUZZ_UOPS", 0):
+      from test.external.fuzz_uops import fuzz_uops
+      self.fuzz_paths = fuzz_uops(graph, in_degree.copy(), loops_children)
+    self._uops = []
+    while queue:
+      p,x = heapq.heappop(queue)
+      if DEBUG >= 7: print(p,x)
+      # a DEFINE_ACC with inputs is placed in front of the earliest of its inputs in the list, not at the end
+      if x.uop is UOps.DEFINE_ACC and len(x.vin):
+        idx = min([self._uops.index(l) for l in x.vin])
+        self._uops.insert(idx, x)
+      else:
+        self._uops.append(x)
+      # once the last remaining child of a loop has been emitted, close that loop
+      for u, ss in loops_children.items():
+        if x in ss:
+          ss.remove(x)
+          if len(ss) == 0: self._uops.append(UOp(UOps.ENDRANGE, None, (u,)))
+      for u in graph[x]:
+        in_degree[u] -= 1
+        if in_degree[u] == 0: push(u)
+    # the SINK must come out last; it is then dropped from the result
+    assert self._uops[-1].uop is UOps.SINK, f"didn't end with SINK, ended with {self._uops[-1]}"
+    self._uops = self._uops[:-1]
+    # TODO: ifs should be removed and just the store should be gated
+    for u in ifs[::-1]: self._uops.append(UOp(UOps.ENDIF, None, (u,)))
+    if type_verify: self.type_verify()
+  # Return the already registered node with the same (uop, dtype, vin, arg), or create and register a new one.
+  def add(self, uop:UOps, dtype:Optional[DType]=None, vin:Tuple[UOp, ...]=tuple(), arg:Any=None) -> UOp:
+    if found:=self.nodes.get(key:=(uop, dtype, vin, arg)): return found
+    self.nodes[key] = ret = UOp(*key)
+    return ret
+  # *** checker functions ***
+  # Walk the linearized uops and return (flops, mem). `mults` is the running multiplier for the enclosing
+  # RANGE loops; it is saved on RANGE and restored on the matching ENDRANGE.
+  def flops_mem(self) -> Tuple[sint, sint]:
+    flops: sint = 0
+    mem: sint = 0
+    mults: sint = 1
+    mult_stack = []
+    for u in self.uops:
+      if u.uop is UOps.RANGE:
+        mult_stack.append(mults)
+        # scaled by the resolved second input of the RANGE only; the first input is not subtracted
+        mults *= uop_alu_resolve(u.vin[1])
+      elif u.uop is UOps.ENDRANGE:
+        mults = mult_stack.pop(-1)
+      elif u.uop is UOps.ALU:
+        flops += mults * (2 if u.arg == TernaryOps.MULACC else 1)
+      elif u.uop is UOps.LOAD:
+        assert u.dtype is not None
+        mem += u.dtype.itemsize * mults
+      elif u.uop is UOps.STORE:
+        # a store is sized by the dtype of its third input
+        assert u.vin[2].dtype is not None
+        mem += u.vin[2].dtype.itemsize * mults
+      elif u.uop is UOps.WMMA:
+        assert u.arg[1] is not None
+        # NOTE(review): the meaning of arg[1] and of the division by 32 is not visible here -- confirm
+        flops += 2 * prod(u.arg[1]) // 32 * mults
+    return flops, mem
+  # Assert dtype consistency of the linearized uops: CONST/DEFINE_ACC values, CAST/BITCAST args and ALU operands.
+  def type_verify(self):
+    for u in self.uops:
+      uop, arg, vin, dtype = u.uop, u.arg, u.vin, u.dtype
+      if uop in {UOps.CONST, UOps.DEFINE_ACC}:
+        # for DEFINE_ACC the value being checked is the first element of arg
+        if uop is UOps.DEFINE_ACC: arg = arg[0]
+        assert dtype is not None and type(arg) is type(dtypes.as_const(arg, dtype)), f"type of {arg=} does not match {dtype}"
+      if uop in {UOps.CAST, UOps.BITCAST}: assert arg is None   # type is the output type, not an arg
+      if uop is UOps.ALU:
+        if arg in UnaryOps:
+          assert dtype == vin[0].dtype, f"{arg} dtype mismatch {dtype=} != {vin[0].dtype=}"
+        elif arg in (BinaryOps.CMPLT, BinaryOps.CMPEQ):
+          assert dtype == dtypes.bool, f"{arg} output dtype mismatch {dtype=} != {dtypes.bool}"
+          assert vin[0].dtype == vin[1].dtype, f"{arg} dtype mismatch {dtype=} != {vin[0].dtype=} != {vin[1].dtype=}"
+        elif arg in BinaryOps:
+          assert dtype == vin[0].dtype == vin[1].dtype, f"{arg} dtype mismatch {dtype=} != {vin[0].dtype=} != {vin[1].dtype=}"
+        elif arg == TernaryOps.WHERE:
+          assert vin[0].dtype == dtypes.bool, f"{arg} selector dtype mismatch {vin[0].dtype=} != {dtypes.bool}"
+          assert dtype == vin[1].dtype == vin[2].dtype, f"{arg} choice dtype mismatch {dtype=} != {vin[1].dtype=} != {vin[2].dtype=}"

tinygrad/device.py ADDED Viewed

@@ -0,0 +1,183 @@
+from __future__ import annotations
+import multiprocessing
+from dataclasses import dataclass
+from collections import defaultdict
+from typing import List, Optional, Dict, Tuple, Any
+import importlib, inspect, functools, pathlib, os, ctypes
+from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
+from tinygrad.dtype import DType, ImageDType
+from tinygrad.renderer import Renderer
+# **************** Device ****************
+class _Device:
+  def __init__(self) -> None: self._devices: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]  # noqa: E501
+  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
+  def _canonicalize(self, device:str) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "")   # noqa: E501
+  # NOTE: you can't cache canonicalize in case Device.DEFAULT changes
+  def canonicalize(self, device:Optional[str]) -> str: return self._canonicalize(device) if device is not None else Device.DEFAULT
+  def __getitem__(self, ix:str) -> Compiled: return self.__get_canonicalized_item(self.canonicalize(ix))
+  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
+  def __get_canonicalized_item(self, ix:str) -> Compiled:
+    if DEBUG >= 1: print(f"opening device {ix} from pid:{os.getpid()}")
+    assert multiprocessing.current_process().name == "MainProcess" or ix.split(":")[0] in ["DISK", "NPY"], f"can only open device {ix} from parent"
+    x = ix.split(":")[0].upper()
+    return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._devices][0](ix)  # noqa: E501
+  @functools.cached_property
+  def DEFAULT(self) -> str:
+    device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._devices, None)   # type: ignore
+    if device_from_env: return device_from_env
+    for device in ["METAL", "HSA", "CUDA", "GPU", "CLANG", "LLVM"]:
+      try:
+        if self[device]:
+          os.environ[device] = "1"   # we set this in environment for spawned children
+          return device
+      except Exception: pass
+    raise RuntimeError("no usable devices")
+# the single shared _Device instance; the rest of this module looks devices up through it (Device[...], Device.DEFAULT)
+Device = _Device()
+# **************** Buffer + Allocators ****************
+@dataclass(frozen=True, eq=True)
+class BufferOptions:
+  image: Optional[ImageDType] = None
+  uncached: bool = False
+  cpu_access: bool = False
+  host: bool = False
+  nolru: bool = False
+class Buffer:
+  # A handle to `size` elements of `dtype` on `device`. The backing memory lives in `_buf`, which only exists
+  # after allocate() has run, so "allocated" in this class always means "the `_buf` attribute is present".
+  # A Buffer is either a base buffer (`_base is None`, carries the `_lb_refcount` counter) or a view onto a base
+  # buffer (`_base` set, `_buf` obtained from the allocator's `offset` function).
+  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None,
+               initial_value:Optional[bytes]=None, lb_refcount=0, base:Optional[Buffer]=None, offset:int=0, preallocate=False):
+    # opaque: an existing allocator handle to adopt as `_buf` (base buffers only)
+    # initial_value: bytes copied into a freshly allocated base buffer
+    # base/offset: make this buffer a view into `base`; offset looks like a byte offset (view() compares it to nbytes) - TODO confirm
+    # preallocate: allocate immediately instead of lazily
+    assert isinstance(dtype, DType)
+    if isinstance(dtype, ImageDType): options = BufferOptions(image=dtype) # TODO: image hack shouldn't be here. where should it be?
+    self.device, self.size, self.dtype, self.options, self.offset = device, size, dtype, options, offset
+    if base is None:
+      assert offset == 0, "base buffers can't have offset"
+      self._base = None
+      self._lb_refcount = lb_refcount
+      if opaque is not None: self.allocate(opaque)
+      if initial_value is not None:
+        # allocate() asserts the buffer is not yet allocated, so opaque and initial_value cannot both be given
+        self.allocate()
+        self.copyin(memoryview(initial_value))
+    else:
+      # views are exactly one level deep and live on the same device as their base
+      assert base._base is None, "base can't have a base"
+      assert device == base.device, "base must have the same device"
+      self._base = base
+    if preallocate: self.allocate()
+  # the base buffer of a view, or the buffer itself when it is not a view
+  @property
+  def base(self) -> Buffer: return self._base if self._base is not None else self
+  # the counter is stored on the base buffer only; views read and update their base's counter
+  @property
+  def lb_refcount(self): return self.base._lb_refcount
+  def ref(self, cnt): self.base._lb_refcount += cnt
+  def is_allocated(self) -> bool: return hasattr(self, '_buf')
+  # allocate on first use; returns self either way
+  def ensure_allocated(self) -> Buffer: return self.allocate() if not hasattr(self, '_buf') else self
+  def allocate(self, opaque=None) -> Buffer:
+    # Create `_buf` and return self. Asserts if the buffer is already allocated.
+    assert not hasattr(self, '_buf'), "can't allocate already allocated buffer"
+    self.allocator = Device[self.device].allocator
+    if self._base is not None:
+      # view: make sure the base has memory, then ask the allocator for a handle at `offset` into it
+      self._base.ensure_allocated()
+      assert hasattr(self.allocator, "offset"), "offset function required for view"
+      self._buf: Any = self.allocator.offset(self.base._buf, self.nbytes, self.offset)
+    else:
+      # base buffer: adopt the given handle or allocate new memory; DISK devices are excluded from the memory counter
+      self._buf = opaque if opaque is not None else self.allocator.alloc(self.nbytes, self.options)
+      if not self.device.startswith("DISK"): GlobalCounters.mem_used += self.nbytes
+    return self
+  def __reduce__(self):
+    # pickle support: returns (class, constructor args) that rebuild an equivalent Buffer
+    buf = None
+    if self._base is not None:
+      # view: rebuilt from its base and offset; the last argument (preallocate) re-allocates it if it is allocated now
+      return self.__class__, (self.device, self.size, self.dtype, None, None, None, 0, self.base, self.offset, hasattr(self, '_buf'))
+    # NPY buffers pass `_buf` itself along as the opaque handle instead of copying bytes out
+    if self.device == "NPY": return self.__class__, (self.device, self.size, self.dtype, self._buf, self.options, None, self.lb_refcount)
+    if self.is_allocated():
+      # allocated base buffer: contents are copied out and passed as initial_value
+      buf = bytearray(self.nbytes)
+      self.copyout(memoryview(buf))
+    return self.__class__, (self.device, self.size, self.dtype, None, self.options, buf, self.lb_refcount)
+  @property
+  def nbytes(self): return self.size*self.dtype.itemsize
+  def __del__(self):
+    # only allocated base buffers hand memory back to the allocator; a view does nothing here
+    if not hasattr(self, '_buf'): return
+    if self._base is None:
+      if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes
+      self.allocator.free(self._buf, self.nbytes, self.options)
+  def __repr__(self):
+    # NOTE(review): `base` is a property of this class, so hasattr(self, "base") is true whenever `_base` has been
+    # set (i.e. after __init__); the offset is therefore printed for base buffers too - confirm this is intended
+    return f"<buf real:{hasattr(self, '_buf')} device:{self.device} size:{self.size} dtype:{self.dtype}" + \
+           (f" offset:{self.offset}" if hasattr(self, "base") else "") + \
+           (">" if self.options is None else f" {self.options=}>")
+  def as_buffer(self, allow_zero_copy=False, force_zero_copy=False) -> memoryview:
+    # Return the contents as a memoryview: the allocator's own as_buffer() when zero copy is requested and the
+    # allocator provides it, otherwise a fresh bytearray copy. force_zero_copy asserts instead of copying.
+    # zero copy with as_buffer (disabled by default due to use after free)
+    if (force_zero_copy or allow_zero_copy) and hasattr(self.allocator, 'as_buffer'): return self.allocator.as_buffer(self._buf)
+    assert not force_zero_copy, "force zero copy was passed, but copy is required"
+    return self.copyout(memoryview(bytearray(self.nbytes)))
+  def copyin(self, mv:memoryview):
+    # copy `mv` into the buffer; the flattened view must be exactly nbytes long. Returns self.
+    mv = flat_mv(mv)
+    assert len(mv) == self.nbytes, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
+    assert self.is_allocated(), "can't copyin to unallocated buffer"
+    self.allocator.copyin(self._buf, mv)
+    return self
+  def copyout(self, mv:memoryview) -> memoryview:
+    # copy the buffer contents into `mv` (must be exactly nbytes long); returns the flattened view that was filled
+    mv = flat_mv(mv)
+    assert len(mv) == self.nbytes, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
+    assert self.is_allocated(), "can't copyout unallocated buffer"
+    self.allocator.copyout(mv, self._buf)
+    return mv
+  def view(self, size:int, dtype:DType, offset:int) -> Buffer:
+    # Create a (not yet allocated) view of `size` elements of `dtype` starting `offset` into this buffer.
+    assert offset < self.nbytes, "offset must be less than nbytes"
+    # a view of a view is flattened: it points at the original base with the two offsets added together
+    if self._base is not None: return Buffer(self.device, size, dtype, base=self._base, offset=self.offset+offset)
+    return Buffer(self.device, size, dtype, base=self, offset=offset)
+# TODO: size, dest, src are the same type. can we enforce this?
+class Allocator:
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
+    return self._alloc(size, options if options is not None else BufferOptions())
+  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
+    self._free(opaque, options if options is not None else BufferOptions())
+  def _free(self, opaque, options:BufferOptions): pass  # if opaque is a Python object, you don't need a free
+  def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
+  def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
+class LRUAllocator(Allocator):  # pylint: disable=abstract-method
+  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    if len(c := self.cache[(size, options)]): return c.pop()
+    try: return super().alloc(size, options)
+    except (RuntimeError, MemoryError):
+      self.free_cache()
+      return super().alloc(size, options)
+  def free_cache(self):
+    for (sz,options),opaques in self.cache.items():
+      for opaque in opaques: super().free(opaque, sz, options)
+      opaques.clear()
+  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
+    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
+    else: super().free(opaque, size, options)
+class _MallocAllocator(LRUAllocator):
+  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
+  def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
+  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
+  def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
+  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])
+MallocAllocator = _MallocAllocator()
+# **************** for Compiled Devices ****************
+class CompileError(Exception): pass
+class Compiler:
+  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
+  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile_cached(self, src:str) -> bytes:
+    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
+      assert not getenv("ASSERT_COMPILE"), "tried to compile with ASSERT_COMPILE set"
+      lib = self.compile(src)
+      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
+    return lib
+class Compiled:
+  def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
+    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
+    self.renderer = renderer if renderer else Renderer()
+  def synchronize(self): pass  # override this in your device

tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

tinygrad 0.7.0py3-none-any.whl → 0.9.0py3-none-any.whl