tinygrad 0.8.0-py3-none-any.whl → 0.9.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
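The renames in this list also move where code that reaches into tinygrad internals finds things. A rough mapping, inferred only from the file moves above; the right-hand import paths are assumptions, not taken from the diff itself:

# 0.8.0 import path                            -> 0.9.1 import path (inferred from the renames above)
# tinygrad.jit.TinyJit                         -> tinygrad.engine.jit.TinyJit
# tinygrad.realize.run_schedule                -> tinygrad.engine.realize.run_schedule
# tinygrad.features.search / features.multi    -> tinygrad.engine.search / tinygrad.multi
# tinygrad.mlops                               -> tinygrad.function
from tinygrad.engine.jit import TinyJit
from tinygrad.engine.realize import run_schedule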
tinygrad/dtype.py CHANGED
@@ -1,39 +1,39 @@
- from typing import NamedTuple, Final, Optional, ClassVar, Set, Tuple, Dict
- import numpy as np # TODO: remove numpy
+ from typing import Final, Optional, ClassVar, Set, Tuple, Dict, Union
+ from dataclasses import dataclass
  import functools
+ from tinygrad.helpers import getenv

- # TODO: migrate this from NamedTuple -> dataclass
- class DType(NamedTuple):
+ ConstType = Union[float, int, bool]
+
+ @dataclass(frozen=True, order=True)
+ class DType:
  priority: int # this determines when things get upcasted
  itemsize: int
  name: str
- np: Optional[type] # TODO: someday this will be removed with the "remove numpy" project
- sz: int = 1
- def __repr__(self): return f"dtypes.{INVERSE_DTYPES_DICT[self]}" if self.sz == 1 else f"dtypes._{INVERSE_DTYPES_DICT[self.scalar()]}{self.sz}"
+ fmt: Optional[str]
+ count: int
+ def __repr__(self): return f"dtypes.{'_'*(c:=self.count!=1)}{INVERSE_DTYPES_DICT[self.name if not c else self.scalar().name]}{str(self.count)*c}"
  def vec(self, sz:int):
- assert sz > 1 and self.sz == 1, f"can't vectorize {self} with size {sz}"
- return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self]}{sz}", None, sz)
- def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.sz))]] if self.sz > 1 else self
+ assert sz > 1 and self.count == 1, f"can't vectorize {self} with size {sz}"
+ return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self.name]}{sz}", None, sz)
+ def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.count))]] if self.count > 1 else self

  # dependent typing?
+ @dataclass(frozen=True, repr=False)
  class ImageDType(DType):
- def __new__(cls, priority, itemsize, name, np, shape, base):
- return super().__new__(cls, priority, itemsize, name, np)
- def __init__(self, priority, itemsize, name, np, shape, base):
- self.shape: Tuple[int, ...] = shape # arbitrary arg for the dtype, used in image for the shape
- self.base: DType = base
- super().__init__()
+ shape: Tuple[int, ...] # arbitrary arg for the dtype, used in image for the shape
+ base: DType
  def scalar(self): return self.base
  def vec(self, sz:int): return self.base.vec(sz)
  def __repr__(self): return f"dtypes.{self.name}({self.shape})"
- # TODO: fix this to not need these
- def __hash__(self): return hash((super().__hash__(), self.shape))
- def __eq__(self, x): return super().__eq__(x) and self.shape == x.shape
- def __ne__(self, x): return super().__ne__(x) or self.shape != x.shape

+ # @dataclass(frozen=True, init=False, repr=False, eq=False)
  class PtrDType(DType):
- def __new__(cls, dt:DType): return super().__new__(cls, dt.priority, dt.itemsize, dt.name, dt.np, dt.sz)
+ def __init__(self, dt:DType): super().__init__(dt.priority, dt.itemsize, dt.name, dt.fmt, dt.count)
  def __repr__(self): return f"ptr.{super().__repr__()}"
+ def __hash__(self): return super().__hash__()
+ def __eq__(self, dt): return self.priority==dt.priority and self.itemsize==dt.itemsize and self.name==dt.name and self.count==dt.count
+ def __ne__(self, dt): return not (self == dt)

  class dtypes:
  @staticmethod
@@ -43,25 +43,31 @@ class dtypes:
  @staticmethod
  def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
  @staticmethod
- def from_np(x) -> DType: return DTYPES_DICT[np.dtype(x).name]
- @staticmethod # NOTE: isinstance(True, int) is True in python
- def from_py(x) -> DType: return dtypes.default_float if isinstance(x, float) else dtypes.bool if isinstance(x, bool) else dtypes.default_int
+ def from_py(x) -> DType:
+ if x.__class__ is float: return dtypes.default_float
+ if x.__class__ is int: return dtypes.default_int
+ if x.__class__ is bool: return dtypes.bool
+ # put this in the last is faster because there are more items than lists/tuples to check
+ if x.__class__ is list or x.__class__ is tuple: return max(dtypes.from_py(xi) for xi in x) if x else dtypes.default_float
+ raise RuntimeError(f"Could not infer dtype of {x} with type {type(x)}")
+ @staticmethod
+ def as_const(val: ConstType, dtype:DType): return int(val) if dtypes.is_int(dtype) else float(val) if dtypes.is_float(dtype) else bool(val)
  @staticmethod
  def fields() -> Dict[str, DType]: return DTYPES_DICT
- bool: Final[DType] = DType(0, 1, "bool", np.bool_)
- int8: Final[DType] = DType(1, 1, "char", np.int8)
- uint8: Final[DType] = DType(2, 1, "unsigned char", np.uint8)
- int16: Final[DType] = DType(3, 2, "short", np.int16)
- uint16: Final[DType] = DType(4, 2, "unsigned short", np.uint16)
- int32: Final[DType] = DType(5, 4, "int", np.int32)
- uint32: Final[DType] = DType(6, 4, "unsigned int", np.uint32)
- int64: Final[DType] = DType(7, 8, "long", np.int64)
- uint64: Final[DType] = DType(8, 8, "unsigned long", np.uint64)
- float16: Final[DType] = DType(9, 2, "half", np.float16)
+ bool: Final[DType] = DType(0, 1, "bool", '?', 1)
+ int8: Final[DType] = DType(1, 1, "char", 'b', 1)
+ uint8: Final[DType] = DType(2, 1, "unsigned char", 'B', 1)
+ int16: Final[DType] = DType(3, 2, "short", 'h', 1)
+ uint16: Final[DType] = DType(4, 2, "unsigned short", 'H', 1)
+ int32: Final[DType] = DType(5, 4, "int", 'i', 1)
+ uint32: Final[DType] = DType(6, 4, "unsigned int", 'I', 1)
+ int64: Final[DType] = DType(7, 8, "long", 'l', 1)
+ uint64: Final[DType] = DType(8, 8, "unsigned long", 'L', 1)
+ float16: Final[DType] = DType(9, 2, "half", 'e', 1)
  # bfloat16 has higher priority than float16, so least_upper_dtype(dtypes.int64, dtypes.uint64) = dtypes.float16
- bfloat16: Final[DType] = DType(10, 2, "__bf16", None)
- float32: Final[DType] = DType(11, 4, "float", np.float32)
- float64: Final[DType] = DType(12, 8, "double", np.float64)
+ bfloat16: Final[DType] = DType(10, 2, "__bf16", None, 1)
+ float32: Final[DType] = DType(11, 4, "float", 'f', 1)
+ float64: Final[DType] = DType(12, 8, "double", 'd', 1)

  # dtype aliases
  half = float16; float = float32; double = float64 # noqa: E702
@@ -70,13 +76,17 @@ class dtypes:

  # NOTE: these are image dtypes
  @staticmethod
- def imageh(shp): return ImageDType(100, 2, "imageh", np.float16, shp, dtypes.float32)
+ def imageh(shp): return ImageDType(100, 2, "imageh", 'e', 1, shape=shp, base=dtypes.float32)
  @staticmethod
- def imagef(shp): return ImageDType(100, 4, "imagef", np.float32, shp, dtypes.float32)
+ def imagef(shp): return ImageDType(100, 4, "imagef", 'f', 1, shape=shp, base=dtypes.float32)

  default_float: ClassVar[DType] = float32
  default_int: ClassVar[DType] = int32

+ if (env_default_float := getenv("DEFAULT_FLOAT", "")):
+ dtypes.default_float = getattr(dtypes, env_default_float.lower())
+ assert dtypes.is_float(dtypes.default_float), f"{env_default_float} is not a float dtype"
+
  # https://jax.readthedocs.io/en/latest/jep/9407-type-promotion.html
  # we don't support weak type and complex type
  promo_lattice = { dtypes.bool: [dtypes.int8, dtypes.uint8], dtypes.int8: [dtypes.int16], dtypes.int16: [dtypes.int32], dtypes.int32: [dtypes.int64],
@@ -94,4 +104,10 @@ def least_upper_float(dt:DType) -> DType: return dt if dtypes.is_float(dt) else

  # HACK: staticmethods are not callable in 3.8 so we have to compare the class
  DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if not (k.startswith(('__', 'default')) or v.__class__ is staticmethod)}
- INVERSE_DTYPES_DICT = {v:k for k,v in DTYPES_DICT.items()}
+ INVERSE_DTYPES_DICT = {v.name:k for k,v in DTYPES_DICT.items()}
+
+ def sum_acc_dtype(dt:DType):
+ # default acc dtype for sum
+ if dtypes.is_unsigned(dt): return least_upper_dtype(dt, dtypes.uint)
+ if dtypes.is_int(dt) or dt == dtypes.bool: return least_upper_dtype(dt, dtypes.int)
+ return least_upper_dtype(dt, dtypes.float)
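For reference, a short sketch of how the reworked DType behaves in 0.9.1: it is now a frozen, ordered dataclass with a struct-style fmt character and a count in place of np/sz. The snippet is illustrative only, not taken from the diff, and assumes the DEFAULT_FLOAT environment variable is unset:

from tinygrad.dtype import dtypes, sum_acc_dtype

assert dtypes.float32.fmt == 'f' and dtypes.float32.count == 1   # struct format char replaces the numpy type
vec4 = dtypes.float32.vec(4)                                     # itemsize 16, count 4
assert vec4.itemsize == 16 and vec4.scalar() == dtypes.float32   # scalar() maps a vector dtype back to its base
assert dtypes.from_py([1, 2.5]) == dtypes.default_float          # lists/tuples promote to the widest element dtype
assert sum_acc_dtype(dtypes.int16) == dtypes.int32               # sums accumulate in at least the default int width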
tinygrad/engine/graph.py ADDED
@@ -0,0 +1,100 @@
+ import os, atexit, functools, contextlib
+ from collections import defaultdict
+ from typing import List, Any, DefaultDict, Union
+ from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LoadOps, BufferOps, TernaryOps, LazyOp
+ from tinygrad.device import Device
+ from tinygrad.helpers import GRAPHPATH, DEBUG, GlobalCounters, getenv
+ from tinygrad.codegen.uops import UOps, UOp, UPat
+ from tinygrad.shape.symbolic import NumNode
+ from tinygrad.lazy import LazyBuffer
+
+ with contextlib.suppress(ImportError): import networkx as nx
+
+ # **** debugging and graphing ****
+
+ if DEBUG >= 2:
+ def print_globalcounters():
+ if GlobalCounters.time_sum_s == 0: return
+ print(f"avg: {GlobalCounters.global_ops*1e-9/GlobalCounters.time_sum_s:8.2f} GFLOPS {GlobalCounters.global_mem*1e-9/GlobalCounters.time_sum_s:8.2f} GB/s", # noqa: E501
+ f"{' '*10}total: {GlobalCounters.kernel_count:5d} kernels {GlobalCounters.global_ops*1e-9:8.2f} GOPS {GlobalCounters.global_mem*1e-9:8.2f} GB {GlobalCounters.time_sum_s*1e3:8.2f} ms") # noqa: E501
+ atexit.register(print_globalcounters)
+
+ def save_graph(G, fn, opt=""):
+ print("saving", G, f"to {fn}.svg")
+ nx.drawing.nx_pydot.write_dot(G, f'{fn}.dot')
+ os.system(f'dot {opt} -Tsvg {fn}.dot -o {fn}.svg')
+
+ G:Any = None
+ def init_graph():
+ global G
+ if G is not None: return
+ G = nx.DiGraph()
+ atexit.register(functools.partial(save_graph, G, GRAPHPATH)) # -Gnslimit=100 can make it finish, but you won't like results
+
+ counts: DefaultDict[type, int] = defaultdict(int)
+ def nm(x):
+ if not hasattr(x, 'node_id'):
+ setattr(x, 'node_id', counts[type(x)])
+ counts[type(x)] += 1
+ return x.node_id
+
+ def realized_lazybuffer(lb:'LazyBuffer', num):
+ init_graph()
+ G.nodes[nm(lb)]['style'] = '"filled,bold"'
+ G.nodes[nm(lb)]['fillcolor'] = G.nodes[nm(lb)]['fillcolor'][:-2]
+ G.nodes[nm(lb)]['label'] = '"' + G.nodes[nm(lb)]["label"].replace('"', '') + f'\nK:{num}"'
+
+ top_colors = {LoadOps: '#FFFFa0', UnaryOps: "#c0c0c0", ReduceOps: "#FFA0A0", BinaryOps: "#c0c0c0",
+ TernaryOps: "#c0c0c0", BufferOps: '#a0a0ff'}
+ def log_lazybuffer(lb:'LazyBuffer', scheduled=False):
+ init_graph()
+ if lb.base.realized is None and lb.base.op is LoadOps.CONST: return
+ if lb.base != lb:
+ offset = lb.st.expr_idxs([NumNode(0)] * len(lb.st.shape))[0]
+ label = f"{lb.st.shape}\n{lb.st.real_strides()}" + (f"\n{offset}" if offset != 0 else "")
+ G.add_node(nm(lb), style='"filled,dashed"', fillcolor="#80ff8080", color="black", label=label)
+ G.add_edge(nm(lb.base), nm(lb), color='#00000060')
+ lb = lb.base
+ if lb.realized is None:
+ label_append = []
+ for idx,x in enumerate(lb.srcs):
+ if nm(x) not in G.nodes: log_lazybuffer(x)
+ if x.base.realized is None and x.base.op is LoadOps.CONST:
+ label_append.append(f"\nCONST{idx} {x.base.arg:g}")
+ else:
+ G.add_edge(nm(x), nm(lb), color='#a0a0a0')
+ label = '"' + \
+ (str(set(x.shape for x in lb.srcs))+"\n"+str(lb.shape) if lb.op in ReduceOps else str(lb.shape)) + \
+ (f"\n{lb.dtype.name}" if lb.dtype.name != "float" else "")+f"\n{lb.op}"+(f"\n{lb.arg}" if lb.op in {LoadOps.CONST, UnaryOps.CAST} else "") + \
+ (f"\n{lb.device}" if lb.device != Device.DEFAULT else "") + ''.join(label_append) + '"'
+ G.add_node(nm(lb), style='"filled,dashed"', fillcolor=[v for k,v in top_colors.items() if lb.op in k][0] + "80", color="black", label=label)
+ if scheduled: G.nodes[nm(lb)]['shape'] = 'box'
+ else:
+ if nm(lb) not in G.nodes:
+ # realized but unseen?
+ G.add_node(nm(lb), label=f'"{str(lb.base.realized)[5:-1].replace(" ", chr(10))}\nb:{nm(lb.realized)}"', style='filled', fillcolor="#f0c08080")
+
+ def _tree(dag:Union[LazyOp, UOp, UPat], cycles, cnt):
+ cnt[0] += 1
+ src = dag.src if isinstance(dag.src, (list, tuple)) else [] if dag.src is None else [dag.src]
+ if len(src) == 0: return [f"━━ {dag.op} {dag.arg}"]
+ if (lid := id(dag)) in cycles and cycles[lid][1] > (tcnt := getenv("TREE_CYCLE_CNT", 5)) and tcnt >= 0:
+ return [f"━⬆︎ goto {cycles[id(dag)][0]}: {dag.op}"]
+ cycles[lid] = (cnt[0], 1 if lid not in cycles else cycles[lid][1]+1)
+ lines = [f"━┳ {dag.op} {dag.arg}"]
+ childs = [_tree(c, cycles, cnt) for c in src]
+ for c in childs[:-1]: lines += [f" ┣{c[0]}"] + [f" ┃{l}" for l in c[1:]]
+ return lines + [" ┗"+childs[-1][0]] + [" "+l for l in childs[-1][1:]]
+
+ def print_tree(dag:Union[LazyOp, UOp, UPat]): print("\n".join([f"{str(i).rjust(3)} {s}" for i,s in enumerate(_tree(dag, {}, [-1]))]))
+
+ def graph_uops(uops:List[UOp]):
+ colors = {UOps.ALU: "#ffffc0", UOps.LOAD: "#ffc0c0", UOps.STORE: "#c0ffc0", UOps.SPECIAL: "#c0c0ff", UOps.CONST: "#e0e0e0",
+ UOps.DEFINE_GLOBAL: "#ffe0b0", UOps.DEFINE_LOCAL: "#ffe0d0", UOps.DEFINE_ACC: "#f0ffe0",
+ UOps.RANGE: "#c8a0e0", UOps.PHI: "#e0ffc0", UOps.BARRIER: "#ff8080", UOps.IF: "#c8b0c0"}
+ G = nx.DiGraph()
+ for u in uops:
+ if u.op in {UOps.ENDRANGE, UOps.ENDIF}: continue
+ G.add_node(uops.index(u), label=f"{str(u.op)[5:]}{(' '+str(u.arg).replace(':', '')) if u.arg is not None else ''}\n{str(u.dtype)}", style="filled", fillcolor=colors.get(u.op, "#ffffff")) # noqa: E501
+ for v in u.src: G.add_edge(uops.index(v), uops.index(u))
+ save_graph(G, f'{GRAPHPATH}.uops', '-Grankdir=LR')
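These helpers are usually reached through the GRAPH and DEBUG environment variables rather than called directly. A small sketch of direct use; create_schedule from tinygrad.engine.schedule is an assumption here (that file is listed above but its contents are not shown in this diff), and the graph paths additionally need networkx and graphviz installed:

# GRAPH=1 builds the lazybuffer graph via log_lazybuffer/realized_lazybuffer and writes $GRAPHPATH.svg at exit
# DEBUG>=3 prints each kernel's AST with print_tree before it is linearized
from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule   # assumed API, not shown in this diff
from tinygrad.engine.graph import print_tree

out = (Tensor.rand(4, 4) + 1).sum()
for si in create_schedule([out.lazydata]):
  for op in si.ast: print_tree(op)   # renders the LazyOp tree with the box-drawing characters above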
tinygrad/engine/jit.py ADDED
@@ -0,0 +1,198 @@
+ from __future__ import annotations
+ from typing import TypeVar, Generic, Callable, List, Tuple, Union, Dict, cast, Optional, Any
+ import functools, itertools, collections
+ from tinygrad.tensor import Tensor
+ from tinygrad.lazy import LazyBuffer
+ from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, ContextVar, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
+ from tinygrad.device import Buffer, Compiled, Device
+ from tinygrad.dtype import DType
+ from tinygrad.shape.shapetracker import ShapeTracker
+ from tinygrad.shape.symbolic import Variable, sint
+ from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner
+ from tinygrad.engine.schedule import _internal_memory_planner
+ from tinygrad.nn.state import get_parameters
+ from weakref import WeakKeyDictionary
+
+ def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]) -> List[ExecItem]:
+ # Split JIT cache into batches for faster graph execution.
+ # This allows the accelerator to run some batches while subsequent graphs are still being updated.
+ max_batch_size = getenv("JIT_BATCH_SIZE", 32)
+ graphed_jit_cache: List[ExecItem] = []
+ current_batch: List[ExecItem] = []
+ current_device: Optional[Compiled] = None
+
+ def flush_batch():
+ nonlocal current_batch, current_device, max_batch_size
+ try:
+ if len(current_batch) <= 1 or current_device is None: raise GraphException("only one kernel doesn't graph")
+ graph_runner = current_device.graph(current_batch, input_rawbuffers, var_vals)
+ # clear jit inputs to allow their memory to be freed/reused
+ for (j,i) in graph_runner.input_replace.keys(): graph_runner.jit_cache[j].bufs[i] = None
+ graphed_jit_cache.append(ExecItem(graph_runner, cast(List[Optional[Buffer]], input_rawbuffers)))
+ max_batch_size *= 2
+ if DEBUG >= 2: print(f"\tJIT GRAPHing batch with {len(current_batch)} kernels on device {current_device}")
+ except GraphException as e:
+ graphed_jit_cache.extend(current_batch)
+ if DEBUG >= 2: print(f"\tJIT GRAPHing failed batch with {len(current_batch)} kernels on device {current_device}: {e}")
+ current_batch = []
+ current_device = None
+
+ for ji in jit_cache:
+ if ji.prg.__class__ in {EmptyOp, ViewOp}: continue
+ ji_graph_dev: Optional[Compiled] = None # device on which the ji will be graphed. Not graphed if None.
+ if isinstance(ji.prg, CompiledRunner): ji_graph_dev = ji.prg.device
+ elif isinstance(ji.prg, BufferXfer) and ji.bufs[0] and ji.bufs[0].device.split(":", 1)[0] in {"CUDA", "NV", "AMD"}:
+ ji_graph_dev = Device[ji.bufs[0].device]
+
+ graph_class = (ji_graph_dev.graph.func if isinstance(ji_graph_dev.graph, functools.partial) else ji_graph_dev.graph) if ji_graph_dev else None #type: ignore
+ can_be_graphed = ji_graph_dev and ji_graph_dev.graph
+ can_share_graph = (ji_graph_dev == current_device or (isinstance(graph_class, type) and issubclass(graph_class, MultiGraphRunner)) and
+ type(ji_graph_dev) == type(current_device))
+ can_extend_graph_batch = can_be_graphed and len(current_batch) < max_batch_size and can_share_graph
+ if not can_extend_graph_batch and len(current_batch) > 0: flush_batch()
+
+ if can_be_graphed: current_batch.append(ji)
+ else: graphed_jit_cache.append(ji)
+
+ current_device = ji_graph_dev
+
+ if len(current_batch) > 0: flush_batch()
+ return graphed_jit_cache
+
+ def get_input_replace(jit_cache: List[ExecItem], input_rawbuffers:List[Buffer]) -> Dict[Tuple[int, int], int]:
+ input_replace: Dict[Tuple[int, int], int] = {}
+ for j,ji in enumerate(jit_cache):
+ for i,a in enumerate(ji.bufs):
+ if a in input_rawbuffers:
+ input_replace[(j,i)] = input_rawbuffers.index(a)
+ return input_replace
+
+ class GraphRunner(Runner): # pylint: disable=abstract-method
+ def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
+ self.jit_cache = jit_cache
+ self.input_replace = get_input_replace(jit_cache, input_rawbuffers)
+ self.jc_idx_with_updatable_launch_dims = []
+ self.jc_idx_with_updatable_var_vals = []
+ op_estimate: sint = 0
+ mem_estimate: sint = 0
+ for j,ji in enumerate(jit_cache):
+ op_estimate += ji.prg.op_estimate
+ mem_estimate += ji.prg.mem_estimate
+ if isinstance(ji.prg, CompiledRunner):
+ if ji.prg.p.vars: self.jc_idx_with_updatable_var_vals.append(j)
+ if (ji.prg.p.global_size and not all_int(ji.prg.p.global_size)) or (ji.prg.p.local_size and not all_int(ji.prg.p.local_size)):
+ self.jc_idx_with_updatable_launch_dims.append(j)
+ self.vars = sorted(var_vals.keys(), key=lambda v: v.expr)
+ super().__init__(colored(f"<batched {len(self.jit_cache)}>", "cyan"), jit_cache[0].prg.dname.split(":")[0], op_estimate, mem_estimate)
+
+ class MultiGraphRunner(GraphRunner): # pylint: disable=abstract-method
+ def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
+ self.w_dependency_map: Dict[Any, Any] = {}
+ self.r_dependency_map: Dict[Any, List[Any]] = collections.defaultdict(list)
+ super().__init__(jit_cache, input_rawbuffers, var_vals)
+
+ def _access_resources(self, read, write, new_dependency:Any):
+ # To synchronize access to resources, we monitor the necessary prerequisites for accessing each resource,
+ # whether for write or read operations. A resource can be accessed by either a single writer or multiple readers.
+ wait_nodes = []
+
+ for rawbuf in read + write:
+ if id(rawbuf.base._buf) in self.w_dependency_map: wait_nodes.append(self.w_dependency_map[id(rawbuf.base._buf)])
+ for rawbuf in write:
+ if id(rawbuf.base._buf) in self.r_dependency_map: wait_nodes.extend(self.r_dependency_map.pop(id(rawbuf.base._buf)))
+
+ for rawbuf in read: self.r_dependency_map[id(rawbuf.base._buf)].append(new_dependency)
+ for rawbuf in write: self.w_dependency_map[id(rawbuf.base._buf)] = new_dependency
+ return list({id(x):x for x in wait_nodes}.values())
+
+ ReturnType = TypeVar('ReturnType')
+ IN_JIT = ContextVar('IN_JIT', 0)
+ class TinyJit(Generic[ReturnType]):
+ def __init__(self, fxn:Callable[..., ReturnType]):
+ self.fxn = fxn
+ self.reset()
+
+ def add_buffer(self, b:Buffer) -> Buffer:
+ if found:=self.buffer_replace.get(b, None): return found
+ if b.is_allocated() or b.lb_refcount > 0: return b
+ if b._base is not None:
+ self.buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, base=self.buffer_replace.get(b._base, b._base), offset=b.offset)
+ else:
+ self.buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, options=b.options)
+ return ret
+
+ def add(self, ei:ExecItem):
+ self.jit_cache.append(ExecItem(ei.prg, [self.add_buffer(buf) for buf in ei.bufs if buf is not None]))
+
+ def reset(self):
+ self.jit_cache: List[ExecItem] = []
+ self.input_replace: Dict[Tuple[int, int], int] = {}
+ self.extra_view_inputs: List[Tuple[int, int, str, int, DType]] = []
+ self.buffer_replace: WeakKeyDictionary[Buffer, Buffer] = WeakKeyDictionary()
+ self.cnt: int = 0
+
+ def __get__(self, obj, objtype): return functools.partial(self.__call__, obj) # add support for instance methods
+
+ def __call__(self, *args, **kwargs) -> ReturnType:
+ input_tensors: List[Tuple[Union[int, str], Tensor]] = \
+ [(cast(Union[int, str], name),t) for name,t in itertools.chain(enumerate(args), sorted(kwargs.items())) if t.__class__ is Tensor]
+ if input_tensors: Tensor.realize(*[t for _,t in input_tensors])
+ names: List[Union[int, str]] = [name for name,_ in input_tensors]
+ lbs: List[LazyBuffer] = flatten([t.lazydata.lbs for _,t in input_tensors])
+ st_varvals_dtype_device = [(*lb.st.unbind(), lb.dtype, lb.device) for lb in lbs]
+ input_buffers: List[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
+ assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT"
+ var_vals: Dict[Variable, int] = merge_dicts([varvals for _,varvals,_,_ in st_varvals_dtype_device] + \
+ [dict(v.unbind() for v in itertools.chain(args, kwargs.values()) if isinstance(v, Variable))])
+ st_vars_dtype_device = [(x[0], tuple(sorted(x[1].keys(), key=lambda v: v.expr)), x[2], x[3]) for x in st_varvals_dtype_device]
+ if not JIT or self.cnt == 0:
+ if IN_JIT: raise RuntimeError("having TinyJit inside another TinyJit is not supported")
+ # jit ignore
+ with Context(BEAM=0 if getenv("IGNORE_JIT_FIRST_BEAM") else BEAM.value, IN_JIT=1):
+ self.ret = self.fxn(*args, **kwargs)
+ if len(params:=get_parameters(self.ret)): Tensor.realize(params[0], *params[1:])
+ elif self.cnt == 1:
+ # jit capture
+ self.expected_names: List[Union[int, str]] = names
+ self.expected_st_vars_dtype_device: List[Tuple[ShapeTracker, Tuple[Variable, ...], DType, str]] = st_vars_dtype_device
+ with Context(GRAPH=getenv("JITGRAPH", GRAPH.value), BEAM=getenv("JITBEAM", BEAM.value)):
+ capturing.append(self)
+ self.ret = self.fxn(*args, **kwargs)
+ if len(params:=get_parameters(self.ret)): Tensor.realize(params[0], *params[1:])
+ capturing.clear()
+ del self.buffer_replace
+ assert len(self.jit_cache), "didn't JIT anything!"
+ if DEBUG >= 1: print(f"JIT captured {len(self.jit_cache)} kernels with {len(input_buffers)} inputs")
+
+ # track inputs that are views of buffers
+ for item in self.jit_cache:
+ for b in item.bufs:
+ if b is not None and b._base is not None and b._base in input_buffers:
+ input_buffers.append(b)
+ self.extra_view_inputs.append((input_buffers.index(b.base), b.offset, b.device, b.size, b.dtype))
+
+ # memory planning (optional)
+ assigned = _internal_memory_planner([cast(List[Buffer], item.bufs) for item in self.jit_cache], debug_prefix="JIT ")
+ self.jit_cache = [ExecItem(item.prg, [assigned.get(b,b).ensure_allocated() for b in item.bufs if b is not None]) for item in self.jit_cache]
+
+ # Condense the items into a graph executor.
+ if JIT < 2: self.jit_cache = apply_graph_to_jit(self.jit_cache, input_buffers, var_vals)
+
+ self.input_replace = get_input_replace(self.jit_cache, input_buffers)
+ if DEBUG >= 1 and len(set(self.input_replace.values())) != len(input_buffers): print("WARNING: some input tensors not found")
+ elif self.cnt >= 2:
+ # jit exec
+ assert self.expected_names == names, f"args mismatch in JIT: {self.expected_names=} != {names}"
+ assert self.expected_st_vars_dtype_device == st_vars_dtype_device, \
+ f"args mismatch in JIT: {self.expected_st_vars_dtype_device=} != {st_vars_dtype_device=}"
+ for idx, offset, device, size, dtype in self.extra_view_inputs:
+ input_buffers.append(Buffer(device, size, dtype, base=input_buffers[idx], offset=offset).ensure_allocated())
+ for (j,i),input_idx in self.input_replace.items(): self.jit_cache[j].bufs[i] = input_buffers[input_idx]
+ if DEBUG >= 1 and len(self.jit_cache) >= 10: print(f"jit execs {len(self.jit_cache)} kernels")
+ for ei in self.jit_cache: ei.run(var_vals, jit=True)
+
+ # clear jit inputs
+ for (j,i) in self.input_replace.keys(): self.jit_cache[j].bufs[i] = None
+
+ self.cnt += 1
+ return self.ret
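For orientation, a minimal TinyJit usage sketch: the decorator populates jit_cache through the capturing hook on the second call and replays the captured ExecItems afterwards. The example itself is illustrative, not taken from the diff:

from tinygrad import Tensor
from tinygrad.engine.jit import TinyJit   # also re-exported at the package top level in 0.9.1

@TinyJit
def step(x: Tensor) -> Tensor:
  return (x @ x.T).relu().sum().realize()

for i in range(4):                               # cnt 0: run eagerly, cnt 1: capture ExecItems, cnt >= 2: replay (optionally batched into a GraphRunner)
  print(i, step(Tensor.rand(16, 16)).item())     # inputs must keep the same shape/dtype/device across calls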
tinygrad/engine/realize.py ADDED
@@ -0,0 +1,192 @@
+ from typing import List, Dict, Optional, cast, Generator, Tuple
+ import time
+ from dataclasses import dataclass, replace
+ from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING
+ from tinygrad.ops import BufferOps, LoadOps, LazyOp
+ from tinygrad.device import Device, Buffer
+ from tinygrad.shape.symbolic import Variable, sym_infer, sint
+ from tinygrad.renderer import Renderer, Program
+ from tinygrad.codegen.linearizer import Linearizer
+ from tinygrad.engine.schedule import ScheduleItem
+
+ # **************** Program Creation ****************
+
+ logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
+ def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
+ if DEBUG >= 3:
+ from tinygrad.engine.graph import print_tree
+ for op in ast: print_tree(op)
+ k = Linearizer(*ast, opts=renderer)
+ k.required_optimizations()
+ if not NOOPT:
+ if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
+ if BEAM >= 1:
+ from tinygrad.engine.search import beam_search, time_linearizer, bufs_from_lin
+ kb, k_opt = Linearizer(*ast, opts=renderer), k
+ kb.required_optimizations()
+ rawbufs = bufs_from_lin(kb, allocate=False)
+ k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
+ if getenv("BEAM_COMPARE", 1):
+ # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
+ lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
+ if used_tensor_cores:
+ lins.append(("hc", Linearizer(*ast, opts=renderer)))
+ lins[-1][1].hand_coded_optimizations()
+ timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
+ if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
+ k = timed[0][1]
+ if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
+ # TODO: check the correctness inline once compare_linearizer is in core
+ if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
+ if DEBUG >= 5: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
+ return k
+
+ # **************** Runners ****************
+
+ class Runner:
+ def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
+ self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
+ @property
+ def device(self): return Device[self.dname]
+ def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
+ return self(rawbufs, {} if var_vals is None else var_vals)
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+ raise NotImplementedError("override this")
+
+ class CompiledRunner(Runner):
+ def __init__(self, p:Program, precompiled:Optional[bytes]=None):
+ if DEBUG >= 4: print(p.src)
+ self.p:Program = p
+ self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
+ self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
+ super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
+
+ def __reduce__(self): return self.__class__, (self.p, self.lib)
+
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+ global_size, local_size = self.p.launch_dims(var_vals)
+ if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
+ # TODO: this is copied from get_program
+ from tinygrad.engine.search import optimize_local_size
+ local_size = optimize_local_size(self.clprg, global_size, rawbufs)
+ global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
+ self.p = replace(self.p, global_size=global_size, local_size=local_size)
+ lra = {}
+ if global_size:
+ lra['global_size'] = global_size
+ assert len(global_size) == 3, "global size must have len 3"
+ if local_size:
+ lra['local_size'] = local_size
+ assert len(local_size) == 3, "local size must have len 3"
+ return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
+
+ class CustomOp(Runner):
+ def __init__(self, fxn):
+ self.fxn = fxn
+ super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
+
+ class EmptyOp(Runner):
+ def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
+
+ class ViewOp(Runner):
+ def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+ assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
+
+ class BufferCopy(Runner):
+ def __init__(self, total_sz, dest_device, src_device):
+ if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+ else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+ super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
+ def copy(self, dest, src):
+ disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.device, 'io_uring') and hasattr(src.allocator.device, 'fd')
+ if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_disk') and disk_supports_fast_copyout and src.nbytes >= 4096:
+ dest.allocator.copy_from_disk(dest._buf, src._buf, src.nbytes)
+ elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
+ # fast(ish) path, uses readinto in diskbuffers
+ src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
+ else:
+ dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
+ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+ dest, src = rawbufs[0:2]
+ assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
+ st = time.perf_counter()
+ self.copy(dest, src)
+ if wait:
+ Device[dest.device].synchronize()
+ return time.perf_counter() - st
+
+ class BufferXfer(BufferCopy):
+ def copy(self, dest, src):
+ if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
+ dest.allocator.device.track_cross_buffer.append(src)
+ src.allocator.track_cross_device.add(dest.allocator.device)
+ dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
+
+ # **************** method cache ****************
+
+ method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
+ def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
+ ckey = (dname, ast, BEAM.value, False)
+ if cret:=method_cache.get(ckey): return cret
+ bkey = (dname.split(":")[0], ast, BEAM.value, True)
+ if bret:=method_cache.get(bkey):
+ method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib)
+ else:
+ prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
+ if hasattr(prg.uops, "fuzz_paths"):
+ from test.external.fuzz_uops import UOpsFuzzerRunner
+ return UOpsFuzzerRunner(replace(prg, dname=dname))
+ method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))
+ return ret
+
+ # **************** lowering functions ****************
+
+ @dataclass(frozen=True)
+ class ExecItem:
+ prg: Runner
+ bufs: List[Optional[Buffer]]
+ def run(self, var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]:
+ bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in self.bufs]
+ et = self.prg(bufs, var_vals if var_vals is not None else {}, wait=wait or DEBUG >= 2)
+ if do_update_stats:
+ GlobalCounters.kernel_count += 1
+ GlobalCounters.global_ops += (op_estimate:=sym_infer(self.prg.op_estimate, var_vals))
+ GlobalCounters.global_mem += (mem_estimate:=sym_infer(self.prg.mem_estimate, var_vals))
+ if et is not None: GlobalCounters.time_sum_s += et
+ if DEBUG >= 2:
+ ptm = (colored(f"{et*1e3:9.2f}ms", "yellow") if et > 0.01 else f"{et*1e6:9.2f}us") if et is not None else ""
+ print(f"{colored(f'*** {self.prg.dname[:7]:7s} {GlobalCounters.kernel_count:4d}', 'magenta' if jit else ('green' if self.prg.first_run else None))} {self.prg.display_name+' '*(38-ansilen(self.prg.display_name))} arg {len(self.bufs):3d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " + # noqa: E501
+ (str() if et is None else f"tm {ptm}/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_estimate/((et or 1e-20)*1e9):8.2f} GFLOPS, {mem_estimate/((et or 1e-20)*1e9):7.2f} GB/s)")) # noqa: E501
+ self.prg.first_run = False
+ return et
+
+ def lower_schedule_item(si:ScheduleItem) -> ExecItem:
+ assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL")
+ if si.ast[0].op is BufferOps.STORE:
+ runner = get_runner(si.outputs[0].device, si.ast)
+ return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals])
+ out, ast = si.outputs[0], si.ast[0]
+ if ast.op is LoadOps.COPY:
+ kernel_type = BufferCopy
+ if hasattr(Device[out.device].allocator, 'transfer') and out.device.split(":")[0] == si.inputs[0].device.split(":")[0]:
+ kernel_type = BufferXfer
+ return ExecItem(kernel_type(ast.arg, out.device, si.inputs[0].device), list(si.bufs))
+ if ast.op is LoadOps.CUSTOM: return ExecItem(CustomOp(ast.arg), list(si.bufs))
+ if ast.op is LoadOps.EMPTY: return ExecItem(EmptyOp(out), list(si.bufs))
+ if ast.op is LoadOps.VIEW: return ExecItem(ViewOp(out), list(si.bufs))
+ raise RuntimeError(f"don't know how to lower {ast}")
+
+ def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, None]:
+ while len(schedule): yield lower_schedule_item(schedule.pop(0))
+
+ # **************** main run function ****************
+
+ capturing: List = [] # put classes with an add method in here
+
+ def run_schedule(schedule:List[ScheduleItem], var_vals:Optional[Dict[Variable, int]]=None, do_update_stats=True):
+ for ei in lower_schedule(schedule):
+ if len(capturing) and CAPTURING: capturing[0].add(ei)
+ ei.run(var_vals, do_update_stats=do_update_stats)
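Tensor.realize() is the normal entry point into run_schedule; a sketch of driving it by hand, again assuming create_schedule from tinygrad.engine.schedule (listed above but not shown in this diff):

from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule   # assumed API, not shown in this diff
from tinygrad.engine.realize import run_schedule

a, b = Tensor.rand(32, 32), Tensor.rand(32, 32)
out = (a @ b).sum()
schedule = create_schedule([out.lazydata])   # LoadOps items (EMPTY/COPY/...) plus BufferOps.STORE compute kernels
run_schedule(schedule)                       # lower_schedule_item picks a Runner per item, ExecItem.run executes it
assert out.lazydata.base.realized is not None   # the output buffer is now allocated and filled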