tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (74)
  1. tinygrad/__init__.py +11 -6
  2. tinygrad/codegen/kernel.py +308 -175
  3. tinygrad/codegen/linearize.py +95 -0
  4. tinygrad/codegen/lowerer.py +143 -0
  5. tinygrad/codegen/transcendental.py +257 -0
  6. tinygrad/codegen/uopgraph.py +506 -0
  7. tinygrad/device.py +72 -171
  8. tinygrad/dtype.py +122 -47
  9. tinygrad/engine/jit.py +184 -87
  10. tinygrad/{lazy.py → engine/lazy.py} +74 -66
  11. tinygrad/engine/memory.py +51 -0
  12. tinygrad/engine/realize.py +86 -61
  13. tinygrad/engine/schedule.py +366 -317
  14. tinygrad/engine/search.py +58 -47
  15. tinygrad/function.py +59 -58
  16. tinygrad/helpers.py +120 -102
  17. tinygrad/multi.py +82 -78
  18. tinygrad/nn/__init__.py +116 -67
  19. tinygrad/nn/datasets.py +12 -5
  20. tinygrad/nn/optim.py +1 -1
  21. tinygrad/nn/state.py +91 -6
  22. tinygrad/ops.py +1126 -143
  23. tinygrad/renderer/__init__.py +47 -23
  24. tinygrad/renderer/cstyle.py +338 -265
  25. tinygrad/renderer/llvmir.py +125 -143
  26. tinygrad/renderer/ptx.py +225 -0
  27. tinygrad/runtime/autogen/adreno.py +17904 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
  29. tinygrad/runtime/autogen/cuda.py +6 -162
  30. tinygrad/runtime/autogen/io_uring.py +97 -63
  31. tinygrad/runtime/autogen/kfd.py +60 -47
  32. tinygrad/runtime/autogen/kgsl.py +1386 -0
  33. tinygrad/runtime/autogen/libc.py +5462 -0
  34. tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  35. tinygrad/runtime/autogen/nvrtc.py +579 -0
  36. tinygrad/runtime/autogen/opencl.py +11 -11
  37. tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  38. tinygrad/runtime/graph/clang.py +3 -3
  39. tinygrad/runtime/graph/cuda.py +11 -15
  40. tinygrad/runtime/graph/hcq.py +120 -107
  41. tinygrad/runtime/graph/metal.py +71 -43
  42. tinygrad/runtime/ops_amd.py +244 -323
  43. tinygrad/runtime/ops_clang.py +12 -5
  44. tinygrad/runtime/ops_cloud.py +220 -0
  45. tinygrad/runtime/ops_cuda.py +42 -99
  46. tinygrad/runtime/ops_disk.py +25 -26
  47. tinygrad/runtime/ops_dsp.py +181 -0
  48. tinygrad/runtime/ops_gpu.py +29 -16
  49. tinygrad/runtime/ops_hip.py +68 -0
  50. tinygrad/runtime/ops_llvm.py +15 -10
  51. tinygrad/runtime/ops_metal.py +147 -64
  52. tinygrad/runtime/ops_nv.py +356 -397
  53. tinygrad/runtime/ops_python.py +78 -79
  54. tinygrad/runtime/ops_qcom.py +405 -0
  55. tinygrad/runtime/support/__init__.py +0 -0
  56. tinygrad/runtime/support/compiler_cuda.py +77 -0
  57. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
  58. tinygrad/runtime/support/elf.py +38 -0
  59. tinygrad/runtime/support/hcq.py +539 -0
  60. tinygrad/shape/shapetracker.py +40 -50
  61. tinygrad/shape/view.py +102 -63
  62. tinygrad/tensor.py +1109 -365
  63. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
  64. tinygrad-0.10.0.dist-info/RECORD +77 -0
  65. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
  66. tinygrad/codegen/linearizer.py +0 -528
  67. tinygrad/codegen/uops.py +0 -451
  68. tinygrad/engine/graph.py +0 -100
  69. tinygrad/renderer/assembly.py +0 -269
  70. tinygrad/shape/symbolic.py +0 -327
  71. tinygrad-0.9.1.dist-info/RECORD +0 -63
  72. /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
  73. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
  74. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
tinygrad/engine/jit.py CHANGED
@@ -1,22 +1,24 @@
 from __future__ import annotations
 from typing import TypeVar, Generic, Callable, List, Tuple, Union, Dict, cast, Optional, Any
-import functools, itertools, collections
+import functools, collections
 from tinygrad.tensor import Tensor
-from tinygrad.lazy import LazyBuffer
-from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, ContextVar, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
+from tinygrad.engine.lazy import LazyBuffer
+from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, BEAM, getenv, colored, JIT, dedup, partition
 from tinygrad.device import Buffer, Compiled, Device
 from tinygrad.dtype import DType
+from tinygrad.ops import UOp, ssimplify, Variable, sint, sym_infer
 from tinygrad.shape.shapetracker import ShapeTracker
-from tinygrad.shape.symbolic import Variable, sint
 from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner
-from tinygrad.engine.schedule import _internal_memory_planner
+from tinygrad.engine.memory import _internal_memory_planner
 from tinygrad.nn.state import get_parameters
+from dataclasses import dataclass
 from weakref import WeakKeyDictionary

-def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]) -> List[ExecItem]:
+class GraphException(Exception): pass
+
+def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], max_batch_size=0) -> List[ExecItem]:
   # Split JIT cache into batches for faster graph execution.
   # This allows the accelerator to run some batches while subsequent graphs are still being updated.
-  max_batch_size = getenv("JIT_BATCH_SIZE", 32)
   graphed_jit_cache: List[ExecItem] = []
   current_batch: List[ExecItem] = []
   current_device: Optional[Compiled] = None
@@ -30,10 +32,10 @@ def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer]
       for (j,i) in graph_runner.input_replace.keys(): graph_runner.jit_cache[j].bufs[i] = None
       graphed_jit_cache.append(ExecItem(graph_runner, cast(List[Optional[Buffer]], input_rawbuffers)))
       max_batch_size *= 2
-      if DEBUG >= 2: print(f"\tJIT GRAPHing batch with {len(current_batch)} kernels on device {current_device}")
+      if DEBUG >= 2: print(f"JIT GRAPHing batch with {len(current_batch)} kernels on device {current_device}")
     except GraphException as e:
       graphed_jit_cache.extend(current_batch)
-      if DEBUG >= 2: print(f"\tJIT GRAPHing failed batch with {len(current_batch)} kernels on device {current_device}: {e}")
+      if DEBUG >= 2: print(f"JIT GRAPHing failed batch with {len(current_batch)} kernels on device {current_device}: {e}")
     current_batch = []
     current_device = None

@@ -44,11 +46,11 @@ def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer]
     elif isinstance(ji.prg, BufferXfer) and ji.bufs[0] and ji.bufs[0].device.split(":", 1)[0] in {"CUDA", "NV", "AMD"}:
       ji_graph_dev = Device[ji.bufs[0].device]

-    graph_class = (ji_graph_dev.graph.func if isinstance(ji_graph_dev.graph, functools.partial) else ji_graph_dev.graph) if ji_graph_dev else None #type: ignore
+    graph_class = (ji_graph_dev.graph.func if isinstance(ji_graph_dev.graph, functools.partial) else ji_graph_dev.graph) if ji_graph_dev else None
     can_be_graphed = ji_graph_dev and ji_graph_dev.graph
     can_share_graph = (ji_graph_dev == current_device or (isinstance(graph_class, type) and issubclass(graph_class, MultiGraphRunner)) and
-                       type(ji_graph_dev) == type(current_device))
-    can_extend_graph_batch = can_be_graphed and len(current_batch) < max_batch_size and can_share_graph
+                       type(ji_graph_dev) is type(current_device))
+    can_extend_graph_batch = can_be_graphed and (max_batch_size == 0 or len(current_batch) < max_batch_size) and can_share_graph
     if not can_extend_graph_batch and len(current_batch) > 0: flush_batch()

     if can_be_graphed: current_batch.append(ji)
@@ -70,129 +72,224 @@ def get_input_replace(jit_cache: List[ExecItem], input_rawbuffers:List[Buffer])
 class GraphRunner(Runner): # pylint: disable=abstract-method
   def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
     self.jit_cache = jit_cache
-    self.input_replace = get_input_replace(jit_cache, input_rawbuffers)
-    self.jc_idx_with_updatable_launch_dims = []
-    self.jc_idx_with_updatable_var_vals = []
+    self.input_replace:Dict[Tuple[int, int], int] = get_input_replace(jit_cache, input_rawbuffers)
+    self.var_vals_replace:Dict[int, List[int]] = {}
+    self.launch_dims_replace:Dict[int, Tuple[Optional[int], Optional[int]]] = {}
+
     op_estimate: sint = 0
     mem_estimate: sint = 0
+    lds_estimate: sint = 0
+
+    def is_sym_dim(dim) -> bool: return not all(isinstance(d, (int, float)) for d in dim)
+
+    self.vars = sorted(var_vals.keys(), key=lambda v: v.expr)
+    self.symbolic_dims = dedup([tuple(d) for ji in jit_cache if isinstance(ji.prg, CompiledRunner) and (d:=ji.prg.p.local_size) and is_sym_dim(d)] +
+                               [tuple(d) for ji in jit_cache if isinstance(ji.prg, CompiledRunner) and (d:=ji.prg.p.global_size) and is_sym_dim(d)])
+    def find_symbolic_dim(dim): return self.symbolic_dims.index(tuple(dim)) if dim is not None and tuple(dim) in self.symbolic_dims else None
+
     for j,ji in enumerate(jit_cache):
       op_estimate += ji.prg.op_estimate
       mem_estimate += ji.prg.mem_estimate
+      lds_estimate += ji.prg.lds_estimate
       if isinstance(ji.prg, CompiledRunner):
-        if ji.prg.p.vars: self.jc_idx_with_updatable_var_vals.append(j)
-        if (ji.prg.p.global_size and not all_int(ji.prg.p.global_size)) or (ji.prg.p.local_size and not all_int(ji.prg.p.local_size)):
-          self.jc_idx_with_updatable_launch_dims.append(j)
-    self.vars = sorted(var_vals.keys(), key=lambda v: v.expr)
-    super().__init__(colored(f"<batched {len(self.jit_cache)}>", "cyan"), jit_cache[0].prg.dname.split(":")[0], op_estimate, mem_estimate)
+        if ji.prg.p.vars: self.var_vals_replace[j] = [self.vars.index(v) for v in ji.prg.p.vars]

-class MultiGraphRunner(GraphRunner): # pylint: disable=abstract-method
-  def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
-    self.w_dependency_map: Dict[Any, Any] = {}
-    self.r_dependency_map: Dict[Any, List[Any]] = collections.defaultdict(list)
-    super().__init__(jit_cache, input_rawbuffers, var_vals)
+        global_dim_idx, local_dim_idx = find_symbolic_dim(ji.prg.p.global_size), find_symbolic_dim(ji.prg.p.local_size)
+        if global_dim_idx is not None or local_dim_idx is not None: self.launch_dims_replace[j] = (global_dim_idx, local_dim_idx)
+
+    # used in MultiGraphRunner. the ints are id() of _bufs
+    self.w_dependency_map: Dict[int, Any] = {}
+    self.r_dependency_map: Dict[int, List[Any]] = collections.defaultdict(list)
+
+    super().__init__(colored(f"<batched {len(self.jit_cache)}>", "cyan"), jit_cache[0].prg.dname.split(":")[0],
+                     ssimplify(op_estimate), ssimplify(mem_estimate), ssimplify(lds_estimate))
+
+  def updated_vars(self, var_vals: Dict[Variable, int]):
+    vals = [var_vals[v] for v in self.vars]
+    for j, vidxs in self.var_vals_replace.items():
+      for i, v in enumerate(vidxs): yield j, i, vals[v]

-  def _access_resources(self, read, write, new_dependency:Any):
+  def updated_launch_dims(self, var_vals: Dict[Variable, int]):
+    dims = [tuple(sym_infer(s, var_vals) for s in dim) for dim in self.symbolic_dims]
+    for j, (gl, lc) in self.launch_dims_replace.items(): yield j, (dims[gl] if gl is not None else None), (dims[lc] if lc is not None else None)
+
+  def _access_resources(self, rawbufs:List[Buffer], write:List[int], new_dependency:Any):
     # To synchronize access to resources, we monitor the necessary prerequisites for accessing each resource,
     # whether for write or read operations. A resource can be accessed by either a single writer or multiple readers.
     wait_nodes = []

-    for rawbuf in read + write:
+    for i,rawbuf in enumerate(rawbufs):
       if id(rawbuf.base._buf) in self.w_dependency_map: wait_nodes.append(self.w_dependency_map[id(rawbuf.base._buf)])
-    for rawbuf in write:
-      if id(rawbuf.base._buf) in self.r_dependency_map: wait_nodes.extend(self.r_dependency_map.pop(id(rawbuf.base._buf)))
+      if i in write:
+        if id(rawbuf.base._buf) in self.r_dependency_map: wait_nodes.extend(self.r_dependency_map.pop(id(rawbuf.base._buf)))
+        self.w_dependency_map[id(rawbuf.base._buf)] = new_dependency
+      else: self.r_dependency_map[id(rawbuf.base._buf)].append(new_dependency)

-    for rawbuf in read: self.r_dependency_map[id(rawbuf.base._buf)].append(new_dependency)
-    for rawbuf in write: self.w_dependency_map[id(rawbuf.base._buf)] = new_dependency
     return list({id(x):x for x in wait_nodes}.values())

+# a marker for your graph supporting multiple devices of the same type
+class MultiGraphRunner(GraphRunner): pass # pylint: disable=abstract-method
+
 ReturnType = TypeVar('ReturnType')
-IN_JIT = ContextVar('IN_JIT', 0)
+@dataclass
+class CapturedJit(Generic[ReturnType]):
+  ret: Any # includes the Tensors or any other returned object
+  jit_cache: List[ExecItem]
+  input_replace: Dict[Tuple[int, int], int]
+  extra_view_inputs: List[Tuple[int, int, str, int, DType]]
+  expected_names: List[Union[int, str]]
+  expected_st_vars_dtype_device: List[Tuple[ShapeTracker, Tuple[Variable, ...], DType, str]]
+
+  def __reduce__(self):
+    return self.__class__, (self.ret, self.jit_cache, self.input_replace, self.extra_view_inputs,
+                            self.expected_names, self.expected_st_vars_dtype_device)
+
+  def __post_init__(self):
+    self._jit_cache: List[ExecItem] = self.jit_cache
+    self._input_replace: Dict[Tuple[int, int], int] = self.input_replace
+    self._graphed = False
+    self._clear_inputs()
+
+  def _clear_inputs(self):
+    for (j,i) in self._input_replace.keys(): self._jit_cache[j].bufs[i] = None
+
+  # jit exec
+  def __call__(self, input_buffers:List[Buffer], var_vals:Dict[Variable, int]) -> ReturnType:
+    # assign inputs
+    for idx, offset, device, size, dtype in self.extra_view_inputs:
+      input_buffers.append(Buffer(device, size, dtype, base=input_buffers[idx], offset=offset).ensure_allocated())
+    for (j,i),input_idx in self._input_replace.items(): self._jit_cache[j].bufs[i] = input_buffers[input_idx]
+
+    # Condense the items into a graph executor.
+    if JIT < 2 and not self._graphed:
+      self._jit_cache = apply_graph_to_jit(self.jit_cache, input_buffers, var_vals, max_batch_size=getenv("JIT_BATCH_SIZE", 32))
+      self._input_replace = get_input_replace(self._jit_cache, input_buffers)
+      self._graphed = True
+
+    if DEBUG >= 1 and len(self._jit_cache) >= 10: print(f"jit execs {len(self._jit_cache)} kernels")
+    for ei in self._jit_cache: ei.run(var_vals, jit=True)
+    self._clear_inputs()
+    return self.ret
+
+def _prepare_jit_inputs(args, kwargs):
+  input_tensors: List[Tuple[int|str, Tensor]] = [(name,t) for name,t in list(enumerate(args))+sorted(kwargs.items()) if t.__class__ is Tensor]
+  names, tensors = [name for name,_ in input_tensors], [t for _,t in input_tensors]
+  if tensors: Tensor.realize(*tensors)
+  lbs: List[LazyBuffer] = flatten([t.lazydata.lbs for t in tensors])
+  input_buffers: List[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
+  assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT"
+  st_varval_dtype_device = [(*lb.st.unbind(), lb.dtype, lb.device) for lb in lbs]
+  var_vals = merge_dicts([x[1] for x in st_varval_dtype_device] + [dict(v.unbind() for v in (args + tuple(kwargs.values())) if isinstance(v, UOp))])
+  st_vars_dtype_device = [(x[0], tuple(sorted(x[1].keys(), key=lambda v: v.expr)), x[2], x[3]) for x in st_varval_dtype_device]
+  return input_buffers, var_vals, names, st_vars_dtype_device
+
 class TinyJit(Generic[ReturnType]):
-  def __init__(self, fxn:Callable[..., ReturnType]):
+  def __init__(self, fxn:Optional[Callable[..., ReturnType]], captured:Optional[CapturedJit]=None, prune=False):
+    assert fxn or captured, "need either a function or a CapturedJit"
     self.fxn = fxn
-    self.reset()
+    self.captured: Optional[CapturedJit] = captured
+    self.cnt: int = 2 if self.fxn is None else 0
+    self.prune = prune

   def add_buffer(self, b:Buffer) -> Buffer:
-    if found:=self.buffer_replace.get(b, None): return found
+    if found:=self._buffer_replace.get(b, None): return found
     if b.is_allocated() or b.lb_refcount > 0: return b
     if b._base is not None:
-      self.buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, base=self.buffer_replace.get(b._base, b._base), offset=b.offset)
+      self._buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, base=self.add_buffer(b._base), offset=b.offset)
     else:
-      self.buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, options=b.options)
+      self._buffer_replace[b] = ret = Buffer(b.device, b.size, b.dtype, options=b.options)
     return ret

   def add(self, ei:ExecItem):
-    self.jit_cache.append(ExecItem(ei.prg, [self.add_buffer(buf) for buf in ei.bufs if buf is not None]))
+    self._jit_cache.append(ExecItem(ei.prg, [self.add_buffer(buf) for buf in ei.bufs if buf is not None]))

   def reset(self):
-    self.jit_cache: List[ExecItem] = []
-    self.input_replace: Dict[Tuple[int, int], int] = {}
-    self.extra_view_inputs: List[Tuple[int, int, str, int, DType]] = []
-    self.buffer_replace: WeakKeyDictionary[Buffer, Buffer] = WeakKeyDictionary()
-    self.cnt: int = 0
+    assert self.fxn is not None, "can't reset without function"
+    self.cnt = 0
+    self.captured = None
+
+  def __reduce__(self):
+    assert self.captured is not None, "can't pickle an uncaptured JIT"
+    return self.__class__, (None, self.captured)
+
+  # keep legacy code working
+  @property
+  def jit_cache(self) -> List[ExecItem]: return self.captured._jit_cache if self.captured is not None else []
+  @property
+  def input_replace(self) -> Dict[Tuple[int, int], int]: return self.captured._input_replace if self.captured is not None else {}

   def __get__(self, obj, objtype): return functools.partial(self.__call__, obj) # add support for instance methods

   def __call__(self, *args, **kwargs) -> ReturnType:
-    input_tensors: List[Tuple[Union[int, str], Tensor]] = \
-      [(cast(Union[int, str], name),t) for name,t in itertools.chain(enumerate(args), sorted(kwargs.items())) if t.__class__ is Tensor]
-    if input_tensors: Tensor.realize(*[t for _,t in input_tensors])
-    names: List[Union[int, str]] = [name for name,_ in input_tensors]
-    lbs: List[LazyBuffer] = flatten([t.lazydata.lbs for _,t in input_tensors])
-    st_varvals_dtype_device = [(*lb.st.unbind(), lb.dtype, lb.device) for lb in lbs]
-    input_buffers: List[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
-    assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT"
-    var_vals: Dict[Variable, int] = merge_dicts([varvals for _,varvals,_,_ in st_varvals_dtype_device] + \
-      [dict(v.unbind() for v in itertools.chain(args, kwargs.values()) if isinstance(v, Variable))])
-    st_vars_dtype_device = [(x[0], tuple(sorted(x[1].keys(), key=lambda v: v.expr)), x[2], x[3]) for x in st_varvals_dtype_device]
+    input_buffers, var_vals, names, st_vars_dtype_device = _prepare_jit_inputs(args, kwargs)
     if not JIT or self.cnt == 0:
-      if IN_JIT: raise RuntimeError("having TinyJit inside another TinyJit is not supported")
       # jit ignore
-      with Context(BEAM=0 if getenv("IGNORE_JIT_FIRST_BEAM") else BEAM.value, IN_JIT=1):
-        self.ret = self.fxn(*args, **kwargs)
-        if len(params:=get_parameters(self.ret)): Tensor.realize(params[0], *params[1:])
+      assert self.fxn is not None
+      with Context(BEAM=0 if getenv("IGNORE_JIT_FIRST_BEAM") else BEAM.value):
+        ret = self.fxn(*args, **kwargs)
+        if len(params:=get_parameters(ret)): Tensor.realize(params[0], *params[1:])
     elif self.cnt == 1:
       # jit capture
-      self.expected_names: List[Union[int, str]] = names
-      self.expected_st_vars_dtype_device: List[Tuple[ShapeTracker, Tuple[Variable, ...], DType, str]] = st_vars_dtype_device
-      with Context(GRAPH=getenv("JITGRAPH", GRAPH.value), BEAM=getenv("JITBEAM", BEAM.value)):
+      assert self.fxn is not None
+      if capturing: raise RuntimeError(f"having TinyJit inside another TinyJit is not supported {len(capturing)=} {capturing=}")
+      self._jit_cache: List[ExecItem] = []
+      self._buffer_replace: WeakKeyDictionary[Buffer, Buffer] = WeakKeyDictionary()
+      # TODO: should we always disable the memory planner here? it must be off for prune
+      with Context(BEAM=getenv("JITBEAM", BEAM.value), NO_MEMORY_PLANNER=int(self.prune)):
         capturing.append(self)
-        self.ret = self.fxn(*args, **kwargs)
-        if len(params:=get_parameters(self.ret)): Tensor.realize(params[0], *params[1:])
-        capturing.clear()
-      del self.buffer_replace
-      assert len(self.jit_cache), "didn't JIT anything!"
-      if DEBUG >= 1: print(f"JIT captured {len(self.jit_cache)} kernels with {len(input_buffers)} inputs")
+        try:
+          ret = self.fxn(*args, **kwargs)
+          if len(params:=get_parameters(ret)): Tensor.realize(params[0], *params[1:])
+        except Exception as e: raise e
+        finally: capturing.clear()
+      jit_cache = self._jit_cache
+      del self._buffer_replace, self._jit_cache
+      assert len(jit_cache), "didn't JIT anything!"
+      if DEBUG >= 1: print(f"JIT captured {len(jit_cache)} kernels with {len(input_buffers)} inputs")

       # track inputs that are views of buffers
-      for item in self.jit_cache:
+      # TODO: eventually expected_buffers should live in ExecItem
+      extra_view_inputs: List[Tuple[int, int, str, int, DType]] = []
+      for item in jit_cache:
         for b in item.bufs:
           if b is not None and b._base is not None and b._base in input_buffers:
             input_buffers.append(b)
-            self.extra_view_inputs.append((input_buffers.index(b.base), b.offset, b.device, b.size, b.dtype))
+            extra_view_inputs.append((input_buffers.index(b.base), b.offset, b.device, b.size, b.dtype))
+
+      # prune independent kernels (optional)
+      if self.prune:
+        depends = set(input_buffers)
+        for ei in jit_cache:
+          if any(b in depends for b in ei.bufs):
+            if isinstance(ei.prg, CompiledRunner):
+              depends.update(cast(Buffer, ei.bufs[out]) for out in ei.prg.p.outs)
+        pruned, onetime = partition(jit_cache,
+                                    lambda ei: not isinstance(ei.prg, CompiledRunner) or any(ei.bufs[out] in depends for out in ei.prg.p.outs))
+        if DEBUG >= 1: print(f"pruned from {len(jit_cache)} -> {len(pruned)} kernels")
+        # run the onetime kernels here
+        for ei in onetime:
+          for b in ei.bufs: cast(Buffer, b).ensure_allocated()
+          ei.run(var_vals, jit=True)
+        jit_cache = pruned

       # memory planning (optional)
-      assigned = _internal_memory_planner([cast(List[Buffer], item.bufs) for item in self.jit_cache], debug_prefix="JIT ")
-      self.jit_cache = [ExecItem(item.prg, [assigned.get(b,b).ensure_allocated() for b in item.bufs if b is not None]) for item in self.jit_cache]
+      # Exclude buffers involved in transfer ops to preserve parallelism.
+      noopt_buffers = {b for ji in jit_cache if isinstance(ji.prg, BufferXfer) for b in ji.bufs}
+      assigned = _internal_memory_planner([cast(List[Buffer], item.bufs) for item in jit_cache], noopt_buffers, debug_prefix="JIT ")
+      jit_cache = [ExecItem(item.prg, [assigned.get(b,b).ensure_allocated() for b in item.bufs if b is not None]) for item in jit_cache]

-      # Condense the items into a graph executor.
-      if JIT < 2: self.jit_cache = apply_graph_to_jit(self.jit_cache, input_buffers, var_vals)
+      input_replace = get_input_replace(jit_cache, input_buffers)
+      if DEBUG >= 1 and len(set(input_replace.values())) != len(input_buffers): print("WARNING: some input tensors not found")

-      self.input_replace = get_input_replace(self.jit_cache, input_buffers)
-      if DEBUG >= 1 and len(set(self.input_replace.values())) != len(input_buffers): print("WARNING: some input tensors not found")
+      # set this for next run
+      self.captured = CapturedJit(ret, jit_cache, input_replace, extra_view_inputs, names, st_vars_dtype_device)
     elif self.cnt >= 2:
       # jit exec
-      assert self.expected_names == names, f"args mismatch in JIT: {self.expected_names=} != {names}"
-      assert self.expected_st_vars_dtype_device == st_vars_dtype_device, \
-        f"args mismatch in JIT: {self.expected_st_vars_dtype_device=} != {st_vars_dtype_device=}"
-      for idx, offset, device, size, dtype in self.extra_view_inputs:
-        input_buffers.append(Buffer(device, size, dtype, base=input_buffers[idx], offset=offset).ensure_allocated())
-      for (j,i),input_idx in self.input_replace.items(): self.jit_cache[j].bufs[i] = input_buffers[input_idx]
-      if DEBUG >= 1 and len(self.jit_cache) >= 10: print(f"jit execs {len(self.jit_cache)} kernels")
-      for ei in self.jit_cache: ei.run(var_vals, jit=True)
-
-      # clear jit inputs
-      for (j,i) in self.input_replace.keys(): self.jit_cache[j].bufs[i] = None
+      assert self.captured is not None
+      assert self.captured.expected_names == names, f"args mismatch in JIT: {self.captured.expected_names=} != {names}"
+      assert self.captured.expected_st_vars_dtype_device == st_vars_dtype_device, \
+        f"args mismatch in JIT: {self.captured.expected_st_vars_dtype_device=} != {st_vars_dtype_device=}"
+      ret = self.captured(input_buffers, var_vals)

     self.cnt += 1
-    return self.ret
+    return ret
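The jit.py changes above move TinyJit's captured state into the new CapturedJit dataclass and give both classes a __reduce__, so a JIT that has already captured its kernels can be pickled and restored. A minimal sketch of that workflow (not part of the diff), assuming the public tinygrad Tensor/TinyJit API and a device whose buffers support pickling:

  import pickle
  from tinygrad import Tensor, TinyJit

  @TinyJit
  def double(x: Tensor) -> Tensor:
    return (x * 2).realize()

  # call 1 runs eagerly, call 2 captures the kernels, call 3 replays the capture
  for _ in range(3): double(Tensor.rand(4))

  blob = pickle.dumps(double)    # only valid once captured is set ("can't pickle an uncaptured JIT")
  restored = pickle.loads(blob)  # rebuilt as TinyJit(None, captured); cnt starts at 2, so it replays immediately
  print(restored(Tensor.rand(4)).numpy())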
tinygrad/{lazy.py → engine/lazy.py} RENAMED
@@ -1,44 +1,44 @@
 from __future__ import annotations
-import math
-from typing import Union, Optional, Any, Tuple, List
-from tinygrad.dtype import dtypes, DType, ConstType
-from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG
-from tinygrad.ops import LoadOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, Op, exec_alu, python_alu
-from tinygrad.shape.symbolic import sint, Variable
+from typing import Optional, Any, Tuple, List, get_args
+from tinygrad.dtype import dtypes, DType, ConstType, to_dtype, ImageDType
+from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG, _METADATA, Metadata, SPLIT_REDUCEOP, LAZYCACHE
+from tinygrad.ops import exec_alu, python_alu
+from tinygrad.ops import identity_element, MathTrait, resolve, UOp, sint, GroupOp, Ops
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.device import Buffer
 from weakref import ref, ReferenceType, WeakValueDictionary

 lazycache: WeakValueDictionary[Any, LazyBuffer] = WeakValueDictionary()
-def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
-                      base:Optional[LazyBuffer]=None, enable_cache=bool(getenv("LAZYCACHE", 1))):
-  if st.size == 0: op, arg, srcs, base = LoadOps.CONST, 0, (), None
-  if op is LoadOps.CONST: arg, enable_cache = dtypes.as_const(arg, dtype) if not isinstance(arg, Variable) else arg, True
+def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Ops]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
+                      base:Optional[LazyBuffer]=None, enable_cache=bool(LAZYCACHE)):
+  if st.size == 0: op, arg, srcs, base = Ops.CONST, 0, (), None
+  dtype = to_dtype(dtype)
+  if op is Ops.CONST: arg, enable_cache = dtypes.as_const(arg, dtype) if not isinstance(arg, UOp) else arg, True

   cache_key = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs)) if base is None else (st, ref(base))
-  if enable_cache and (rret := lazycache.get(cache_key, None)): return rret
+  if enable_cache and (rret := lazycache.get(cache_key, None)) is not None: return rret

-  ret = LazyBuffer(device, st, dtype, op, arg, srcs, base=base)
+  ret = LazyBuffer(device, st, dtype, op, arg, srcs, base=base, metadata=_METADATA.get())
   if enable_cache: lazycache[cache_key] = ret
   return ret

-view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "DISK"}
-class LazyBuffer:
+view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "METAL", "QCOM", "DSP", "DISK"}
+class LazyBuffer(MathTrait):
   def __init__(self, device:str, st:ShapeTracker, dtype:DType,
-               op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
-               base:Optional[LazyBuffer]=None):
-    self.device, self.st, self.dtype, self.shape, self.size = device, st, dtype, st.shape, st.size
+               op:Optional[Ops]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
+               base:Optional[LazyBuffer]=None, metadata:Optional[Metadata]=None):
+    self.device, self.st, self.dtype, self.shape, self.size, self.metadata = device, st, to_dtype(dtype), st.shape, st.size, metadata
     self._base: Optional[LazyBuffer] = None
     if base is None:
       # properties on base
-      self.op, self.arg, self.srcs = op, arg, srcs # this is a LazyOp, except the src is LazyBuffers and not LazyOps
-      assert self.op is not LoadOps.ASSIGN or srcs[1].base.realized is not None, "assign target must be realized"
+      self.op, self.arg, self.srcs = op, arg, srcs # this is a UOp, except the src is LazyBuffers and not UOps
+      assert self.op is not Ops.ASSIGN or srcs[0].base.realized is not None, "assign target must be realized"

-      if self.op is LoadOps.VIEW:
+      if self.op is Ops.BUFFER_VIEW:
         # some LazyBuffers can be processed with only a view, no AST required
-        self.buffer: Buffer = srcs[0].base.buffer.view(st.size, dtype, srcs[0].st.views[0].offset * srcs[0].dtype.itemsize)
+        self.buffer: Buffer = srcs[0].base.buffer.view(st.size, self.dtype, srcs[0].st.views[0].offset * srcs[0].dtype.itemsize)
       else:
-        self.buffer = srcs[1].base.buffer if self.op is LoadOps.ASSIGN else Buffer(device, self.size, dtype)
+        self.buffer = srcs[0].base.buffer if self.op is Ops.ASSIGN else Buffer(device, self.size, self.dtype)
       self.buffer.ref(1)
       self.contiguous_child: Optional[Tuple[ReferenceType[LazyBuffer], ShapeTracker]] = None
       self.forced_realize = False
@@ -51,7 +51,7 @@ class LazyBuffer:
     if hasattr(self, 'buffer'): self.buffer.ref(-1)

   def __repr__(self) -> str:
-    return f"<LB {self.device} {self.shape} {str(self.dtype)[7:]} {self.st if self.base != self else (self.op, self.realized)}>"
+    return f"<LB {self.device} {self.shape} {str(self.dtype)[7:]} {self.st if self.base is not self else (self.op, self.realized)}>"

   @property
   def realized(self) -> Optional[Buffer]:
@@ -67,36 +67,42 @@ class LazyBuffer:
   def lbs(self) -> List[LazyBuffer]: return [self]

   @staticmethod
-  def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Tuple[LazyBuffer, ...]=(), enable_cache=False) -> LazyBuffer:
+  def metaop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Tuple[LazyBuffer, ...]=(), enable_cache=False) -> LazyBuffer:
     assert isinstance(src, tuple)
     return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, src, enable_cache=enable_cache)

-  def const(self, val:ConstType, shape:Optional[Tuple[sint,...]]=None) -> LazyBuffer:
-    assert isinstance(val, (int,float,bool)), f"{val=} has {type(val)=}, not a ConstType"
-    shape = self.shape if shape is None else shape
-    return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(shape)).expand(shape)
+  def const_like(self, b): return self.const_with_shape(b, self.shape)
+  def const_with_shape(self, val:ConstType, shape:Tuple[sint,...]) -> LazyBuffer:
+    assert isinstance(val, get_args(ConstType)), f"{val=} has {type(val)=}, not a ConstType"
+    return LazyBuffer.metaop(Ops.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(shape)).expand(shape)

+  @property
   def is_realized(self) -> bool: return self.base.realized is not None

   def assign(self, x:LazyBuffer) -> LazyBuffer:
     assert x.size == self.size, f"assign target must have same size {self.size=} != {x.size=}"
-    return LazyBuffer.loadop(LoadOps.ASSIGN, self.shape, self.dtype, self.device, arg=() if self.st.contiguous else (self.st,), src=(x, self.base))
+    assert self.is_realized, f"assign target must be realized {self}"
+    return LazyBuffer.metaop(Ops.ASSIGN, self.shape, self.dtype, self.device, arg=() if self.st.contiguous else (self.st,),
+                             src=(self.base, x), enable_cache=True)

-  def can_view(self): return self.st.consecutive and not self.is_unrealized_const() and self.device.split(":")[0] in view_supported_devices
+  def can_view(self):
+    return (self.st.consecutive and not self.is_unrealized_const() and not isinstance(self.dtype, ImageDType) and
+            self.device.split(":")[0] in view_supported_devices)

   def contiguous(self, allow_buffer_view=True):
     if not self.st.contiguous or self.size != self.base.size or self.is_unrealized_const():
-      ret = self.e(LoadOps.VIEW) if allow_buffer_view and self.can_view() else self.e(LoadOps.CONTIGUOUS)
+      ret = self.alu(Ops.BUFFER_VIEW) if allow_buffer_view and self.can_view() else self.alu(Ops.CONTIGUOUS)
       if (sti := self.st.invert(self.base.shape)) is not None: self.base.contiguous_child = ref(ret), sti
       return ret
     self.base.forced_realize = True
     return self

-  def cast(self, dtype:DType, bitcast:bool=False):
+  def bitcast(self, dtype:DType) -> LazyBuffer: return self.cast(dtype, bitcast=True)
+  def cast(self, dtype:DType, bitcast:bool=False, allow_buffer_view=True) -> LazyBuffer:
     if self.dtype == dtype: return self
     if self.device.startswith("DISK") and not bitcast: raise RuntimeError("attempted to cast disk buffer (bitcast only)")
     if self.is_unrealized_unmasked_const() and not bitcast:
-      return create_lazybuffer(self.device, self.st, dtype, LoadOps.CONST, dtypes.as_const(self.base.arg, dtype))
+      return create_lazybuffer(self.device, self.st, dtype, Ops.CONST, dtypes.as_const(self.base.arg, dtype))
     new_shape = self.shape
     if bitcast and self.dtype.itemsize != dtype.itemsize:
       if not self.device.startswith("DISK"): raise RuntimeError("shape changing bitcast only supported on DISK right now")
@@ -104,29 +110,30 @@ class LazyBuffer:
       # https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
       if not (new_shape[-1]*self.dtype.itemsize) % dtype.itemsize == 0: raise RuntimeError("unsupported size in bitcast")
       new_shape = new_shape[:-1] + ((new_shape[-1]*self.dtype.itemsize) // dtype.itemsize,)
-    elif getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+    elif getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self is not self.base:
       # TODO: applying this makes gpt2 slower
       return self.base.cast(dtype, bitcast)._view(self.st)
-    cast_op: Union[LoadOps, UnaryOps] = (LoadOps.VIEW if self.can_view() else UnaryOps.BITCAST) if bitcast else UnaryOps.CAST
+    cast_op: Ops = (Ops.BUFFER_VIEW if self.can_view() and allow_buffer_view else Ops.BITCAST) if bitcast else Ops.CAST
     return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), dtype, cast_op, dtype, (self,))

-  def is_unrealized_const(self): return self.base.realized is None and self.base.op is LoadOps.CONST and not isinstance(self.base.arg, Variable)
+  def is_unrealized_const(self): return self.base.realized is None and self.base.op is Ops.CONST and not isinstance(self.base.arg, UOp)
   def is_unrealized_unmasked_const(self): return self.is_unrealized_const() and all(v.mask is None for v in self.st.views)

   def _copy(self, device:str) -> LazyBuffer:
-    return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, self.buffer.nbytes, (self,), enable_cache=False)
+    assert self.st.contiguous and self.size == self.base.size, f"can only copy contig {self} {self.base}"
+    return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, Ops.COPY, self.buffer.nbytes, (self,), enable_cache=False)

-  def copy_to_device(self, device:str, force: bool = False) -> LazyBuffer:
+  def copy_to_device(self, device:str, force:bool=False, clone:bool=False) -> LazyBuffer:
     # no COPY
-    if self.device == device: return self
+    if self.device == device and not clone: return self

     # double COPY = one COPY
-    if not force and self.st.contiguous and self.size == self.base.size and not self.base.realized and self.base.op is LoadOps.COPY:
+    if not force and self.st.contiguous and self.size == self.base.size and not self.base.realized and self.base.op is Ops.COPY:
       return self.base.srcs[0].copy_to_device(device).reshape(self.st.shape)

     # const doesn't have to be copied (issues with disk tensor)
     if self.is_unrealized_const():
-      return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, device, arg=self.base.arg)._view(self.st)
+      return LazyBuffer.metaop(Ops.CONST, tuple(), self.dtype, device, arg=self.base.arg)._view(self.st)

     # if it's a shrink, do the shrink before the copy with CONTIGUOUS
     if prod(self.st.shape) < prod(self.base.st.shape): return self.contiguous()._copy(device)
@@ -134,58 +141,59 @@ class LazyBuffer:
     # copy the base and apply the shapetracker on the new device
     return self.base._copy(device)._view(self.st)

-  def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
+  def clone(self) -> LazyBuffer: return self.copy_to_device(self.device, clone=True)
+
+  def alu(self, op:Ops, *in_srcs:LazyBuffer) -> LazyBuffer:
     srcs: List[LazyBuffer] = []
     for s in (self,)+in_srcs:
       if s == s.base and s.base.contiguous_child and (root:=s.base.contiguous_child[0]()) is not None:
         srcs.append(root._view(s.base.contiguous_child[1]))
       else:
         srcs.append(s)
-    assert all_same(dts:=[x.dtype.scalar() for x in (srcs[1:] if op is TernaryOps.WHERE else srcs)]), f"all dtypes must match {dts} on {op}"
+    if not all_same(dts:=[x.dtype.base for x in (srcs[1:] if op is Ops.WHERE else srcs)]):
+      raise AssertionError(f"all dtypes must match {dts} on {op}")
     assert all_same([x.shape for x in srcs]), f"all shapes must be the same {[x.shape for x in srcs]}"
-    if op is TernaryOps.WHERE: assert srcs[0].dtype == dtypes.bool, "TernaryOps.WHERE must have the first arg be bool"
-    if op is UnaryOps.NEG: assert srcs[0].dtype != dtypes.bool, "UnaryOps.NEG does not accept dtype bool"
+    if op is Ops.WHERE: assert srcs[0].dtype == dtypes.bool, "Ops.WHERE must have the first arg be bool"

-    out_dtype = dtypes.bool if op in (BinaryOps.CMPLT, BinaryOps.CMPNE) else srcs[-1].dtype
+    out_dtype = dtypes.bool if op in (Ops.CMPLT, Ops.CMPNE) else srcs[-1].dtype

     # const folding
     if op in python_alu and all(s.is_unrealized_unmasked_const() for s in srcs):
-      return self.cast(out_dtype).const(exec_alu(op, out_dtype, [s.base.arg for s in srcs]))
-    if op is UnaryOps.NEG and self.base.op is UnaryOps.NEG and self.base.realized is None: return self.base.srcs[0]
-    if op in BinaryOps:
+      return self.cast(out_dtype).const_like(exec_alu(op, out_dtype, [s.base.arg for s in srcs]))
+    if op in GroupOp.Binary:
       x, y = self, in_srcs[0]
-      if op is BinaryOps.ADD:
+      if op is Ops.ADD:
         if y.is_unrealized_unmasked_const() and y.base.arg == 0: return x
         if x.is_unrealized_unmasked_const() and x.base.arg == 0: return y
-      if op is BinaryOps.MUL:
-        if x.is_unrealized_unmasked_const() and (val := x.base.arg) in (1, 0, -1):
-          return y if val == 1 else y.const(0) if val == 0 else y.e(UnaryOps.NEG)
-        if y.is_unrealized_unmasked_const() and (val := y.base.arg) in (1, 0, -1):
-          return x if val == 1 else x.const(0) if val == 0 else x.e(UnaryOps.NEG)
+      if op is Ops.MUL:
+        if x.is_unrealized_unmasked_const() and (val := x.base.arg) in (1, 0): return y if val == 1 else y.const_like(0)
+        if y.is_unrealized_unmasked_const() and (val := y.base.arg) in (1, 0): return x if val == 1 else x.const_like(0)
+      if op is Ops.IDIV and y.is_unrealized_unmasked_const() and y.base.arg == 1: return x

-    return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, arg, tuple(srcs))
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, None, tuple(srcs))

   # *** reduce ops ***

-  def _reduce_op(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+  def _reduce_op(self, op:Ops, axis:Tuple[int, ...]) -> LazyBuffer:
     assert all(0 <= x < len(self.shape) for x in axis), f"axis args {axis} out of range for shape {self.shape}"
-    axis = tuple(sorted([x for x in axis if self.shape[x] != 1]))
+    axis = tuple(sorted([x for x in axis if resolve(self.shape[x] != 1)]))
     if len(axis) == 0: return self
-    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
-    return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, axis, (self,))
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(self.st.reduce(axis)), self.dtype, Ops.REDUCE_AXIS, (op, axis), (self,))

-  def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
-    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+  def r(self, op:Ops, axis:Tuple[int, ...]) -> LazyBuffer:
+    new_shape = self.st.reduce(axis)
     # TODO: this logic should move to the scheduler
-    if self.size == 0 and 0 not in new_shape: return self.const({ReduceOps.SUM: 0.0, ReduceOps.MAX: -math.inf}[op], new_shape)
+    if 0 in self.shape and 0 not in new_shape: return self.const_with_shape(identity_element(op, self.dtype), new_shape)

     # const folding
     # TODO: fold this for symbolic?
     if self.is_unrealized_unmasked_const() and all_int(self.shape):
-      return self.const(self.base.arg * {ReduceOps.SUM: prod(self.shape[i] for i in axis), ReduceOps.MAX: 1}[op], new_shape)
+      if op is Ops.ADD: return self.const_with_shape(self.base.arg * prod(self.shape[i] for i in axis), new_shape)
+      if op is Ops.MUL: return self.const_with_shape(self.base.arg ** prod(self.shape[i] for i in axis), new_shape)
+      if op is Ops.MAX: return self.const_with_shape(self.base.arg, new_shape)

     # TODO: can we split symbolic shape if the reduce axis is not symbolic?
-    if not getenv("SPLIT_REDUCEOP", 1) or not all_int(self.shape) or (0 in self.shape) or \
+    if not SPLIT_REDUCEOP or not all_int(self.shape) or (0 in self.shape) or \
       prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768):
       return self._reduce_op(op, axis)

@@ -208,7 +216,7 @@ class LazyBuffer:

   def _view(self, new_st:ShapeTracker) -> LazyBuffer:
     if self.st.size == 0 or (new_st.views[-1].mask is not None and any((x[1]-x[0]) == 0 for x in new_st.views[-1].mask)):
-      return self.const(0, new_st.shape)
+      return self.const_with_shape(0, new_st.shape)
     if new_st.contiguous and self.base.shape == new_st.shape: return self.base
     return create_lazybuffer(self.device, new_st, self.dtype, base=self.base)

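Taken together with the file list, the lazy.py diff reflects this release's consolidation of internals: tinygrad/lazy.py now lives at tinygrad/engine/lazy.py, tinygrad/shape/symbolic.py is gone, and the separate LoadOps/UnaryOps/BinaryOps/TernaryOps/ReduceOps enums are folded into the single Ops enum, with GroupOp for grouping and UOp/Variable/sint exported from tinygrad.ops. A rough migration sketch for hypothetical out-of-tree code that imported these internals; the symbols shown are only the ones visible in the diffs above, and such private imports may keep changing:

  # tinygrad 0.9.1 internal imports
  # from tinygrad.lazy import LazyBuffer
  # from tinygrad.ops import LoadOps, BinaryOps
  # from tinygrad.shape.symbolic import Variable, sint

  # tinygrad 0.10.0: same names, new homes, one Ops enum
  from tinygrad.engine.lazy import LazyBuffer
  from tinygrad.ops import Ops, GroupOp, UOp, Variable, sint

  def is_binary_add(op: Ops) -> bool:
    # what used to be spelled `op is BinaryOps.ADD` / `op in BinaryOps` now uses Ops and GroupOp
    return op is Ops.ADD and op in GroupOp.Binary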