PyPI - tinygrad - Versions diffs - 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

tinygrad/__init__.py +11 -6
tinygrad/codegen/kernel.py +308 -175
tinygrad/codegen/linearize.py +95 -0
tinygrad/codegen/lowerer.py +143 -0
tinygrad/codegen/transcendental.py +257 -0
tinygrad/codegen/uopgraph.py +506 -0
tinygrad/device.py +72 -171
tinygrad/dtype.py +122 -47
tinygrad/engine/jit.py +184 -87
tinygrad/{lazy.py → engine/lazy.py} +74 -66
tinygrad/engine/memory.py +51 -0
tinygrad/engine/realize.py +86 -61
tinygrad/engine/schedule.py +366 -317
tinygrad/engine/search.py +58 -47
tinygrad/function.py +59 -58
tinygrad/helpers.py +120 -102
tinygrad/multi.py +82 -78
tinygrad/nn/__init__.py +116 -67
tinygrad/nn/datasets.py +12 -5
tinygrad/nn/optim.py +1 -1
tinygrad/nn/state.py +91 -6
tinygrad/ops.py +1126 -143
tinygrad/renderer/__init__.py +47 -23
tinygrad/renderer/cstyle.py +338 -265
tinygrad/renderer/llvmir.py +125 -143
tinygrad/renderer/ptx.py +225 -0
tinygrad/runtime/autogen/adreno.py +17904 -0
tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
tinygrad/runtime/autogen/cuda.py +6 -162
tinygrad/runtime/autogen/io_uring.py +97 -63
tinygrad/runtime/autogen/kfd.py +60 -47
tinygrad/runtime/autogen/kgsl.py +1386 -0
tinygrad/runtime/autogen/libc.py +5462 -0
tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
tinygrad/runtime/autogen/nvrtc.py +579 -0
tinygrad/runtime/autogen/opencl.py +11 -11
tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
tinygrad/runtime/graph/clang.py +3 -3
tinygrad/runtime/graph/cuda.py +11 -15
tinygrad/runtime/graph/hcq.py +120 -107
tinygrad/runtime/graph/metal.py +71 -43
tinygrad/runtime/ops_amd.py +244 -323
tinygrad/runtime/ops_clang.py +12 -5
tinygrad/runtime/ops_cloud.py +220 -0
tinygrad/runtime/ops_cuda.py +42 -99
tinygrad/runtime/ops_disk.py +25 -26
tinygrad/runtime/ops_dsp.py +181 -0
tinygrad/runtime/ops_gpu.py +29 -16
tinygrad/runtime/ops_hip.py +68 -0
tinygrad/runtime/ops_llvm.py +15 -10
tinygrad/runtime/ops_metal.py +147 -64
tinygrad/runtime/ops_nv.py +356 -397
tinygrad/runtime/ops_python.py +78 -79
tinygrad/runtime/ops_qcom.py +405 -0
tinygrad/runtime/support/__init__.py +0 -0
tinygrad/runtime/support/compiler_cuda.py +77 -0
tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
tinygrad/runtime/support/elf.py +38 -0
tinygrad/runtime/support/hcq.py +539 -0
tinygrad/shape/shapetracker.py +40 -50
tinygrad/shape/view.py +102 -63
tinygrad/tensor.py +1109 -365
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
tinygrad-0.10.0.dist-info/RECORD +77 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
tinygrad/codegen/linearizer.py +0 -528
tinygrad/codegen/uops.py +0 -451
tinygrad/engine/graph.py +0 -100
tinygrad/renderer/assembly.py +0 -269
tinygrad/shape/symbolic.py +0 -327
tinygrad-0.9.1.dist-info/RECORD +0 -63
tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0

tinygrad/engine/memory.py ADDED Viewed

@@ -0,0 +1,51 @@
+from typing import List, Union, Tuple, Dict
+from collections import defaultdict
+from tinygrad.engine.schedule import ScheduleItem
+from tinygrad.device import Device, Buffer
+from tinygrad.helpers import NO_MEMORY_PLANNER, dedup, DEBUG
+from tinygrad.ops import Ops
+# **************** memory planning ****************
+def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], noopt_buffers=None, debug_prefix="") -> Dict[Buffer, Buffer]:
+  if NO_MEMORY_PLANNER: return {}
+  first_appearance, last_appearance = {}, {}
+  for i,u in enumerate(buffers):
+    for buf in u:
+      if buf.is_allocated() or buf.lb_refcount > 0 or (noopt_buffers is not None and buf.base in noopt_buffers): continue
+      if buf.base not in first_appearance: first_appearance[buf.base] = i
+      last_appearance[buf.base] = i
+  # Sort buffers by size in descending order, prioritizing largest buffers for allocation first.
+  # Track free segments, each containing (start, stop, and buffer that could be reused on this segment).
+  free_segs: Dict[Tuple, List[Tuple[int, int, Buffer]]] = defaultdict(list) # Dict[buffer key, Tuple[start, end, buffer to reuse on the seg]]
+  def find_replace_buffer(buf, st, en):
+    key = (buf.device, buf.dtype, buf.options) + ((buf.nbytes,) if not hasattr(Device[buf.device].allocator, "offset") else tuple())
+    default_buf = (0, len(buffers) - 1, buf) # will return the buffer itself if the replace one is not found.
+    seg_st, seg_en, seg_buf = next((free_segs[key].pop(i) for i,(sst,sen,_) in enumerate(free_segs[key]) if sst <= st and en <= sen), default_buf)
+    free_segs[key] += [(seg_st, st - 1, seg_buf)] if st - 1 >= seg_st else []
+    free_segs[key] += [(en + 1, seg_en, seg_buf)] if seg_en >= en + 1 else []
+    return seg_buf if seg_buf.nbytes == buf.nbytes else Buffer(buf.device, buf.size, buf.dtype, base=seg_buf)
+  buffer_requests = sorted([(first_appearance[buf], last_appearance[buf], buf) for buf in first_appearance.keys()], key=lambda x: -x[2].nbytes)
+  assigned = {buf:find_replace_buffer(buf, st, en) for st, en, buf in buffer_requests}
+  for i,u in enumerate(buffers):
+    for buf in u:
+      if buf.is_allocated() or buf.lb_refcount > 0 or (noopt_buffers is not None and buf.base in noopt_buffers): continue
+      if buf._base is not None: assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf.base, buf.base).base, offset=buf.offset)
+      else: assigned[buf] = assigned.get(buf, buf)
+  if DEBUG >= 1 and len(ak:=dedup(x for x in assigned.keys() if x._base is None)) != len(av:=dedup(x for x in assigned.values() if x._base is None)):
+    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
+          f"{len(ak)} -> {len(av)} bufs")
+  return assigned
+def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
+  # Exclude buffers involved in load ops (e.g transfers) to preserve parallelism in graphs.
+  assigned = _internal_memory_planner([si.bufs for si in schedule],
+                                      noopt_buffers={b for si in schedule if si.ast.op is not Ops.SINK for b in si.bufs})
+  return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs), si.metadata, si.assign_preloads) for si in schedule]

tinygrad/engine/realize.py CHANGED Viewed

@@ -1,42 +1,62 @@
 from typing import List, Dict, Optional, cast, Generator, Tuple
-import time
+import time, pprint
 from dataclasses import dataclass, replace
-from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING
-from tinygrad.ops import BufferOps, LoadOps, LazyOp
+from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, Context, TRACEMETA
+from tinygrad.ops import Ops, UOp, Variable, sym_infer, sint
+from tinygrad.dtype import dtypes
 from tinygrad.device import Device, Buffer
-from tinygrad.shape.symbolic import Variable, sym_infer, sint
 from tinygrad.renderer import Renderer, Program
-from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.codegen.kernel import Kernel
 from tinygrad.engine.schedule import ScheduleItem
 # **************** Program Creation ****************
 logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
-def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
-  if DEBUG >= 3:
-    from tinygrad.engine.graph import print_tree
-    for op in ast: print_tree(op)
-  k = Linearizer(*ast, opts=renderer)
-  k.required_optimizations()
+def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
+  if DEBUG >= 5:
+    print(ast)
+  k = Kernel(ast, opts=renderer).required_optimizations()
   if not NOOPT:
     if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
     if BEAM >= 1:
       from tinygrad.engine.search import beam_search, time_linearizer, bufs_from_lin
-      kb, k_opt = Linearizer(*ast, opts=renderer), k
-      kb.required_optimizations()
+      kb, k_opt = Kernel(ast, opts=renderer).required_optimizations(), k
       rawbufs = bufs_from_lin(kb, allocate=False)
-      k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
-      if getenv("BEAM_COMPARE", 1):
+      if BEAM.value >= 100:
+        from extra.mcts_search import mcts_search
+        k = mcts_search(kb, rawbufs, BEAM.value)
+      else:
+        k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
+      if beam_compare:=getenv("BEAM_COMPARE", 1):
         # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
-        lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
-        if used_tensor_cores:
-          lins.append(("hc", Linearizer(*ast, opts=renderer)))
-          lins[-1][1].hand_coded_optimizations()
+        lins: List[Tuple[str, Kernel]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
+        if used_tensor_cores: lins.append(("hc", Kernel(ast, opts=renderer).hand_coded_optimizations()))
         timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-        if DEBUG >= 1: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
+        if DEBUG >= 3: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
         k = timed[0][1]
         if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
-  # TODO: check the correctness inline once compare_linearizer is in core
+        if beam_compare == 2:
+          from tinygrad import Tensor
+          all_outs: List[List[Tensor]] = []
+          with Context(DEBUG=0, BEAM=0, CAPTURING=0):
+            rand_bufs = [Tensor.normal(buf.size, std=0.1, dtype=buf.dtype).data() if dtypes.is_float(buf.dtype) else \
+                        (Tensor.randint(buf.size, low=0, high=2).cast(buf.dtype).data() if buf.dtype == dtypes.bool else \
+                         Tensor.randint(buf.size, low=dtypes.min(buf.dtype), high=dtypes.max(buf.dtype), dtype=buf.dtype).data()) \
+                         for buf in rawbufs]
+          for _, tk in lins[::-1]:
+            for buf,data in zip(rawbufs, rand_bufs): buf.ensure_allocated().copyin(data)
+            time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True, disable_cache=True)
+            all_outs.append([Tensor(bytes(buf.as_buffer()), dtype=buf.dtype) for buf in rawbufs[:len(ast.src)]])
+          with Context(DEBUG=0, BEAM=0, CAPTURING=0):
+            for bufs in zip(*all_outs):
+              for b in bufs[1:]:
+                if dtypes.is_float(bufs[0].dtype):
+                  # we check both atol and rtol here
+                  diff_count = (((b-bufs[0]).abs() > 1e-3) * (((b-bufs[0])/bufs[0]).abs() > 1e-3)).sum().item()
+                else:
+                  diff_count = (b != bufs[0]).sum().item()
+                if diff_count != 0:
+                  raise RuntimeError(f"mismatch of {diff_count}/{b.numel()} items with type {b.dtype}, max {(b-bufs[0]).abs().max().item()}")
   if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
   if DEBUG >= 5: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
   return k
@@ -44,8 +64,9 @@ def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
 # **************** Runners ****************
 class Runner:
-  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
-    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
+  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0, lds_estimate:Optional[sint]=None):
+    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate, self.lds_estimate = \
+      True, display_name, dname, op_estimate, mem_estimate, mem_estimate if lds_estimate is None else lds_estimate
   @property
   def device(self): return Device[self.dname]
   def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
@@ -58,8 +79,9 @@ class CompiledRunner(Runner):
     if DEBUG >= 4: print(p.src)
     self.p:Program = p
     self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
+    if DEBUG >= 6: Device[p.dname].compiler.disassemble(self.lib)
     self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
-    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
+    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate, p.lds_estimate)
   def __reduce__(self): return self.__class__, (self.p, self.lib)
@@ -73,19 +95,13 @@ class CompiledRunner(Runner):
       self.p = replace(self.p, global_size=global_size, local_size=local_size)
     lra = {}
     if global_size:
-      lra['global_size'] = global_size
+      lra['global_size'] = tuple(global_size)
       assert len(global_size) == 3, "global size must have len 3"
     if local_size:
-      lra['local_size'] = local_size
+      lra['local_size'] = tuple(local_size)
       assert len(local_size) == 3, "local size must have len 3"
     return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
-class CustomOp(Runner):
-  def __init__(self, fxn):
-    self.fxn = fxn
-    super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
 class EmptyOp(Runner):
   def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
   def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
@@ -101,7 +117,8 @@ class BufferCopy(Runner):
     else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
     super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
   def copy(self, dest, src):
-    disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.device, 'io_uring') and hasattr(src.allocator.device, 'fd')
+    disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.device, 'io_uring') and \
+      getattr(src.allocator.device, 'fd', None) is not None
     if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_disk') and disk_supports_fast_copyout and src.nbytes >= 4096:
       dest.allocator.copy_from_disk(dest._buf, src._buf, src.nbytes)
     elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
@@ -119,24 +136,20 @@ class BufferCopy(Runner):
       return time.perf_counter() - st
 class BufferXfer(BufferCopy):
-  def copy(self, dest, src):
-    if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
-      dest.allocator.device.track_cross_buffer.append(src)
-      src.allocator.track_cross_device.add(dest.allocator.device)
-    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
+  def copy(self, dest, src): dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
 # **************** method cache ****************
-method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
-def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
-  ckey = (dname, ast, BEAM.value, False)
+method_cache: Dict[Tuple[str, bytes, int, int, bool], CompiledRunner] = {}
+def get_runner(dname:str, ast:UOp) -> CompiledRunner:
+  ckey = (dname, ast.key, BEAM.value, NOOPT.value, False)
   if cret:=method_cache.get(ckey): return cret
-  bkey = (dname.split(":")[0], ast, BEAM.value, True)
+  bkey = (dname.split(":")[0], ast.key, BEAM.value, NOOPT.value, True)
   if bret:=method_cache.get(bkey):
     method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib)
   else:
-    prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
-    if hasattr(prg.uops, "fuzz_paths"):
+    prg: Program = get_kernel(Device[dname].renderer, ast).to_program()
+    if getenv("FUZZ_UOPS"):
       from test.external.fuzz_uops import UOpsFuzzerRunner
       return UOpsFuzzerRunner(replace(prg, dname=dname))
     method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))
@@ -148,39 +161,51 @@ def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
 class ExecItem:
   prg: Runner
   bufs: List[Optional[Buffer]]
-  def run(self, var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]:
+  metadata: Optional[Tuple[Metadata, ...]] = None
+  def run(self, _var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]:
+    var_vals = {} if _var_vals is None else _var_vals
     bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in self.bufs]
-    et = self.prg(bufs, var_vals if var_vals is not None else {}, wait=wait or DEBUG >= 2)
+    et = self.prg(bufs, var_vals, wait=wait or DEBUG >= 2)
     if do_update_stats:
       GlobalCounters.kernel_count += 1
-      GlobalCounters.global_ops += (op_estimate:=sym_infer(self.prg.op_estimate, var_vals))
-      GlobalCounters.global_mem += (mem_estimate:=sym_infer(self.prg.mem_estimate, var_vals))
+      GlobalCounters.global_ops += (op_est:=sym_infer(self.prg.op_estimate, var_vals))
+      GlobalCounters.global_mem += (mem_est:=sym_infer(self.prg.mem_estimate, var_vals))
       if et is not None: GlobalCounters.time_sum_s += et
       if DEBUG >= 2:
+        lds_est = sym_infer(self.prg.lds_estimate, var_vals)
+        mem_est = min(mem_est, lds_est)   # there can't be more memory accessed than loads/stores. remove this when symbolic is fixed
         ptm = (colored(f"{et*1e3:9.2f}ms", "yellow") if et > 0.01 else f"{et*1e6:9.2f}us") if et is not None else ""
-        print(f"{colored(f'*** {self.prg.dname[:7]:7s} {GlobalCounters.kernel_count:4d}', 'magenta' if jit else ('green' if self.prg.first_run else None))} {self.prg.display_name+' '*(38-ansilen(self.prg.display_name))} arg {len(self.bufs):3d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " +  # noqa: E501
-              (str() if et is None else f"tm {ptm}/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_estimate/((et or 1e-20)*1e9):8.2f} GFLOPS, {mem_estimate/((et or 1e-20)*1e9):7.2f} GB/s)"))  # noqa: E501
+        print(f"{colored(f'*** {self.prg.dname[:7]:7s} {GlobalCounters.kernel_count:4d}', 'magenta' if jit else ('green' if self.prg.first_run else None))} {self.prg.display_name+' '*(41-ansilen(self.prg.display_name))} arg {len(bufs):2d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " +  # noqa: E501
+              (str() if et is None else f"tm {ptm}/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_est/((et or 1e-20)*1e9):9.2f} GFLOPS {mem_est/((et or 1e-20)*1e9):6.1f}|{lds_est/((et or 1e-20)*1e9):<7.1f} GB/s)" +  # noqa: E501
+               f" {[repr(m) if TRACEMETA >= 2 else str(m) for m in self.metadata] if self.metadata else ''}"))
       self.prg.first_run = False
     return et
 def lower_schedule_item(si:ScheduleItem) -> ExecItem:
-  assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL")
-  if si.ast[0].op is BufferOps.STORE:
+  assert len(set(x.device for x in si.bufs)) == 1 or si.ast.op is Ops.COPY
+  if si.ast.op is Ops.SINK:
     runner = get_runner(si.outputs[0].device, si.ast)
-    return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals])
-  out, ast = si.outputs[0], si.ast[0]
-  if ast.op is LoadOps.COPY:
+    return ExecItem(runner, [si.bufs[x] for x in runner.p.globals], si.metadata)
+  out, arg = si.outputs[0], si.ast.arg
+  if si.ast.op is Ops.COPY:
     kernel_type = BufferCopy
     if hasattr(Device[out.device].allocator, 'transfer') and out.device.split(":")[0] == si.inputs[0].device.split(":")[0]:
       kernel_type = BufferXfer
-    return ExecItem(kernel_type(ast.arg, out.device, si.inputs[0].device), list(si.bufs))
-  if ast.op is LoadOps.CUSTOM: return ExecItem(CustomOp(ast.arg), list(si.bufs))
-  if ast.op is LoadOps.EMPTY: return ExecItem(EmptyOp(out), list(si.bufs))
-  if ast.op is LoadOps.VIEW: return ExecItem(ViewOp(out), list(si.bufs))
-  raise RuntimeError(f"don't know how to lower {ast}")
+    return ExecItem(kernel_type(arg, out.device, si.inputs[0].device), list(si.bufs))
+  if si.ast.op is Ops.EMPTY: return ExecItem(EmptyOp(out), list(si.bufs))
+  if si.ast.op is Ops.BUFFER_VIEW: return ExecItem(ViewOp(out), list(si.bufs))
+  raise RuntimeError(f"don't know how to lower {si.ast}")
 def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, None]:
-  while len(schedule): yield lower_schedule_item(schedule.pop(0))
+  while len(schedule):
+    si = schedule.pop(0)
+    try: yield lower_schedule_item(si)
+    except Exception as e:
+      if DEBUG >= 2:
+        print(f"error lowering {si.ast.op}")
+        print("tensor operations:")
+        pprint.pprint(si.metadata, indent=2)
+      raise e
 # **************** main run function ****************

tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl