tinygrad-0.10.0-py3-none-any.whl → tinygrad-0.10.2-py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (88)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/engine/realize.py
@@ -1,11 +1,11 @@
-from typing import List, Dict, Optional, cast, Generator, Tuple
+from typing import Optional, cast, Generator
 import time, pprint
 from dataclasses import dataclass, replace
-from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, Context, TRACEMETA
-from tinygrad.ops import Ops, UOp, Variable, sym_infer, sint
-from tinygrad.dtype import dtypes
+from tinygrad.helpers import all_same, colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA
+from tinygrad.helpers import DEVECTORIZE, time_to_str
+from tinygrad.ops import Ops, PatternMatcher, UOp, UPat, Variable, sym_infer
 from tinygrad.device import Device, Buffer
-from tinygrad.renderer import Renderer, Program
+from tinygrad.renderer import Renderer, ProgramSpec, Estimates
 from tinygrad.codegen.kernel import Kernel
 from tinygrad.engine.schedule import ScheduleItem
 
@@ -13,50 +13,15 @@ from tinygrad.engine.schedule import ScheduleItem
 
 logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
 def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
-  if DEBUG >= 5:
-    print(ast)
+  if DEBUG >= 5: print(ast)
   k = Kernel(ast, opts=renderer).required_optimizations()
   if not NOOPT:
-    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
+    if not k.apply_tensor_cores(getenv("TC", 1)): k.hand_coded_optimizations()
     if BEAM >= 1:
-      from tinygrad.engine.search import beam_search, time_linearizer, bufs_from_lin
-      kb, k_opt = Kernel(ast, opts=renderer).required_optimizations(), k
+      from tinygrad.engine.search import beam_search, bufs_from_lin
+      kb = Kernel(ast, opts=renderer).required_optimizations()
       rawbufs = bufs_from_lin(kb, allocate=False)
-      if BEAM.value >= 100:
-        from extra.mcts_search import mcts_search
-        k = mcts_search(kb, rawbufs, BEAM.value)
-      else:
-        k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
-      if beam_compare:=getenv("BEAM_COMPARE", 1):
-        # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
-        lins: List[Tuple[str, Kernel]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
-        if used_tensor_cores: lins.append(("hc", Kernel(ast, opts=renderer).hand_coded_optimizations()))
-        timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-        if DEBUG >= 3: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-        k = timed[0][1]
-        if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
-        if beam_compare == 2:
-          from tinygrad import Tensor
-          all_outs: List[List[Tensor]] = []
-          with Context(DEBUG=0, BEAM=0, CAPTURING=0):
-            rand_bufs = [Tensor.normal(buf.size, std=0.1, dtype=buf.dtype).data() if dtypes.is_float(buf.dtype) else \
-                         (Tensor.randint(buf.size, low=0, high=2).cast(buf.dtype).data() if buf.dtype == dtypes.bool else \
-                          Tensor.randint(buf.size, low=dtypes.min(buf.dtype), high=dtypes.max(buf.dtype), dtype=buf.dtype).data()) \
-                         for buf in rawbufs]
-          for _, tk in lins[::-1]:
-            for buf,data in zip(rawbufs, rand_bufs): buf.ensure_allocated().copyin(data)
-            time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True, disable_cache=True)
-            all_outs.append([Tensor(bytes(buf.as_buffer()), dtype=buf.dtype) for buf in rawbufs[:len(ast.src)]])
-          with Context(DEBUG=0, BEAM=0, CAPTURING=0):
-            for bufs in zip(*all_outs):
-              for b in bufs[1:]:
-                if dtypes.is_float(bufs[0].dtype):
-                  # we check both atol and rtol here
-                  diff_count = (((b-bufs[0]).abs() > 1e-3) * (((b-bufs[0])/bufs[0]).abs() > 1e-3)).sum().item()
-                else:
-                  diff_count = (b != bufs[0]).sum().item()
-                if diff_count != 0:
-                  raise RuntimeError(f"mismatch of {diff_count}/{b.numel()} items with type {b.dtype}, max {(b-bufs[0]).abs().max().item()}")
+      k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
   if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
   if DEBUG >= 5: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
   return k
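
The kernel-selection path above is now driven entirely by context variables (TC, NOOPT, BEAM, LOGKERNS), with the MCTS and BEAM_COMPARE branches removed. A minimal sketch of exercising it directly on a scheduled kernel AST; the tensor shapes and the use of Device.DEFAULT are illustrative, not part of the diff:

from tinygrad import Tensor
from tinygrad.device import Device
from tinygrad.ops import Ops
from tinygrad.engine.realize import get_kernel

out = (Tensor.rand(16, 16) @ Tensor.rand(16, 16)).sum()
sink = next(si.ast for si in out.schedule() if si.ast.op is Ops.SINK)  # first kernel AST in the schedule
k = get_kernel(Device[Device.DEFAULT].renderer, sink)  # honors TC / NOOPT / BEAM from the environment
print(k.applied_opts)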
@@ -64,33 +29,32 @@ def get_kernel(renderer:Renderer, ast:UOp) -> Kernel:
 # **************** Runners ****************
 
 class Runner:
-  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0, lds_estimate:Optional[sint]=None):
-    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate, self.lds_estimate = \
-      True, display_name, dname, op_estimate, mem_estimate, mem_estimate if lds_estimate is None else lds_estimate
+  def __init__(self, display_name:str, device:str, estimates=Estimates()):
+    self.first_run, self.display_name, self.device, self.estimates = True, display_name, device, estimates
   @property
-  def device(self): return Device[self.dname]
-  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
+  def dev(self): return Device[self.device]
+  def exec(self, rawbufs:list[Buffer], var_vals:Optional[dict[Variable, int]]=None) -> Optional[float]:
     return self(rawbufs, {} if var_vals is None else var_vals)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+  def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> Optional[float]:
    raise NotImplementedError("override this")
 
 class CompiledRunner(Runner):
-  def __init__(self, p:Program, precompiled:Optional[bytes]=None):
+  def __init__(self, p:ProgramSpec, precompiled:Optional[bytes]=None):
     if DEBUG >= 4: print(p.src)
-    self.p:Program = p
-    self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
-    if DEBUG >= 6: Device[p.dname].compiler.disassemble(self.lib)
-    self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
-    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate, p.lds_estimate)
+    self.p:ProgramSpec = p
+    self.lib:bytes = precompiled if precompiled is not None else Device[p.device].compiler.compile_cached(p.src)
+    if DEBUG >= 6: Device[p.device].compiler.disassemble(self.lib)
+    self._prg = Device[p.device].runtime(p.function_name, self.lib)
+    super().__init__(p.name, p.device, p.estimates)
 
   def __reduce__(self): return self.__class__, (self.p, self.lib)
 
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+  def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> Optional[float]:
     global_size, local_size = self.p.launch_dims(var_vals)
     if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
       # TODO: this is copied from get_program
       from tinygrad.engine.search import optimize_local_size
-      local_size = optimize_local_size(self.clprg, global_size, rawbufs)
+      local_size = optimize_local_size(self._prg, global_size, rawbufs)
       global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
       self.p = replace(self.p, global_size=global_size, local_size=local_size)
     lra = {}
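
The Runner base class above now takes a single Estimates object in place of the old op_estimate/mem_estimate/lds_estimate arguments, and dname becomes device (with the old device property renamed to dev). A minimal sketch, assuming only the Estimates fields visible in this diff (ops, mem, lds), of a custom Runner written against the new signature; NoopRunner is a hypothetical name used for illustration:

from typing import Optional
from tinygrad.device import Buffer
from tinygrad.ops import Variable
from tinygrad.renderer import Estimates
from tinygrad.engine.realize import Runner

class NoopRunner(Runner):
  def __init__(self, device:str):
    # cost metadata now travels as one Estimates object instead of three loose arguments
    super().__init__("noop", device, Estimates(ops=0, mem=0, lds=0))
  def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> Optional[float]:
    return 0.0  # does nothing; exists only to show the new constructor shape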
@@ -100,33 +64,29 @@ class CompiledRunner(Runner):
     if local_size:
       lra['local_size'] = tuple(local_size)
       assert len(local_size) == 3, "local size must have len 3"
-    return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
-
-class EmptyOp(Runner):
-  def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
+    return self._prg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
 
 class ViewOp(Runner):
   def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+  def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False):
     assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
 
 class BufferCopy(Runner):
   def __init__(self, total_sz, dest_device, src_device):
     if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
     else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
-    super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
+    super().__init__(colored(name, "yellow"), dest_device, Estimates(lds=total_sz, mem=total_sz))
   def copy(self, dest, src):
-    disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.device, 'io_uring') and \
-                                 getattr(src.allocator.device, 'fd', None) is not None
+    disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.dev, 'io_uring') and \
+                                 getattr(src.allocator.dev, 'fd', None) is not None
     if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_disk') and disk_supports_fast_copyout and src.nbytes >= 4096:
       dest.allocator.copy_from_disk(dest._buf, src._buf, src.nbytes)
-    elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
+    elif src.device.startswith("DISK") and hasattr(dest.allocator, '_as_buffer'):
       # fast(ish) path, uses readinto in diskbuffers
-      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
+      src.allocator._copyout(dest.allocator._as_buffer(dest._buf), src._buf)
     else:
       dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+  def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False):
     dest, src = rawbufs[0:2]
     assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
     st = time.perf_counter()
@@ -136,23 +96,22 @@ class BufferCopy(Runner):
     return time.perf_counter() - st
 
 class BufferXfer(BufferCopy):
-  def copy(self, dest, src): dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
+  def copy(self, dest, src): dest.allocator._transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.dev, dest_dev=dest.allocator.dev)
 
 # **************** method cache ****************
 
-method_cache: Dict[Tuple[str, bytes, int, int, bool], CompiledRunner] = {}
-def get_runner(dname:str, ast:UOp) -> CompiledRunner:
-  ckey = (dname, ast.key, BEAM.value, NOOPT.value, False)
+method_cache: dict[tuple[str, bytes, tuple[int, ...], bool], CompiledRunner] = {}
+def get_runner(device:str, ast:UOp) -> CompiledRunner:
+  # TODO: this should be all context relevant to rendering
+  context = (BEAM.value, NOOPT.value, DEVECTORIZE.value)
+  ckey = (device, ast.key, context, False)
   if cret:=method_cache.get(ckey): return cret
-  bkey = (dname.split(":")[0], ast.key, BEAM.value, NOOPT.value, True)
+  bkey = (device.split(":")[0], ast.key, context, True)
   if bret:=method_cache.get(bkey):
-    method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib)
+    method_cache[ckey] = ret = CompiledRunner(replace(bret.p, device=device), bret.lib)
   else:
-    prg: Program = get_kernel(Device[dname].renderer, ast).to_program()
-    if getenv("FUZZ_UOPS"):
-      from test.external.fuzz_uops import UOpsFuzzerRunner
-      return UOpsFuzzerRunner(replace(prg, dname=dname))
-    method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))
+    prg: ProgramSpec = get_kernel(Device[device].renderer, ast).to_program()
+    method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, device=device))
   return ret
 
 # **************** lowering functions ****************
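
The method cache above folds the rendering-relevant context into one tuple and shares compiled binaries across devices that differ only in their ":N" suffix. An illustrative sketch of that behavior, assuming the default backend accepts a second ":1" instance:

from tinygrad import Tensor
from tinygrad.device import Device
from tinygrad.engine.realize import get_runner

sink = Tensor.ones(16).contiguous().schedule()[-1].ast  # any kernel AST (a SINK)
r0 = get_runner(Device.DEFAULT, sink)         # compiles; fills both the exact and base-device keys
r1 = get_runner(f"{Device.DEFAULT}:1", sink)  # base-key hit: reuses r0.lib, only ProgramSpec.device changes
assert r0.lib == r1.lib and r1.p.device == f"{Device.DEFAULT}:1"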
@@ -160,43 +119,38 @@ def get_runner(dname:str, ast:UOp) -> CompiledRunner:
 @dataclass(frozen=True)
 class ExecItem:
   prg: Runner
-  bufs: List[Optional[Buffer]]
-  metadata: Optional[Tuple[Metadata, ...]] = None
-  def run(self, _var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]:
+  bufs: list[Optional[Buffer]]
+  metadata: Optional[tuple[Metadata, ...]] = None
+  def run(self, _var_vals:Optional[dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]:
     var_vals = {} if _var_vals is None else _var_vals
     bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in self.bufs]
     et = self.prg(bufs, var_vals, wait=wait or DEBUG >= 2)
     if do_update_stats:
       GlobalCounters.kernel_count += 1
-      GlobalCounters.global_ops += (op_est:=sym_infer(self.prg.op_estimate, var_vals))
-      GlobalCounters.global_mem += (mem_est:=sym_infer(self.prg.mem_estimate, var_vals))
+      GlobalCounters.global_ops += (op_est:=sym_infer(self.prg.estimates.ops, var_vals))
+      GlobalCounters.global_mem += (mem_est:=sym_infer(self.prg.estimates.mem, var_vals))
       if et is not None: GlobalCounters.time_sum_s += et
       if DEBUG >= 2:
-        lds_est = sym_infer(self.prg.lds_estimate, var_vals)
+        lds_est = sym_infer(self.prg.estimates.lds, var_vals)
         mem_est = min(mem_est, lds_est) # there can't be more memory accessed than loads/stores. remove this when symbolic is fixed
-        ptm = (colored(f"{et*1e3:9.2f}ms", "yellow") if et > 0.01 else f"{et*1e6:9.2f}us") if et is not None else ""
-        print(f"{colored(f'*** {self.prg.dname[:7]:7s} {GlobalCounters.kernel_count:4d}', 'magenta' if jit else ('green' if self.prg.first_run else None))} {self.prg.display_name+' '*(41-ansilen(self.prg.display_name))} arg {len(bufs):2d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " + # noqa: E501
+        ptm = colored(time_to_str(et, w=9), "yellow" if et > 0.01 else None) if et is not None else ""
+        print(f"{colored(f'*** {self.prg.device[:7]:7s} {GlobalCounters.kernel_count:4d}', 'magenta' if jit else ('green' if self.prg.first_run else None))} {self.prg.display_name+' '*(41-ansilen(self.prg.display_name))} arg {len(bufs):2d} mem {GlobalCounters.mem_used/1e9:5.2f} GB " + # noqa: E501
           (str() if et is None else f"tm {ptm}/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_est/((et or 1e-20)*1e9):9.2f} GFLOPS {mem_est/((et or 1e-20)*1e9):6.1f}|{lds_est/((et or 1e-20)*1e9):<7.1f} GB/s)" + # noqa: E501
           f" {[repr(m) if TRACEMETA >= 2 else str(m) for m in self.metadata] if self.metadata else ''}"))
     self.prg.first_run = False
     return et
 
-def lower_schedule_item(si:ScheduleItem) -> ExecItem:
-  assert len(set(x.device for x in si.bufs)) == 1 or si.ast.op is Ops.COPY
-  if si.ast.op is Ops.SINK:
-    runner = get_runner(si.outputs[0].device, si.ast)
-    return ExecItem(runner, [si.bufs[x] for x in runner.p.globals], si.metadata)
-  out, arg = si.outputs[0], si.ast.arg
-  if si.ast.op is Ops.COPY:
-    kernel_type = BufferCopy
-    if hasattr(Device[out.device].allocator, 'transfer') and out.device.split(":")[0] == si.inputs[0].device.split(":")[0]:
-      kernel_type = BufferXfer
-    return ExecItem(kernel_type(arg, out.device, si.inputs[0].device), list(si.bufs))
-  if si.ast.op is Ops.EMPTY: return ExecItem(EmptyOp(out), list(si.bufs))
-  if si.ast.op is Ops.BUFFER_VIEW: return ExecItem(ViewOp(out), list(si.bufs))
-  raise RuntimeError(f"don't know how to lower {si.ast}")
-
-def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, None]:
+# NOTE: ctx is the buffers
+si_lowerer = PatternMatcher([
+  (UPat(Ops.SINK, name="sink"), lambda ctx,sink: (runner:=get_runner(ctx[0].device, sink), [ctx[x] for x in runner.p.globals])),
+  (UPat(Ops.BUFFER_VIEW), lambda ctx: (ViewOp(ctx[0]), list(ctx))),
+  (UPat(Ops.COPY, name="copy"), lambda ctx,copy: ((BufferXfer(ctx[0].nbytes, ctx[0].device, ctx[1].device) \
+      if hasattr(Device[ctx[0].device].allocator, '_transfer') and all_same([x.device.split(":")[0] for x in ctx]) \
+      else BufferCopy(ctx[0].nbytes, ctx[0].device, ctx[1].device)), list(ctx))),
+])
+def lower_schedule_item(si:ScheduleItem) -> ExecItem: return ExecItem(*cast(tuple[Runner,list], si_lowerer.rewrite(si.ast, si.bufs)), si.metadata)
+
+def lower_schedule(schedule:list[ScheduleItem]) -> Generator[ExecItem, None, None]:
   while len(schedule):
     si = schedule.pop(0)
     try: yield lower_schedule_item(si)
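
Lowering is now table-driven: si_lowerer rewrites the AST root (SINK, COPY, or BUFFER_VIEW) into a (Runner, bufs) pair and lower_schedule_item simply wraps the result in an ExecItem. A hedged sketch of that round trip; the empty tensors are placeholders, so the kernel runs on uninitialized inputs:

from tinygrad import Tensor
from tinygrad.engine.realize import lower_schedule_item

a, b = Tensor.empty(64, 64), Tensor.empty(64, 64)
si = (a @ b).schedule()[-1]   # the matmul ScheduleItem; its AST root is a SINK
ei = lower_schedule_item(si)  # si_lowerer.rewrite(si.ast, si.bufs) -> (CompiledRunner, bufs)
print(ei.prg.display_name, len(ei.bufs))
ei.run()                      # ensure_allocated() on the bufs, then launch the kernel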
@@ -209,9 +163,9 @@ def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, Non
 
 # **************** main run function ****************
 
-capturing: List = [] # put classes with an add method in here
+capturing: list = [] # put classes with an add method in here
 
-def run_schedule(schedule:List[ScheduleItem], var_vals:Optional[Dict[Variable, int]]=None, do_update_stats=True):
+def run_schedule(schedule:list[ScheduleItem], var_vals:Optional[dict[Variable, int]]=None, do_update_stats=True):
   for ei in lower_schedule(schedule):
     if len(capturing) and CAPTURING: capturing[0].add(ei)
     ei.run(var_vals, do_update_stats=do_update_stats)
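
For completeness, a hedged sketch of driving this run loop by hand, which is roughly what Tensor.realize() does internally; Tensor.schedule_with_vars() is assumed to be the 0.10.2 entry point returning (schedule, var_vals):

from tinygrad import Tensor
from tinygrad.engine.realize import run_schedule

y = (Tensor.rand(256) * 2 + 1).sum()
schedule, var_vals = y.schedule_with_vars()  # list[ScheduleItem], dict[Variable, int]
run_schedule(schedule, var_vals)             # lowers each item to an ExecItem and runs it
print(y.item())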