tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +78 -90
- tinygrad/codegen/linearizer.py +237 -169
- tinygrad/codegen/uops.py +278 -242
- tinygrad/device.py +147 -10
- tinygrad/dtype.py +7 -7
- tinygrad/engine/graph.py +16 -16
- tinygrad/engine/jit.py +39 -36
- tinygrad/engine/realize.py +6 -5
- tinygrad/engine/schedule.py +15 -7
- tinygrad/engine/search.py +6 -3
- tinygrad/function.py +17 -23
- tinygrad/helpers.py +77 -8
- tinygrad/lazy.py +26 -26
- tinygrad/multi.py +13 -9
- tinygrad/nn/__init__.py +1 -1
- tinygrad/nn/datasets.py +2 -1
- tinygrad/nn/state.py +3 -4
- tinygrad/ops.py +49 -16
- tinygrad/renderer/__init__.py +8 -4
- tinygrad/renderer/assembly.py +93 -100
- tinygrad/renderer/cstyle.py +47 -42
- tinygrad/renderer/llvmir.py +30 -30
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +11504 -1
- tinygrad/runtime/autogen/comgr.py +36 -10
- tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +20 -11
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +3 -2
- tinygrad/runtime/graph/cuda.py +2 -2
- tinygrad/runtime/graph/hcq.py +122 -78
- tinygrad/runtime/ops_amd.py +302 -316
- tinygrad/runtime/ops_cuda.py +3 -3
- tinygrad/runtime/ops_disk.py +70 -5
- tinygrad/runtime/ops_gpu.py +2 -2
- tinygrad/runtime/ops_metal.py +5 -6
- tinygrad/runtime/ops_npy.py +1 -1
- tinygrad/runtime/ops_nv.py +161 -166
- tinygrad/runtime/ops_python.py +20 -16
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +5 -2
- tinygrad/shape/symbolic.py +1 -3
- tinygrad/shape/view.py +34 -19
- tinygrad/tensor.py +219 -135
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0.dist-info/RECORD +0 -60
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/device.py
CHANGED
@@ -2,9 +2,9 @@ from __future__ import annotations
 import multiprocessing
 from dataclasses import dataclass
 from collections import defaultdict
-from typing import List, Optional, Dict, Tuple, Any
-import importlib, inspect, functools, pathlib, os, ctypes
-from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
+from typing import List, Optional, Dict, Tuple, Any, cast
+import importlib, inspect, functools, pathlib, os, ctypes, atexit, time, contextlib
+from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, ProfileLogger, PROFILE
 from tinygrad.dtype import DType, ImageDType
 from tinygrad.renderer import Renderer

@@ -19,15 +19,17 @@ class _Device:
   def __getitem__(self, ix:str) -> Compiled: return self.__get_canonicalized_item(self.canonicalize(ix))
   @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
   def __get_canonicalized_item(self, ix:str) -> Compiled:
-
-
+    assert ((cpn:=multiprocessing.current_process().name) == "MainProcess") or ix.split(":")[0] in ["DISK", "NPY"], \
+      f"can only open device {ix} from parent, not {cpn}"
     x = ix.split(":")[0].upper()
-
+    ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._devices][0](ix)  # noqa: E501
+    if DEBUG >= 1: print(f"opened device {ix} from pid:{os.getpid()}")
+    return ret
   @functools.cached_property
   def DEFAULT(self) -> str:
     device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._devices, None)  # type: ignore
     if device_from_env: return device_from_env
-    for device in ["METAL", "
+    for device in ["METAL", "AMD", "CUDA", "GPU", "CLANG", "LLVM"]:
       try:
         if self[device]:
           os.environ[device] = "1"  # we set this in environment for spawned children
@@ -171,13 +173,148 @@ class Compiler:
   def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
   def compile_cached(self, src:str) -> bytes:
     if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
-      assert not getenv("ASSERT_COMPILE"), "tried to compile with ASSERT_COMPILE set"
+      assert not getenv("ASSERT_COMPILE"), f"tried to compile with ASSERT_COMPILE set\n{src}"
       lib = self.compile(src)
       if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
     return lib

 class Compiled:
   def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
-    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler
-    self.renderer = renderer
+    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph
+    self.renderer = renderer or Renderer()
   def synchronize(self): pass  # override this in your device
+
+# **************** for HCQ Compatible Devices ****************
+
+@contextlib.contextmanager
+def hcq_profile(dev, queue_type, enabled, desc):
+  st, en = (dev._get_signal(), dev._get_signal()) if enabled else (None, None)
+  if enabled: queue_type().timestamp(st).submit(dev)
+  try: yield (st, en)
+  finally:
+    if enabled: queue_type().timestamp(en).submit(dev)
+    if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
+
+class HCQCompatCompiled(Compiled):
+  def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, comp_queue_t, copy_queue_t, timeline_signals):
+    self.hw_compute_queue_t, self.hw_copy_queue_t = comp_queue_t, copy_queue_t
+    self.timeline_value: int = 1
+    self.timeline_signal, self._shadow_timeline_signal = timeline_signals
+    self.sig_prof_records: List[Tuple[Any, Any, str, bool]] = []
+    self.raw_prof_records: List[Tuple[int, int, str, bool]] = []
+    if PROFILE: self._prof_setup()
+
+    from tinygrad.runtime.graph.hcq import HCQGraph
+    super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
+
+  @classmethod
+  def _read_signal(self, sig): raise NotImplementedError("need _read_signal")  # reads a value for a signal
+
+  @classmethod
+  def _read_timestamp(self, sig): raise NotImplementedError("need _read_timestamp")  # reads a timestamp for a signal
+
+  @classmethod
+  def _set_signal(self, sig, value): raise NotImplementedError("need _set_signal")  # sets a value for a signal
+
+  @classmethod
+  def _get_signal(self, value=0, **kwargs): raise NotImplementedError("need _get_signal")  # allocates a new signal
+
+  @classmethod
+  def _wait_signal(self, signal, value=0, timeout=10000): raise NotImplementedError("need _wait_signal")  # waits for a signal value
+
+  def _gpu2cpu_time(self, gpu_time, is_copy): raise NotImplementedError("need _gpu2cpu_time")
+
+  def _prof_setup(self):
+    self.profile_logger = ProfileLogger()
+
+    def _sync_queue(q_t):
+      q_t().timestamp(self.timeline_signal).signal(self.timeline_signal, self.timeline_value).submit(self)
+      self.timeline_value += 1
+      cpu_start_time = time.perf_counter_ns() / 1e3
+      self._wait_signal(self.timeline_signal, self.timeline_value - 1)
+      return cpu_start_time, self._read_timestamp(self.timeline_signal)
+    self.cpu_start_time, self.gpu_start_time = _sync_queue(self.hw_compute_queue_t)
+    self.copy_cpu_start_time, self.copy_gpu_start_time = _sync_queue(self.hw_copy_queue_t)
+
+    atexit.register(self._prof_finalize)
+
+  def _prof_process_events(self):
+    self.raw_prof_records += [(self._read_timestamp(st), self._read_timestamp(en), name, is_cp) for st, en, name, is_cp in self.sig_prof_records]
+    for st, en, _, _ in self.sig_prof_records: self.signals_pool += [st, en]  # type: ignore
+    self.sig_prof_records = []
+
+  def _prof_finalize(self):
+    for st, en, name, is_cp in self.raw_prof_records:
+      self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.dname, ["COMPUTE", "DMA"][is_cp])]
+    del self.profile_logger
+
+  def _wrap_timeline_signal(self):
+    self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
+    self._set_signal(self.timeline_signal, 0)
+    cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)
+
+class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
+  def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
+    self.device = device
+    self.b = [self._alloc(batch_size, BufferOptions(host=True)) for _ in range(batch_cnt)]
+    self.b_timeline, self.b_next = [0] * len(self.b), 0
+    super().__init__()
+
+  def copyin(self, dest, src: memoryview):
+    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
+      for i in range(0, src.nbytes, self.b[0].size):
+        self.b_next = (self.b_next + 1) % len(self.b)
+        self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
+        ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i))
+        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                                     .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
+                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+        self.b_timeline[self.b_next] = self.device.timeline_value
+        self.device.timeline_value += 1
+
+  def copy_from_disk(self, dest, src, size):
+    def _get_temp_buf():
+      # Check if the next buffer is safe to be used (its signal has passed) and reserve it.
+      if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device._read_signal(self.device.timeline_signal):
+        self.b_timeline[(self.b_next + 1) % len(self.b)], self.b_next = (1 << 64), (self.b_next + 1) % len(self.b)
+        return (self.b[self.b_next].va_addr, self.b_next)
+      return None
+
+    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
+      for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
+        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                                     .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
+                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+        self.b_timeline[batch_info[1]] = self.device.timeline_value
+        self.device.timeline_value += 1
+
+  def copyout(self, dest:memoryview, src):
+    self.device.synchronize()
+
+    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
+      for i in range(0, dest.nbytes, self.b[0].size):
+        self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
+                                     .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
+                                     .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
+        self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value)
+        self.device.timeline_value += 1
+
+        ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
+
+  def transfer(self, dest, src, sz: int, src_dev, dest_dev):
+    src_dev._gpu_map(dest)
+
+    with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
+      src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
+                               .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
+                               .copy(dest.va_addr, src.va_addr, sz) \
+                               .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev)
+      src_dev.timeline_value += 1
+
+      if src_dev != dest_dev:
+        dest_dev.hw_compute_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
+                                     .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
+                                     .signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev)
+        dest_dev.timeline_value += 1
+
+  def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
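The new HCQ layer above wraps queue submissions in hcq_profile, which allocates two signals, stamps one before and one after the profiled work, and records the pair when PROFILE is set. Below is a minimal sketch of that control flow, assuming the import path from this diff; FakeDev and FakeQueue are hypothetical stand-ins that only implement what hcq_profile touches (real backends subclass HCQCompatCompiled):

# Hedged sketch, not from the diff: stand-in objects to make the two timestamp submissions visible.
from tinygrad.device import hcq_profile

class FakeQueue:
  def timestamp(self, sig): sig.append("ts"); return self   # pretend to enqueue a timestamp write
  def submit(self, dev): dev.submitted += 1; return self     # pretend to submit the queue to the device

class FakeDev:
  hw_copy_queue_t = FakeQueue
  def __init__(self): self.submitted, self.sig_prof_records = 0, []
  def _get_signal(self): return []                           # a "signal" is just a list here

dev = FakeDev()
with hcq_profile(dev, FakeQueue, enabled=True, desc="example") as (st, en):
  pass  # the profiled work runs here, bracketed by the two timestamp submissions
assert dev.submitted == 2 and st == ["ts"] and en == ["ts"]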
tinygrad/dtype.py
CHANGED
@@ -1,6 +1,5 @@
 from typing import Final, Optional, ClassVar, Set, Tuple, Dict, Union
 from dataclasses import dataclass
-import numpy as np  # TODO: remove numpy
 import functools
 from tinygrad.helpers import getenv

@@ -18,9 +17,6 @@ class DType:
     assert sz > 1 and self.count == 1, f"can't vectorize {self} with size {sz}"
     return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self.name]}{sz}", None, sz)
   def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.count))]] if self.count > 1 else self
-  # TODO: someday this will be removed with the "remove numpy" project
-  @property
-  def np(self) -> Optional[type]: return np.dtype(self.fmt).type if self.fmt is not None else None

 # dependent typing?
 @dataclass(frozen=True, repr=False)
@@ -47,9 +43,13 @@ class dtypes:
   @staticmethod
   def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
   @staticmethod
-  def
-
-
+  def from_py(x) -> DType:
+    if x.__class__ is float: return dtypes.default_float
+    if x.__class__ is int: return dtypes.default_int
+    if x.__class__ is bool: return dtypes.bool
+    # put this in the last is faster because there are more items than lists/tuples to check
+    if x.__class__ is list or x.__class__ is tuple: return max(dtypes.from_py(xi) for xi in x) if x else dtypes.default_float
+    raise RuntimeError(f"Could not infer dtype of {x} with type {type(x)}")
   @staticmethod
   def as_const(val: ConstType, dtype:DType): return int(val) if dtypes.is_int(dtype) else float(val) if dtypes.is_float(dtype) else bool(val)
   @staticmethod
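With DType.np gone, Python values are mapped to dtypes by the new dtypes.from_py shown above. A hedged example of its behavior; default_int/default_float are int32/float32 unless overridden at runtime:

# Hedged example of dtypes.from_py from the hunk above.
from tinygrad.dtype import dtypes

assert dtypes.from_py(True) == dtypes.bool
assert dtypes.from_py(3) == dtypes.default_int           # plain ints map to default_int
assert dtypes.from_py(3.0) == dtypes.default_float       # plain floats map to default_float
assert dtypes.from_py([1, 2.0]) == dtypes.default_float  # containers take the max dtype of their elements
assert dtypes.from_py([]) == dtypes.default_float        # empty lists/tuples fall back to default_float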
tinygrad/engine/graph.py
CHANGED
@@ -1,15 +1,14 @@
-import os, atexit, functools
+import os, atexit, functools, contextlib
 from collections import defaultdict
-from typing import List, Any, DefaultDict
+from typing import List, Any, DefaultDict, Union
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LoadOps, BufferOps, TernaryOps, LazyOp
 from tinygrad.device import Device
 from tinygrad.helpers import GRAPHPATH, DEBUG, GlobalCounters, getenv
-from tinygrad.codegen.
+from tinygrad.codegen.uops import UOps, UOp, UPat
 from tinygrad.shape.symbolic import NumNode
 from tinygrad.lazy import LazyBuffer

-
-except ImportError: pass
+with contextlib.suppress(ImportError): import networkx as nx

 # **** debugging and graphing ****

@@ -61,7 +60,7 @@ def log_lazybuffer(lb:'LazyBuffer', scheduled=False):
     for idx,x in enumerate(lb.srcs):
       if nm(x) not in G.nodes: log_lazybuffer(x)
       if x.base.realized is None and x.base.op is LoadOps.CONST:
-        label_append.append(f"\nCONST{idx} {x.base.arg}")
+        label_append.append(f"\nCONST{idx} {x.base.arg:g}")
       else:
         G.add_edge(nm(x), nm(lb), color='#a0a0a0')
     label = '"' + \
@@ -75,18 +74,19 @@ def log_lazybuffer(lb:'LazyBuffer', scheduled=False):
     # realized but unseen?
     G.add_node(nm(lb), label=f'"{str(lb.base.realized)[5:-1].replace(" ", chr(10))}\nb:{nm(lb.realized)}"', style='filled', fillcolor="#f0c08080")

-def _tree(
+def _tree(dag:Union[LazyOp, UOp, UPat], cycles, cnt):
   cnt[0] += 1
-  if
-  if (
-
+  src = dag.src if isinstance(dag.src, (list, tuple)) else [] if dag.src is None else [dag.src]
+  if len(src) == 0: return [f"━━ {dag.op} {dag.arg}"]
+  if (lid := id(dag)) in cycles and cycles[lid][1] > (tcnt := getenv("TREE_CYCLE_CNT", 5)) and tcnt >= 0:
+    return [f"━⬆︎ goto {cycles[id(dag)][0]}: {dag.op}"]
   cycles[lid] = (cnt[0], 1 if lid not in cycles else cycles[lid][1]+1)
-  lines = [f"━┳ {
-  childs = [_tree(c, cycles, cnt) for c in
+  lines = [f"━┳ {dag.op} {dag.arg}"]
+  childs = [_tree(c, cycles, cnt) for c in src]
   for c in childs[:-1]: lines += [f" ┣{c[0]}"] + [f" ┃{l}" for l in c[1:]]
   return lines + [" ┗"+childs[-1][0]] + [" "+l for l in childs[-1][1:]]

-def print_tree(
+def print_tree(dag:Union[LazyOp, UOp, UPat]): print("\n".join([f"{str(i).rjust(3)} {s}" for i,s in enumerate(_tree(dag, {}, [-1]))]))

 def graph_uops(uops:List[UOp]):
   colors = {UOps.ALU: "#ffffc0", UOps.LOAD: "#ffc0c0", UOps.STORE: "#c0ffc0", UOps.SPECIAL: "#c0c0ff", UOps.CONST: "#e0e0e0",
@@ -94,7 +94,7 @@ def graph_uops(uops:List[UOp]):
             UOps.RANGE: "#c8a0e0", UOps.PHI: "#e0ffc0", UOps.BARRIER: "#ff8080", UOps.IF: "#c8b0c0"}
   G = nx.DiGraph()
   for u in uops:
-    if u.
-    G.add_node(uops.index(u), label=f"{str(u.
-    for v in u.
+    if u.op in {UOps.ENDRANGE, UOps.ENDIF}: continue
+    G.add_node(uops.index(u), label=f"{str(u.op)[5:]}{(' '+str(u.arg).replace(':', '')) if u.arg is not None else ''}\n{str(u.dtype)}", style="filled", fillcolor=colors.get(u.op, "#ffffff"))  # noqa: E501
+    for v in u.src: G.add_edge(uops.index(v), uops.index(u))
   save_graph(G, f'{GRAPHPATH}.uops', '-Grankdir=LR')
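print_tree and _tree now take a LazyOp, UOp, or UPat and walk whatever .src holds (a tuple, a single node, or None). A small hedged sketch with a hypothetical Node stand-in, since the walker only relies on .op/.arg/.src duck typing; real callers pass a LazyOp or UOp:

# Hedged sketch: Node is a hypothetical stand-in, not a tinygrad class.
from tinygrad.engine.graph import print_tree

class Node:
  def __init__(self, op, arg=None, src=()): self.op, self.arg, self.src = op, arg, src

print_tree(Node("ADD", src=(Node("CONST", 1), Node("CONST", 2))))
# prints roughly:
#   0 ━┳ ADD None
#   1  ┣━━ CONST 1
#   2  ┗━━ CONST 2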
tinygrad/engine/jit.py
CHANGED
@@ -3,7 +3,7 @@ from typing import TypeVar, Generic, Callable, List, Tuple, Union, Dict, cast, O
 import functools, itertools, collections
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyBuffer
-from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
+from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, ContextVar, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
 from tinygrad.device import Buffer, Compiled, Device
 from tinygrad.dtype import DType
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -41,7 +41,7 @@ def apply_graph_to_jit(jit_cache: List[ExecItem], input_rawbuffers: List[Buffer]
     if ji.prg.__class__ in {EmptyOp, ViewOp}: continue
     ji_graph_dev: Optional[Compiled] = None  # device on which the ji will be graphed. Not graphed if None.
     if isinstance(ji.prg, CompiledRunner): ji_graph_dev = ji.prg.device
-    elif isinstance(ji.prg, BufferXfer) and ji.bufs[0] and ji.bufs[0].device.split(":", 1)[0] in {"
+    elif isinstance(ji.prg, BufferXfer) and ji.bufs[0] and ji.bufs[0].device.split(":", 1)[0] in {"CUDA", "NV", "AMD"}:
       ji_graph_dev = Device[ji.bufs[0].device]

     graph_class = (ji_graph_dev.graph.func if isinstance(ji_graph_dev.graph, functools.partial) else ji_graph_dev.graph) if ji_graph_dev else None #type: ignore
@@ -82,7 +82,7 @@ class GraphRunner(Runner):  # pylint: disable=abstract-method
       if ji.prg.p.vars: self.jc_idx_with_updatable_var_vals.append(j)
       if (ji.prg.p.global_size and not all_int(ji.prg.p.global_size)) or (ji.prg.p.local_size and not all_int(ji.prg.p.local_size)):
         self.jc_idx_with_updatable_launch_dims.append(j)
-    self.vars =
+    self.vars = sorted(var_vals.keys(), key=lambda v: v.expr)
     super().__init__(colored(f"<batched {len(self.jit_cache)}>", "cyan"), jit_cache[0].prg.dname.split(":")[0], op_estimate, mem_estimate)

 class MultiGraphRunner(GraphRunner):  # pylint: disable=abstract-method
@@ -97,15 +97,16 @@ class MultiGraphRunner(GraphRunner):  # pylint: disable=abstract-method
     wait_nodes = []

     for rawbuf in read + write:
-      if id(rawbuf._buf) in self.w_dependency_map: wait_nodes.append(self.w_dependency_map[id(rawbuf._buf)])
+      if id(rawbuf.base._buf) in self.w_dependency_map: wait_nodes.append(self.w_dependency_map[id(rawbuf.base._buf)])
     for rawbuf in write:
-      if id(rawbuf._buf) in self.r_dependency_map: wait_nodes.extend(self.r_dependency_map.pop(id(rawbuf._buf)))
+      if id(rawbuf.base._buf) in self.r_dependency_map: wait_nodes.extend(self.r_dependency_map.pop(id(rawbuf.base._buf)))

-    for rawbuf in read: self.r_dependency_map[id(rawbuf._buf)].append(new_dependency)
-    for rawbuf in write: self.w_dependency_map[id(rawbuf._buf)] = new_dependency
+    for rawbuf in read: self.r_dependency_map[id(rawbuf.base._buf)].append(new_dependency)
+    for rawbuf in write: self.w_dependency_map[id(rawbuf.base._buf)] = new_dependency
     return list({id(x):x for x in wait_nodes}.values())

 ReturnType = TypeVar('ReturnType')
+IN_JIT = ContextVar('IN_JIT', 0)
 class TinyJit(Generic[ReturnType]):
   def __init__(self, fxn:Callable[..., ReturnType]):
     self.fxn = fxn
@@ -134,25 +135,26 @@ class TinyJit(Generic[ReturnType]):

   def __call__(self, *args, **kwargs) -> ReturnType:
     input_tensors: List[Tuple[Union[int, str], Tensor]] = \
-      [(cast(Union[int, str],
-    if
-
-
-
-
-
-
-
-    if self.cnt == 0:
+      [(cast(Union[int, str], name),t) for name,t in itertools.chain(enumerate(args), sorted(kwargs.items())) if t.__class__ is Tensor]
+    if input_tensors: Tensor.realize(*[t for _,t in input_tensors])
+    names: List[Union[int, str]] = [name for name,_ in input_tensors]
+    lbs: List[LazyBuffer] = flatten([t.lazydata.lbs for _,t in input_tensors])
+    st_varvals_dtype_device = [(*lb.st.unbind(), lb.dtype, lb.device) for lb in lbs]
+    input_buffers: List[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
+    assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT"
+    var_vals: Dict[Variable, int] = merge_dicts([varvals for _,varvals,_,_ in st_varvals_dtype_device] + \
+      [dict(v.unbind() for v in itertools.chain(args, kwargs.values()) if isinstance(v, Variable))])
+    st_vars_dtype_device = [(x[0], tuple(sorted(x[1].keys(), key=lambda v: v.expr)), x[2], x[3]) for x in st_varvals_dtype_device]
+    if not JIT or self.cnt == 0:
+      if IN_JIT: raise RuntimeError("having TinyJit inside another TinyJit is not supported")
       # jit ignore
-      with Context(BEAM=0 if getenv("IGNORE_JIT_FIRST_BEAM") else BEAM.value):
+      with Context(BEAM=0 if getenv("IGNORE_JIT_FIRST_BEAM") else BEAM.value, IN_JIT=1):
         self.ret = self.fxn(*args, **kwargs)
         if len(params:=get_parameters(self.ret)): Tensor.realize(params[0], *params[1:])
     elif self.cnt == 1:
       # jit capture
-      self.expected_names: List[Union[int, str]] =
-      self.
+      self.expected_names: List[Union[int, str]] = names
+      self.expected_st_vars_dtype_device: List[Tuple[ShapeTracker, Tuple[Variable, ...], DType, str]] = st_vars_dtype_device
       with Context(GRAPH=getenv("JITGRAPH", GRAPH.value), BEAM=getenv("JITBEAM", BEAM.value)):
         capturing.append(self)
         self.ret = self.fxn(*args, **kwargs)
@@ -160,31 +162,32 @@ class TinyJit(Generic[ReturnType]):
         capturing.clear()
       del self.buffer_replace
       assert len(self.jit_cache), "didn't JIT anything!"
-      if DEBUG >= 1: print(f"JIT captured {len(self.jit_cache)} kernels with {len(
+      if DEBUG >= 1: print(f"JIT captured {len(self.jit_cache)} kernels with {len(input_buffers)} inputs")

       # track inputs that are views of buffers
-      for
-      for b in
-      if b is not None and b._base is not None and b._base in
-
-      self.extra_view_inputs.append((
+      for item in self.jit_cache:
+        for b in item.bufs:
+          if b is not None and b._base is not None and b._base in input_buffers:
+            input_buffers.append(b)
+            self.extra_view_inputs.append((input_buffers.index(b.base), b.offset, b.device, b.size, b.dtype))

       # memory planning (optional)
-      assigned = _internal_memory_planner([cast(List[Buffer],
-      self.jit_cache = [ExecItem(
+      assigned = _internal_memory_planner([cast(List[Buffer], item.bufs) for item in self.jit_cache], debug_prefix="JIT ")
+      self.jit_cache = [ExecItem(item.prg, [assigned.get(b,b).ensure_allocated() for b in item.bufs if b is not None]) for item in self.jit_cache]

       # Condense the items into a graph executor.
-      if JIT < 2: self.jit_cache = apply_graph_to_jit(self.jit_cache,
+      if JIT < 2: self.jit_cache = apply_graph_to_jit(self.jit_cache, input_buffers, var_vals)

-      self.input_replace = get_input_replace(self.jit_cache,
-      if DEBUG >= 1 and len(set(self.input_replace.values())) != len(
+      self.input_replace = get_input_replace(self.jit_cache, input_buffers)
+      if DEBUG >= 1 and len(set(self.input_replace.values())) != len(input_buffers): print("WARNING: some input tensors not found")
     elif self.cnt >= 2:
       # jit exec
-      assert self.expected_names ==
-      assert self.
+      assert self.expected_names == names, f"args mismatch in JIT: {self.expected_names=} != {names}"
+      assert self.expected_st_vars_dtype_device == st_vars_dtype_device, \
+        f"args mismatch in JIT: {self.expected_st_vars_dtype_device=} != {st_vars_dtype_device=}"
       for idx, offset, device, size, dtype in self.extra_view_inputs:
-
-      for (j,i),input_idx in self.input_replace.items(): self.jit_cache[j].bufs[i] =
+        input_buffers.append(Buffer(device, size, dtype, base=input_buffers[idx], offset=offset).ensure_allocated())
+      for (j,i),input_idx in self.input_replace.items(): self.jit_cache[j].bufs[i] = input_buffers[input_idx]
       if DEBUG >= 1 and len(self.jit_cache) >= 10: print(f"jit execs {len(self.jit_cache)} kernels")
     for ei in self.jit_cache: ei.run(var_vals, jit=True)

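TinyJit now realizes its tensor inputs up front, keys replays on a per-input (ShapeTracker, variables, dtype, device) signature, and the new IN_JIT ContextVar makes a TinyJit called inside another TinyJit raise RuntimeError. A hedged usage sketch of the call-count behavior:

# Hedged usage sketch of TinyJit after this change.
from tinygrad import Tensor
from tinygrad.engine.jit import TinyJit

@TinyJit
def step(x: Tensor) -> Tensor:
  return (x * 2 + 1).realize()

for _ in range(3):              # call 0 runs eagerly, call 1 captures, calls >= 2 replay the captured kernels
  out = step(Tensor.rand(4, 4))
print(out.numpy().shape)        # (4, 4); shapes, dtypes and devices must match the captured signature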
tinygrad/engine/realize.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, cast, Generator, Tuple
 import time
 from dataclasses import dataclass, replace
-from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int
+from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING
 from tinygrad.ops import BufferOps, LoadOps, LazyOp
 from tinygrad.device import Device, Buffer
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
@@ -38,7 +38,7 @@ def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
       if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
   # TODO: check the correctness inline once compare_linearizer is in core
   if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
-  if DEBUG >=
+  if DEBUG >= 5: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
   return k

 # **************** Runners ****************
@@ -101,8 +101,9 @@ class BufferCopy(Runner):
     else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
     super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
   def copy(self, dest, src):
-
-
+    disk_supports_fast_copyout = src.device.startswith("DISK") and hasattr(src.allocator.device, 'io_uring') and hasattr(src.allocator.device, 'fd')
+    if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_disk') and disk_supports_fast_copyout and src.nbytes >= 4096:
+      dest.allocator.copy_from_disk(dest._buf, src._buf, src.nbytes)
     elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
       # fast(ish) path, uses readinto in diskbuffers
      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
@@ -187,5 +188,5 @@ capturing: List = []  # put classes with an add method in here

 def run_schedule(schedule:List[ScheduleItem], var_vals:Optional[Dict[Variable, int]]=None, do_update_stats=True):
   for ei in lower_schedule(schedule):
-    if len(capturing): capturing[0].add(ei)
+    if len(capturing) and CAPTURING: capturing[0].add(ei)
     ei.run(var_vals, do_update_stats=do_update_stats)
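run_schedule now only hands ExecItems to a capturing JIT when the CAPTURING ContextVar is set, which is how _time_program in search.py runs its cache-clearing kernel without polluting a capture. A hedged sketch of the same idea, assuming Context accepts the CAPTURING ContextVar defined in tinygrad.helpers:

# Hedged sketch: work run under Context(CAPTURING=0) executes but is not recorded by an enclosing capture.
from tinygrad import Tensor
from tinygrad.engine.jit import TinyJit
from tinygrad.helpers import Context

@TinyJit
def step(x: Tensor) -> Tensor:
  with Context(CAPTURING=0):
    Tensor.ones(64, 64).contiguous().realize()  # runs, but is not added to the JIT cache
  return (x + 1).realize()

for _ in range(3): out = step(Tensor.rand(64, 64))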
tinygrad/engine/schedule.py
CHANGED
@@ -1,12 +1,12 @@
 import sys, pickle, atexit
 from collections import defaultdict, deque
 from dataclasses import dataclass
-from typing import Tuple, List, Dict, Optional, Set, DefaultDict, Union
+from typing import Tuple, List, Dict, Optional, Set, DefaultDict, Union, get_args
 from tinygrad.ops import LoadOps, BufferOps, LazyOp, ReduceOps, ConstBuffer, MemBuffer, UNSAFE_PAD_OPS, UnaryOps
 from tinygrad.engine.graph import log_lazybuffer, realized_lazybuffer
-from tinygrad.helpers import GRAPH, DEBUG, MULTIOUTPUT, SAVE_SCHEDULE, GlobalCounters, prod, dedup, all_int, merge_dicts, getenv
+from tinygrad.helpers import GRAPH, DEBUG, MULTIOUTPUT, SAVE_SCHEDULE, GlobalCounters, colored, prod, dedup, all_int, merge_dicts, getenv
 from tinygrad.shape.symbolic import Variable
-from tinygrad.dtype import ImageDType, dtypes, DType
+from tinygrad.dtype import ConstType, ImageDType, dtypes, DType
 from tinygrad.lazy import LazyBuffer
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.device import Buffer
@@ -56,8 +56,13 @@ def _recursive_lazyop(buf:LazyBuffer, inputs:List[LazyBuffer], outputs:Tuple[Laz
   if buf.op is LoadOps.CONST:
     unbound_st, st_var_vals = st.simplify().unbind()
     var_vals.update(st_var_vals)
-    if isinstance(buf.arg, Variable):
-
+    if isinstance(buf.arg, Variable):
+      val, var_val = buf.arg.unbind()
+      var_vals.__setitem__(val, var_val)
+    else:
+      assert isinstance(buf.arg, get_args(ConstType)), f"cannot create ConstBuffer with value {buf.arg}"
+      val = buf.arg
+    return LazyOp(BufferOps.CONST, (), ConstBuffer(val, buf.dtype, unbound_st))

   # if we aren't fusing it, it's a load and we add it to the inputs
   if buf.realized is not None or (buf in realizes and buf not in outputs):
@@ -69,7 +74,8 @@ def _recursive_lazyop(buf:LazyBuffer, inputs:List[LazyBuffer], outputs:Tuple[Laz
       # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
       if not (len(unbound_st.views) == 1 and unbound_st.views[0].mask is not None and
           ShapeTracker.from_shape(unbound_st.shape).shrink(unbound_st.views[0].mask) == unbound_st.shrink(unbound_st.views[0].mask)):
-        raise RuntimeError(
+        raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+                           +colored("   - a += a.T\n", "red")+colored("   + a += a.T.contiguous()", "green"))
     return LazyOp(BufferOps.LOAD, (), MemBuffer(outputs.index(assign_targets[buf]), buf.dtype, unbound_st))
   if buf not in inputs: inputs.append(buf)
   return LazyOp(BufferOps.LOAD, (), MemBuffer(len(outputs)+inputs.index(buf), buf.dtype, unbound_st))
@@ -138,6 +144,8 @@ def _recurse_lb(buf:LazyBuffer, realizes:Dict[LazyBuffer, None], allbufs:Dict[La
        pass  # don't realize image to image casts. this is part of a larger problem
      else:
        realizes[buf.base] = None
+    # check all other pads for safe fusion
+    elif any(v.mask is not None for v in buf.st.views): simple_pads.add(buf.base)
    return _recurse_lb(buf.base, realizes, allbufs, simple_pads, children)
  # base
  allbufs[buf] = None
@@ -308,7 +316,7 @@ def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffe
   if SAVE_SCHEDULE:
     def _save():
       print(f"saving {len(SCHEDULES)} schedule graphs to", fp:=getenv("SAVE_SCHEDULE_PATH", "schedule.pkl"))
-
+      with open(fp, "wb") as f: pickle.dump(SCHEDULES, f)
     if len(SCHEDULES) == 0: atexit.register(_save)
     SCHEDULES.extend((ps.ast for ps in prescheduled.values()) if getenv("CAPTURE_AST") else [(graph, prescheduled)])
   # confirm everything was scheduled correctly
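The scheduler now raises a descriptive error when the self operand of an augmented assign is loaded through a non-contiguous view, suggesting .contiguous() as the fix. A hedged reproduction of the message's own example (the error surfaces when the kernel is scheduled, not at the += itself):

# Hedged example of the new augmented-assign error path above.
from tinygrad import Tensor

a = Tensor.rand(3, 3).realize()
try:
  a += a.T      # lazy; scheduling this kernel raises
  a.realize()
except RuntimeError as e:
  print(e)      # self operand of augmented assign must be contiguous. help: consider using .contiguous(): ...

b = Tensor.rand(3, 3).realize()
b += b.T.contiguous()   # the fix the error message suggests
b.realize()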
tinygrad/engine/search.py
CHANGED
@@ -45,7 +45,7 @@ def _time_program(p:Program, lib:bytes, var_vals, rawbufs, early_stop=None, max_
   input_bufs = [rawbufs[i] for i,_ in car.p.globals]
   for _ in range(cnt):
     if clear_l2:
-      with Context(DEBUG=0, BEAM=0,
+      with Context(DEBUG=0, BEAM=0, CAPTURING=0): Tensor.ones(1024,1024).contiguous().realize(do_update_stats=False)
     tms.append(cast(float, car(input_bufs, var_vals, wait=True))*factor)
     if early_stop is not None and early_stop < tms[-1]: break
   return tms
@@ -70,6 +70,9 @@ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) ->
     ret = None
   except TimeoutException:
     ret = None
+  except Exception as e:
+    if getenv("BEAM_STRICT_MODE"): raise e
+    ret = None
   finally:
     signal.alarm(0)
   return x[0], ret
@@ -115,7 +118,7 @@ beam_pool, BEAM_DEBUG = None, getenv("BEAM_DEBUG")
 def beam_search(lin:Linearizer, rawbufs:List[Buffer], amt:int, allow_test_size=True) -> Linearizer:
   global beam_pool
   key = {"ast": lin.ast[0].key, "amt": amt, "allow_test_size": allow_test_size, "device": lin.opts.device, "suffix": lin.opts.suffix}
-  if (val:=diskcache_get("beam_search", key)) is not None
+  if not getenv("IGNORE_BEAM_CACHE") and CACHELEVEL >= 1 and (val:=diskcache_get("beam_search", key)) is not None:
     ret = lin.copy()
     for o in val[len(lin.applied_opts):]: ret.apply_opt(o)
     return ret
@@ -123,7 +126,7 @@ def beam_search(lin:Linearizer, rawbufs:List[Buffer], amt:int, allow_test_size=T
   beam: List[Tuple[Linearizer, float]] = [(lin, float("inf"))]
   seen_libs = set()

-  default_parallel = multiprocessing.cpu_count() if lin.opts.device in {"CUDA", "
+  default_parallel = multiprocessing.cpu_count() if lin.opts.device in {"CUDA", "AMD", "NV"} else 0
   if beam_pool is None and (workers := getenv("PARALLEL", default_parallel)):
     beam_pool = multiprocessing.get_context("spawn").Pool(workers, _init_worker, (), getenv("BEAM_MAX_TASKS_PER_CHILD", 16))

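Beam search can now skip its disk cache with IGNORE_BEAM_CACHE, surface compile errors with BEAM_STRICT_MODE, and defaults to parallel compilation on CUDA, AMD and NV; all of these are read with getenv. A hedged sketch of triggering the search itself through the BEAM ContextVar:

# Hedged sketch: running a kernel under BEAM=2 makes lowering call beam_search above;
# winning opts are stored in the beam_search disk cache unless IGNORE_BEAM_CACHE/CACHELEVEL disable it.
from tinygrad import Tensor
from tinygrad.helpers import Context

a, b = Tensor.rand(256, 256), Tensor.rand(256, 256)
with Context(BEAM=2):   # search 2 candidates per step
  (a @ b).realize()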