tinygrad 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
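
The two hunks reproduced below appear to be the new tinygrad/engine/schedule.py (+370) and tinygrad/engine/search.py (+199) from the table above. The table also shows the broader 0.9.1 reorganization: tinygrad/jit.py, tinygrad/realize.py, tinygrad/graph.py, tinygrad/features/search.py and tinygrad/features/multi.py are replaced by tinygrad/engine/* (and tinygrad/multi.py), and tinygrad/mlops.py is renamed to tinygrad/function.py. A rough sketch of how internal import paths move between the two versions; the exact set of top-level re-exports from tinygrad/__init__.py is an assumption, not something visible in this diff:

# 0.8.0-era internal imports (removed files above)
#   from tinygrad.jit import TinyJit
#   from tinygrad.realize import run_schedule
#   from tinygrad.features.search import beam_search

# 0.9.1 locations, following the file moves in the table
from tinygrad.engine.jit import TinyJit
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.search import beam_search
from tinygrad.engine.schedule import create_schedule, memory_planner

# the package root is assumed to keep re-exporting the user-facing API
from tinygrad import Tensor, TinyJit, dtypes, Device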
@@ -0,0 +1,370 @@
+ import sys, pickle, atexit
+ from collections import defaultdict, deque
+ from dataclasses import dataclass
+ from typing import Tuple, List, Dict, Optional, Set, DefaultDict, Union, get_args
+ from tinygrad.ops import LoadOps, BufferOps, LazyOp, ReduceOps, ConstBuffer, MemBuffer, UNSAFE_PAD_OPS, UnaryOps
+ from tinygrad.engine.graph import log_lazybuffer, realized_lazybuffer
+ from tinygrad.helpers import GRAPH, DEBUG, MULTIOUTPUT, SAVE_SCHEDULE, GlobalCounters, colored, prod, dedup, all_int, merge_dicts, getenv
+ from tinygrad.shape.symbolic import Variable
+ from tinygrad.dtype import ConstType, ImageDType, dtypes, DType
+ from tinygrad.lazy import LazyBuffer
+ from tinygrad.shape.shapetracker import ShapeTracker
+ from tinygrad.device import Buffer
+
+ # creation can recurse a lot
+ sys.setrecursionlimit(10000)
+
+ # optionally log the ops to disk
+ logops = open(getenv("LOGOPS", ""), "a") if getenv("LOGOPS", "") else None
+
+ # *** ScheduleItem return type ***
+
+ @dataclass(frozen=True)
+ class ScheduleItem:
+   ast: Tuple[LazyOp, ...]
+   bufs: Tuple[Buffer, ...]
+   @property
+   def outputs(self) -> Tuple[Buffer, ...]:
+     """Read/write or write only buffers in the schedule."""
+     return self.bufs[:len(self.ast)]
+   @property
+   def inputs(self) -> Tuple[Buffer, ...]:
+     """Read only buffers in the schedule."""
+     return self.bufs[len(self.ast):]
+
+ # *** DAG transformation: List[LazyBuffer] -> ScheduleItem ***
+
+ # TODO: it's unfortunate this needs to exist, but because of ASSIGN, we have to retain the LazyBuffer structure until post toposort
+ @dataclass(frozen=True)
+ class _LBScheduleItem:
+   ast: Tuple[LazyOp, ...]
+   outputs: Tuple[LazyBuffer, ...]
+   inputs: Tuple[LazyBuffer, ...]
+   var_vals: Dict[Variable, int]
+
+ def _recursive_lazyop(buf:LazyBuffer, inputs:List[LazyBuffer], outputs:Tuple[LazyBuffer, ...], var_vals:Dict[Variable, int], st:ShapeTracker,
+                       realizes:Dict[LazyBuffer, None], assign_targets:Dict[LazyBuffer, LazyBuffer], cache) -> LazyOp:
+   """recursively create a lazyop"""
+   if (buf, st) in cache: return cache[(buf, st)]
+   if buf != buf.base:
+     st = buf.st + st
+     buf = buf.base
+   # all buffers here are base now
+   assert buf.op is not None
+
+   # consts are always fused and generated
+   if buf.op is LoadOps.CONST:
+     unbound_st, st_var_vals = st.simplify().unbind()
+     var_vals.update(st_var_vals)
+     if isinstance(buf.arg, Variable):
+       val, var_val = buf.arg.unbind()
+       var_vals.__setitem__(val, var_val)
+     else:
+       assert isinstance(buf.arg, get_args(ConstType)), f"cannot create ConstBuffer with value {buf.arg}"
+       val = buf.arg
+     return LazyOp(BufferOps.CONST, (), ConstBuffer(val, buf.dtype, unbound_st))
+
+   # if we aren't fusing it, it's a load and we add it to the inputs
+   if buf.realized is not None or (buf in realizes and buf not in outputs):
+     unbound_st, st_var_vals = st.simplify().unbind()
+     var_vals.update(st_var_vals)
+     if buf in assign_targets:
+       # can only assign to contiguous read+write buffer
+       if not unbound_st.contiguous:
+         # we also allow masked views. if it has a single view and it's equal when you shrink a contig, it's fine
+         if not (len(unbound_st.views) == 1 and unbound_st.views[0].mask is not None and
+                 ShapeTracker.from_shape(unbound_st.shape).shrink(unbound_st.views[0].mask) == unbound_st.shrink(unbound_st.views[0].mask)):
+           raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+                              +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+       return LazyOp(BufferOps.LOAD, (), MemBuffer(outputs.index(assign_targets[buf]), buf.dtype, unbound_st))
+     if buf not in inputs: inputs.append(buf)
+     return LazyOp(BufferOps.LOAD, (), MemBuffer(len(outputs)+inputs.index(buf), buf.dtype, unbound_st))
+
+   # if a CONTIGUOUS or ASSIGN made it all the way here, just skip it
+   if buf.op is LoadOps.CONTIGUOUS:
+     assert buf in outputs
+     return _recursive_lazyop(buf.srcs[0], inputs, outputs, var_vals, st, realizes, assign_targets, cache)
+   if buf.op is LoadOps.ASSIGN:
+     assert buf in outputs
+     assert buf.srcs[1].base is buf.srcs[1], "assign must be to base"
+     assert buf.srcs[1].realized is not None, f"assign must be already realized to schedule {buf.srcs[1]}"
+     return _recursive_lazyop(buf.srcs[0], inputs, outputs, var_vals, st, realizes, assign_targets, cache)
+
+   # if it's a reduce, we have to change the shapetracker
+   if buf.op in ReduceOps:
+     assert st.contiguous, "ReduceOps late fusion must be contiguous"
+     st = ShapeTracker.from_shape(buf.srcs[0].shape)
+
+   # otherwise we fuse it like normal
+   cache[(buf, st)] = ret = \
+     LazyOp(buf.op, tuple(_recursive_lazyop(x, inputs, outputs, var_vals, st, realizes, assign_targets, cache) for x in buf.srcs), buf.arg)
+   return ret
+
+ def _schedule_group(outs:Tuple[LazyBuffer, ...], realizes:Dict[LazyBuffer, None], reduce_for_op: Dict[LazyBuffer, LazyBuffer]) -> _LBScheduleItem:
+   """create a schedule item from a list of outputs"""
+   inputs: List[LazyBuffer] = []
+   ast: List[LazyOp] = []
+   var_vals: Dict[Variable, int] = merge_dicts([out.st.var_vals.copy() for out in outs])
+   # single output AST
+   if (op:=(out:=outs[0]).op) in {LoadOps.CUSTOM, LoadOps.COPY, LoadOps.EMPTY, LoadOps.VIEW}:
+     assert len(outs) == 1, f"can't schedule a group of {op}"
+     inputs = [x.base for x in out.srcs]
+     if getenv("USE_COPY_KERNEL") and op is LoadOps.COPY and out.device.split(":")[0] == out.srcs[0].device.split(":")[0]:
+       rd = LazyOp(BufferOps.LOAD, (), MemBuffer(1, dtypes.uint8, st:=ShapeTracker.from_shape((out.arg,))))
+       ast = [LazyOp(BufferOps.STORE, (rd,), MemBuffer(0, dtypes.uint8, st))]
+     else: ast = [LazyOp(op, (), out.arg)]
+   # multi output AST
+   else:
+     assign_targets = {x.srcs[1]:x for x in outs if x.op is LoadOps.ASSIGN}
+     for i, out in enumerate(outs):
+       output_st = ShapeTracker.from_shape(reduce_for_op[out].shape if out in reduce_for_op else out.shape)
+       output_view = out.arg[0] if out.op is LoadOps.ASSIGN and out.arg else output_st
+       lop = _recursive_lazyop(out, inputs, outs, var_vals, output_st, realizes, assign_targets, cache={})
+       output_view, vv = output_view.simplify().unbind()
+       if vv: var_vals.update(vv)
+       ast.append(LazyOp(BufferOps.STORE, (lop, ), MemBuffer(i, out.dtype, output_view)))
+   return _LBScheduleItem(tuple(ast), outs, tuple(inputs), var_vals)
+
+ # *** DAG creation: decide which LazyBuffers should realize ***
+
+ def _recurse_lb(buf:LazyBuffer, realizes:Dict[LazyBuffer, None], allbufs:Dict[LazyBuffer, None],
+                 simple_pads:Set[LazyBuffer], children:DefaultDict[LazyBuffer, Dict[LazyBuffer, None]], scheduled=False):
+   """recursively search the entire graph for all LazyBuffers, insert realizes after expands"""
+   if buf in allbufs or buf.base.realized is not None: return
+   if GRAPH: log_lazybuffer(buf, scheduled)
+   # view
+   if buf.base != buf:
+     # fuse some pads
+     if len(buf.st.views) == 1 and buf.st.views[-1].mask is not None and all_int(buf.base.st.shape) and \
+         prod(buf.base.st.shape) >= prod([y-x for x,y in buf.st.views[-1].mask]):
+       simple_pads.add(buf.base)
+     # realize all expands
+     elif prod(buf.base.st.shape) < prod(buf.st.shape):
+       if buf.base.op is UnaryOps.CAST and isinstance(buf.base.srcs[0].dtype, ImageDType) and isinstance(buf.base.arg, ImageDType):
+         pass # don't realize image to image casts. this is part of a larger problem
+       else:
+         realizes[buf.base] = None
+     # check all other pads for safe fusion
+     elif any(v.mask is not None for v in buf.st.views): simple_pads.add(buf.base)
+     return _recurse_lb(buf.base, realizes, allbufs, simple_pads, children)
+   # base
+   allbufs[buf] = None
+   if buf.forced_realize: realizes[buf] = None
+   if buf.op in LoadOps: realizes[buf.base] = None
+   if buf.op is LoadOps.COPY:
+     assert buf.srcs[0].st.contiguous and buf.srcs[0].size == buf.srcs[0].base.size, "can only copy contig"
+     realizes[buf.srcs[0].base] = None
+   if buf.op is LoadOps.VIEW: realizes[buf.srcs[0].base] = None
+   for x in buf.srcs:
+     children[x.base][buf] = None
+     _recurse_lb(x, realizes, allbufs, simple_pads, children)
+
+ def _is_padding_okay(buf:LazyBuffer, realizes:Dict[LazyBuffer, None]) -> bool:
+   if buf in realizes or buf.realized is not None: return True
+   # NOTE: this broke to_image_idx and coder with JIT
+   if buf.op in UNSAFE_PAD_OPS: return False
+   return all(_is_padding_okay(x.base, realizes) for x in buf.srcs)
+
+ def _recursive_group(tr:LazyBuffer, st:ShapeTracker, r:LazyBuffer, children:DefaultDict[LazyBuffer, Dict[LazyBuffer, None]],
+                      realizes:Dict[LazyBuffer, None], reduce_for_op:Dict[LazyBuffer, LazyBuffer], group:Set[LazyBuffer]):
+   """recursively search the LazyBuffer for groupable children, realize the LazyBuffer if a child can't group"""
+   if tr in realizes:
+     # can only fuse contiguous
+     # max one reduceop per kernel
+     if not st.contiguous or st.size != r.st.size or tr in reduce_for_op: group.add(r)
+     return group.add(tr)
+   for tr_next in children[tr]:
+     if tr_next.realized is None:
+       # max one reduceop per kernel
+       if tr_next.op in ReduceOps: return group.add(r)
+       # can only fuse contiguous
+       if len(st_childs:=dedup(s for s in tr_next.srcs if s.base == tr)) > 1: return group.add(r)
+       _recursive_group(tr_next, st+st_childs[0].st, r, children, realizes, reduce_for_op, group)
+
+ def _graph_schedule(outs:List[LazyBuffer], seen:Set[LazyBuffer]) -> Tuple[DefaultDict[LazyBuffer, List[LazyBuffer]], DefaultDict[LazyBuffer, int],
+                                                                           Dict[LazyBuffer, _LBScheduleItem]]:
+   """create a graph for realizing the outputs"""
+   # start by just realizing the buffers passed in
+   realizes: Dict[LazyBuffer, None] = {x.base:None for x in outs if x.base.realized is None}
+   allbufs: Dict[LazyBuffer, None] = {}
+   simple_pads: Set[LazyBuffer] = set()
+   children: DefaultDict[LazyBuffer, Dict[LazyBuffer, None]] = defaultdict(dict)
+   for out in outs: _recurse_lb(out.base, realizes, allbufs, simple_pads, children, scheduled=True)
+   assign_targets = {x.srcs[1]:x for x in realizes if x.op is LoadOps.ASSIGN and x not in seen and x.realized is None}
+
+   # check if we have to realize pads
+   for p in simple_pads:
+     if not _is_padding_okay(p, realizes):
+       realizes[p] = None
+
+   # find all reduces, and pair them to a elementwise op. if they can't be cleanly paired, force realize the reduce (or a contig child)
+   reduce_for_op: Dict[LazyBuffer, LazyBuffer] = {}
+   for r in allbufs:
+     if r.op not in ReduceOps or r in realizes: continue
+
+     group: Set[LazyBuffer] = set()
+     _recursive_group(r, r.st, r, children, realizes, reduce_for_op, group)
+     # max one reduceop per kernel
+     can_chase = all(tr not in reduce_for_op for tr in group)
+     # TODO: forced_realize exists because the scheduler is incapable of checking for self-contained DAGs
+     forced_realize = r in group
+     if not forced_realize and len(group) > 1:
+       # create a multi output kernel if the LazyBufferss can cleanly group
+       rc_parents, rc_children = deque(group), deque(group)
+       while rc_parents and not forced_realize:
+         # max one reduceop per kernel
+         if (p:=rc_parents.pop()).op in ReduceOps: forced_realize = True
+         else: rc_parents.extend(x.base for x in p.srcs if x.base.realized is None and x.base is not r)
+       # search descendants of the reduceop that can cleanly group
+       realized_descendants: Set[LazyBuffer] = set()
+       while rc_children and not forced_realize:
+         if (c:=rc_children.pop()).op in ReduceOps or not c.st.contiguous or c.st.size != r.st.size or c in reduce_for_op:
+           realized_descendants.clear()
+           break
+         if c in realizes and c not in group: realized_descendants.add(c)
+         rc_children.extend(x for x in children[c] if x.realized is None and x.device == r.device)
+       group.update(realized_descendants)
+     # can only fuse assign if no other assign_target is used in the kernel
+     if not forced_realize and any(x.op is LoadOps.ASSIGN for x in group):
+       parents = deque((r, *group))
+       while parents and not forced_realize:
+         if (p:=parents.pop().base).realized or p in realizes:
+           if p in assign_targets and assign_targets[p] not in group: forced_realize, can_chase = True, False
+           continue
+         parents.extend(p.srcs)
+     if forced_realize:
+       tr = r
+       if can_chase:
+         # can chase this down to contiguous children
+         st = tr.st
+         while len(children[tr]) == 1:
+           tr_next = next(iter(children[tr]))
+           st_childs = dedup(s for s in tr_next.srcs if s.base is tr)
+           if len(st_childs) > 1: break
+           if st.size != st_childs[0].st.size: break
+           st = st + st_childs[0].st
+           if not st.contiguous or tr_next.op in ReduceOps: break
+           tr = tr_next
+         # don't cast to higher size before store (tr cannot be realized if forced_realize)
+         if tr.op is UnaryOps.CAST and tr.arg.itemsize > tr.srcs[0].dtype.itemsize:
+           tr = tr.srcs[0].base
+       reduce_for_op[tr] = r
+       realizes[tr] = None
+     else: reduce_for_op.update((tr, r) for tr in group)
+
+   output_groups: DefaultDict[LazyBuffer, List[LazyBuffer]] = defaultdict(list)
+   for buf in realizes:
+     if buf.realized is not None or buf.op is LoadOps.CONST or buf in seen: continue
+     output_groups[reduce_for_op[buf] if buf in reduce_for_op and MULTIOUTPUT else buf].append(buf)
+
+     # make things that can't be images not images
+     if isinstance(buf.dtype, ImageDType) and (prod(buf.shape) != prod(buf.dtype.shape) or
+                                               not any(buf.shape[x]%4 == 0 for x in buf.st.unit_stride_axes())):
+       if DEBUG >= 2: print(f"forcing image {buf.dtype} with shape {buf.shape} to float32")
+       buf.dtype = dtypes.float32
+       # hack the underlying buffer too
+       if buf.base is buf:
+         assert not hasattr(buf.buffer, '_buf'), "can't fixup allocated buffer"
+         buf.buffer.dtype = dtypes.float32
+         buf.buffer.options = None
+
+   # preschedule all buffers in realizes
+   prescheduled = {group[0]:_schedule_group(tuple(group), realizes, reduce_for_op) for group in output_groups.values()}
+   schedule_targets = {out:ps for ps in prescheduled.values() for out in ps.outputs}
+
+   graph: DefaultDict[LazyBuffer, List[LazyBuffer]] = defaultdict(list)
+   in_degree: DefaultDict[LazyBuffer, int] = defaultdict(int)
+   for key, lsi in prescheduled.items():
+     if key not in in_degree: in_degree[key] = 0
+     # realize outputs after all parents are realized
+     scheduled_parents = set(schedule_targets[x].outputs[0] for x in lsi.inputs if x in schedule_targets)
+     for x in scheduled_parents:
+       graph[x].append(key)
+       in_degree[key] += 1
+     # realize outputs before a parent is assigned to
+     parents_assigns = set(schedule_targets[assign_targets[x]].outputs[0] for x in lsi.inputs if x in assign_targets)
+     for assign in parents_assigns:
+       graph[key].append(assign)
+       in_degree[assign] += 1
+
+   return graph, in_degree, prescheduled
+
+ # *** DAG ordering: breadth first search ***
+
+ SCHEDULES: List = []
+ def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffer]]=None) -> Tuple[List[ScheduleItem], Dict[Variable, int]]:
+   if seen is None: seen = set()
+   graph, in_degree, prescheduled = _graph_schedule(outs, seen)
+   queue = deque(si for key, si in prescheduled.items() if in_degree[key] == 0)
+   schedule: List[ScheduleItem] = []
+   var_vals: Dict[Variable, int] = {}
+   kernel_number = GlobalCounters.kernel_count
+   while queue:
+     ps = queue.popleft()
+     for buf in ps.outputs: seen.add(buf)
+     if GRAPH:
+       kernel_number += 1
+       for out in ps.outputs: realized_lazybuffer(out, kernel_number)
+     var_vals = merge_dicts([var_vals, ps.var_vals])
+     for out in ps.outputs: del out.srcs # can only schedule once
+     schedule.append(si:=ScheduleItem(ps.ast, tuple(x.buffer for x in (ps.outputs+ps.inputs) if x.size != 0)))
+     if logops and si.ast[0].op not in LoadOps and not any(i.device.startswith("DISK:") for i in si.inputs): logops.write(str(si.ast)+"\n")
+     for x in graph[ps.outputs[0]]:
+       in_degree[x] -= 1
+       if in_degree[x] == 0: queue.append(prescheduled[x])
+
+   if SAVE_SCHEDULE:
+     def _save():
+       print(f"saving {len(SCHEDULES)} schedule graphs to", fp:=getenv("SAVE_SCHEDULE_PATH", "schedule.pkl"))
+       with open(fp, "wb") as f: pickle.dump(SCHEDULES, f)
+     if len(SCHEDULES) == 0: atexit.register(_save)
+     SCHEDULES.extend((ps.ast for ps in prescheduled.values()) if getenv("CAPTURE_AST") else [(graph, prescheduled)])
+   # confirm everything was scheduled correctly
+   if not all(degree == 0 for degree in in_degree.values()) or len(prescheduled) != len(schedule):
+     raise RuntimeError(f"cycle detected in graph, prescheduled {len(prescheduled)} but only scheduled {len(schedule)}")
+   if DEBUG >= 1 and len(schedule) >= 10: print(f"scheduled {len(schedule)} kernels")
+   return schedule, var_vals
+
+ def create_schedule(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffer]]=None) -> List[ScheduleItem]:
+   schedule, var_vals = create_schedule_with_vars(outs, seen)
+   assert len(var_vals) == 0
+   return schedule
+
+ # *** memory planning ***
+
+ def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], debug_prefix="") -> Dict[Buffer, Buffer]:
+   if getenv("NO_MEMORY_PLANNER"): return {}
+   last_appearance = {}
+   for i,u in enumerate(buffers):
+     for buf in u: last_appearance[buf] = i
+
+   # LRU algorithm
+   assigned: Dict[Buffer, Buffer] = {}
+   local_cache: DefaultDict[Tuple[str, int, DType], List[Buffer]] = defaultdict(list)
+
+   def handle_buffer(buf):
+     key = (buf.device, buf.size, buf.dtype)
+     if buf not in assigned:
+       if len(ll:=local_cache[key]): assigned[buf] = ll.pop()
+       else: assigned[buf] = Buffer(*key)
+     if i == last_appearance[buf]:
+       if assigned[buf] not in local_cache[key]: local_cache[key].append(assigned[buf])
+
+   for i,u in enumerate(buffers):
+     for buf in u:
+       # all unallocated unparented buffers are fair game to replace
+       if buf.is_allocated() or buf.lb_refcount > 0: continue
+       # handle view buffers
+       if buf._base is not None:
+         assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf._base, buf._base), offset=buf.offset)
+       else:
+         handle_buffer(buf)
+
+   if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
+     print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
+           f"{len(ak)} -> {len(av)} bufs")
+   return assigned
+
+ def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
+   assigned = _internal_memory_planner([si.bufs for si in schedule])
+   return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs)) for si in schedule]
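
For orientation, the file above is the scheduler's whole pipeline: _graph_schedule decides which LazyBuffers must realize and groups them into _LBScheduleItems, create_schedule_with_vars toposorts them into ScheduleItems (create_schedule is the variable-free wrapper), and memory_planner rewrites the scheduled buffers so allocations get reused. A minimal usage sketch follows; Tensor.lazydata being a single LazyBuffer and run_schedule living in tinygrad.engine.realize are assumptions about the surrounding 0.9.1 API, not shown in this hunk:

from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule, memory_planner
from tinygrad.engine.realize import run_schedule  # assumed location of the executor

t = (Tensor.rand(16, 16) + 1).sum()       # builds a lazy graph, nothing runs yet
schedule = create_schedule([t.lazydata])  # DAG -> topologically ordered ScheduleItems
schedule = memory_planner(schedule)       # optional buffer-reuse pass
run_schedule(schedule)                    # compile and execute each item, roughly what Tensor.realize() does under the hood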
@@ -0,0 +1,199 @@
+ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
+ import itertools, functools, random, math, time, multiprocessing, traceback, signal
+ from collections import defaultdict
+ from dataclasses import replace
+ from tinygrad.device import Device, Buffer, Compiler
+ from tinygrad.ops import MemBuffer
+ from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
+ from tinygrad.dtype import ImageDType
+ from tinygrad.codegen.linearizer import Linearizer
+ from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError
+ from tinygrad.codegen.uops import UOpGraph
+ from tinygrad.tensor import Tensor
+ from tinygrad.shape.symbolic import sym_infer
+ from tinygrad.engine.realize import CompiledRunner
+ from tinygrad.renderer import Program
+
+ actions = [Opt(op=OptOps.UPCAST, axis=axis, amt=amt) for amt in [0,2,3,4,5,7] for axis in range(6)]
+ actions += [Opt(op=OptOps.UNROLL, axis=axis, amt=amt) for amt in [0,4,7] for axis in range(5)]
+ actions += [Opt(op=OptOps.LOCAL, axis=axis, amt=amt) for amt in [2,3,4,8,13,16,29] for axis in range(5)]
+ actions += [Opt(op=OptOps.GROUPTOP, axis=axis, amt=amt) for amt in [13,16,28,29,32,49,64,256] for axis in range(3)]
+ actions += [Opt(op=OptOps.GROUP, axis=axis, amt=amt) for amt in [0,4,8,16] for axis in range(3)]
+ if getenv("BEAM_PADTO", 1): actions += [Opt(op=OptOps.PADTO, axis=axis, amt=amt) for amt in [32] for axis in range(7)]
+ actions += [Opt(op=OptOps.LOCAL, axis=0, amt=32), Opt(op=OptOps.UPCASTMID, axis=1, amt=4), Opt(op=OptOps.TC, axis=0, amt=0)]
+ actions += [Opt(op=OptOps.TC, axis=axis, amt=getenv("TC_OPT", 2)) for axis in range(9)] # covers resnet kernels (3 global * 3 reduce)
+ if getenv("NOLOCALS"): actions += [Opt(op=OptOps.NOLOCALS)]
+
+ def _get_test_global_size(global_size, max_global_size, var_vals):
+   test_global_size, factor = [sym_infer(sz, var_vals) for sz in global_size], 1
+   while prod(test_global_size) > max_global_size:
+     for j in range(len(global_size)-1,-1,-1):
+       if test_global_size[j] > 16:
+         test_global_size[j] //= 2
+         factor *= 2
+         break
+   return test_global_size, factor
+
+ def _time_program(p:Program, lib:bytes, var_vals, rawbufs, early_stop=None, max_global_size=65536, clear_l2=False, cnt=3, name="test"):
+   factor = 1
+   if p.global_size is not None and max_global_size is not None:
+     global_size, factor = _get_test_global_size(p.global_size, max_global_size, var_vals)
+     p = replace(p, global_size=global_size)
+   try: car = CompiledRunner(p, precompiled=lib)
+   except AssertionError: return [math.inf] * cnt
+   tms = []
+   input_bufs = [rawbufs[i] for i,_ in car.p.globals]
+   for _ in range(cnt):
+     if clear_l2:
+       with Context(DEBUG=0, BEAM=0, CAPTURING=0): Tensor.ones(1024,1024).contiguous().realize(do_update_stats=False)
+     tms.append(cast(float, car(input_bufs, var_vals, wait=True))*factor)
+     if early_stop is not None and early_stop < tms[-1]: break
+   return tms
+
+ class TimeoutException(Exception): pass
+ def timeout_handler(signum, frame): raise TimeoutException()
+
+ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) -> Tuple[int, Optional[Tuple[Program, bytes, float]]]:
+   signal.signal(signal.SIGALRM, timeout_handler)
+   # set timeout
+   signal.alarm(getenv("BEAM_TIMEOUT_SEC", 10))
+   try:
+     x[1].linearize()
+     if len(x[1].uops.uops) >= getenv("BEAM_UOPS_MAX", 3000) > 0: raise RuntimeError("too many uops")
+     p = x[1].to_program()
+     st = time.perf_counter()
+     prog = compiler.compile(p.src)
+     et = time.perf_counter() - st
+     ret = (p, prog, et)
+   except RuntimeError:
+     if DEBUG >= 4: traceback.print_exc()
+     ret = None
+   except TimeoutException:
+     ret = None
+   except Exception as e:
+     if getenv("BEAM_STRICT_MODE"): raise e
+     ret = None
+   finally:
+     signal.alarm(0)
+   return x[0], ret
+
+ # workers should ignore ctrl c
+ def _init_worker(): signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+ def _ensure_buffer_alloc(bufs:List[Buffer]) -> List[Buffer]: return [buf.ensure_allocated() for buf in bufs]
+
+ # *** external API ***
+
+ # get (scrap) buffers for timing the linearizer
+ def bufs_from_lin(lin:Linearizer, allocate:bool=True) -> List[Buffer]:
+   bufsts:DefaultDict[int, List[MemBuffer]] = defaultdict(list)
+   for x in lin.membufs: bufsts[x.idx].append(x)
+   rawbufs:List[Optional[Buffer]] = [None]*len(bufsts)
+   for k,lx in bufsts.items():
+     buf_size = prod(lx[0].dtype.shape) if isinstance(lx[0].dtype, ImageDType) else max(y.st.real_size() for y in lx)
+     if buf_size == 0: buf_size = 1 # create a size 1 buffer if no cell is accessed in kernel. # TODO: remove from kernel input in this case.
+     rawbufs[k] = Buffer(lin.opts.device, buf_size, lx[0].dtype).allocate() if allocate else Buffer(lin.opts.device, buf_size, lx[0].dtype)
+   assert all(r is not None for r in rawbufs)
+   return cast(List[Buffer], rawbufs)
+
+ # get dictionary of all possible actions
+ def get_linearizer_actions(lin:Linearizer, include_0=True) -> Dict[int, Linearizer]:
+   acted_lins, max_up, max_lcl = {0:lin} if include_0 else {}, getenv("BEAM_UPCAST_MAX", 256), getenv("BEAM_LOCAL_MAX", 1024)
+   for i,a in enumerate(actions):
+     if a.axis is not None and a.op is not OptOps.TC:
+       if ((ax:=a.real_axis(lin)) >= lin.shape_len) or (lin.full_shape[ax] == a.amt and Opt(a.op, ax, 0) in actions): continue
+     lin2 = lin.copy()
+     try:
+       lin2.apply_opt(a)
+       up, lcl, tc_up = 1, 1, prod(tc.dims)//prod([x[1] for x in tc.threads]) if (tc:=lin2.tensor_core) else 1
+       for s,c in zip(lin2.full_shape, lin2.colors()):
+         if c in {"magenta", "yellow"}: up *= s
+         elif c in {"cyan", "green", "white"}: lcl *= s
+       if up//tc_up > max_up or lcl > max_lcl: continue
+       acted_lins[i+1] = lin2
+     except KernelOptError: pass
+   return acted_lins
+
+ beam_pool, BEAM_DEBUG = None, getenv("BEAM_DEBUG")
+ def beam_search(lin:Linearizer, rawbufs:List[Buffer], amt:int, allow_test_size=True) -> Linearizer:
+   global beam_pool
+   key = {"ast": lin.ast[0].key, "amt": amt, "allow_test_size": allow_test_size, "device": lin.opts.device, "suffix": lin.opts.suffix}
+   if not getenv("IGNORE_BEAM_CACHE") and CACHELEVEL >= 1 and (val:=diskcache_get("beam_search", key)) is not None:
+     ret = lin.copy()
+     for o in val[len(lin.applied_opts):]: ret.apply_opt(o)
+     return ret
+
+   beam: List[Tuple[Linearizer, float]] = [(lin, float("inf"))]
+   seen_libs = set()
+
+   default_parallel = multiprocessing.cpu_count() if lin.opts.device in {"CUDA", "AMD", "NV"} else 0
+   if beam_pool is None and (workers := getenv("PARALLEL", default_parallel)):
+     beam_pool = multiprocessing.get_context("spawn").Pool(workers, _init_worker, (), getenv("BEAM_MAX_TASKS_PER_CHILD", 16))
+
+   min_progress = getenv("BEAM_MIN_PROGRESS", 0.01)/1e6
+   if BEAM_DEBUG: print(f"BEAM_SEARCH:\n{lin.ast}")
+   if DEBUG >= 2: print(f" 0.00s: from 1 -> 1 actions {lin.colored_shape()}")
+
+   try:
+     rawbufs = _ensure_buffer_alloc(rawbufs)
+     var_vals = {k:(k.max+k.min)//2 for k in lin.ast[0].vars()}
+     exiting, st = False, time.perf_counter()
+     dev = Device[lin.opts.device]
+     while not exiting:
+       acted_lins: List[Linearizer] = flatten([get_linearizer_actions(lin, include_0=False).values() for lin,_ in beam])
+       timed_lins: List[Tuple[Linearizer, float]] = []
+       _compile_fn = functools.partial(_try_compile_linearized_w_idx, compiler=dev.compiler)
+       for i,proc in (map(_compile_fn, enumerate(acted_lins)) if beam_pool is None else beam_pool.imap_unordered(_compile_fn, enumerate(acted_lins))):
+         if proc is None: continue
+         p, lib, compile_et = proc
+         if lib in seen_libs: continue
+         #print(acted_lins[i].colored_shape(), acted_lins[i].applied_opts) # for debugging BEAMs that segfault
+         seen_libs.add(lib)
+         try: tms = _time_program(p, lib, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0)
+         except RuntimeError: continue # for runtime issues
+         timed_lins.append((acted_lins[i], min(tms)))
+         if BEAM_DEBUG > 1: print(f"{time.perf_counter() - st:7.2f}s: {i:5d} {len(cast(UOpGraph, p.uops).uops):5d} uops {compile_et*1e6:12.2f} us compile/{timed_lins[-1][1]*1e6:12.2f} us run {len(timed_lins):4d}/{len(acted_lins):4d} {timed_lins[-1][0].colored_shape()}") # noqa: E501
+         elif DEBUG >= 2: print(f"\r{time.perf_counter() - st:7.2f}s: {timed_lins[-1][1]*1e6:12.2f} us {len(timed_lins):4d}/{len(acted_lins):4d} {timed_lins[-1][0].colored_shape()}\033[K", end="") # noqa: E501
+
+       # done
+       opts = sorted(timed_lins, key=lambda x: x[1])
+       exiting = len(opts) == 0 or (opts[0][1] < min_progress) or (len(beam) > 0 and ((beam[0][1]-opts[0][1]) < min_progress))
+       if not exiting: beam = opts[:amt]
+       elif len(opts) > 0 and opts[0][1] < beam[0][1]: beam = opts[:1]
+       if DEBUG >= 2: print(f"\r{time.perf_counter() - st:7.2f}s:", colored(f"{beam[0][1]*1e6:12.2f} us", "green" if exiting else None), f"from {len(acted_lins):3d} -> {len(opts):3d} actions\033[K", beam[0][0].colored_shape()) # noqa: E501
+   except KeyboardInterrupt as e:
+     if beam_pool is not None: beam_pool.terminate()
+     raise e
+
+   if CACHELEVEL >= 1: diskcache_put("beam_search", key, beam[0][0].applied_opts)
+   if BEAM_DEBUG: print(f"BEAM_SEARCH: final tm={beam[0][1]*1e6:0.2f} us, applied_opts={beam[0][0].applied_opts}")
+   return beam[0][0]
+
+ def optimize_local_size(clprg:Callable, global_size:List[int], rawbufs:List[Buffer]) -> List[int]:
+   test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate(), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs
+   MAX_WORKGROUP = 1024
+   local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size]
+   local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
+   def try_exec(local_size):
+     try: return clprg(*[x._buf for x in test_rawbuffers], global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], local_size=local_size, wait=True) # noqa: E501
+     except Exception: return float('inf')
+   ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])
+   assert not math.isinf(ret[0]), "all optimize_local_size exec failed"
+   return ret[1]
+
+ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True, max_global_size=65536, cnt=3, disable_cache=False, clear_l2=False) -> float: # noqa: E501
+   key = {"ast": lin.ast[0].key, "opts": str(lin.applied_opts), "allow_test_size": allow_test_size,
+          "max_global_size": max_global_size, "clear_l2": clear_l2, "device": lin.opts.device, "suffix": lin.opts.suffix}
+   if not disable_cache and CACHELEVEL >= 2 and (val:=diskcache_get("time_linearizer", key)) is not None: return min(val)
+
+   dev = Device[lin.opts.device]
+   assert dev.compiler is not None
+
+   rawbufs = _ensure_buffer_alloc(rawbufs)
+   var_vals = {k:(k.max+k.min)//2 for k in lin.ast[0].vars()}
+   p = lin.to_program()
+   tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs,
+                       max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name))
+
+   if CACHELEVEL >= 2: diskcache_put("time_linearizer", key, tms)
+   return min(tms)
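
The search functions above are what the BEAM kernel-tuning path uses: get_linearizer_actions enumerates legal Opts for a kernel, _time_program and time_linearizer benchmark a candidate, and beam_search keeps the best amt candidates per round. They can also be driven by hand on one kernel. A sketch under stated assumptions: the Linearizer(*ast, opts=...) constructor, the Device[...].renderer attribute, and the last ScheduleItem being the compute kernel are assumptions about the wider 0.9.1 API, not shown in this hunk:

from tinygrad import Tensor, Device
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.search import bufs_from_lin, beam_search, time_linearizer

out = Tensor.empty(64, 64) @ Tensor.empty(64, 64)                # a single GEMM
si = create_schedule([out.lazydata])[-1]                         # assumed: last item is the GEMM kernel
lin = Linearizer(*si.ast, opts=Device[Device.DEFAULT].renderer)  # assumed constructor and renderer arg

rawbufs = bufs_from_lin(lin)               # scratch buffers sized from the kernel's MemBuffers
baseline = time_linearizer(lin, rawbufs)   # seconds for the unoptimized kernel
best = beam_search(lin, rawbufs, amt=4)    # beam width 4 over the actions list defined above
print(best.applied_opts, time_linearizer(best, rawbufs))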