PyPI - tinygrad - Versions diffs - 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)

tinygrad/__init__.py +1 -1
tinygrad/apps/llm.py +206 -0
tinygrad/codegen/__init__.py +116 -0
tinygrad/codegen/devectorizer.py +315 -172
tinygrad/codegen/expander.py +8 -16
tinygrad/codegen/gpudims.py +89 -0
tinygrad/codegen/linearize.py +205 -203
tinygrad/codegen/lowerer.py +92 -139
tinygrad/codegen/opt/__init__.py +38 -0
tinygrad/codegen/opt/heuristic.py +125 -0
tinygrad/codegen/opt/kernel.py +510 -0
tinygrad/{engine → codegen/opt}/search.py +51 -35
tinygrad/codegen/opt/swizzler.py +134 -0
tinygrad/codegen/opt/tc.py +127 -0
tinygrad/codegen/quantize.py +67 -0
tinygrad/device.py +122 -132
tinygrad/dtype.py +152 -35
tinygrad/engine/jit.py +81 -54
tinygrad/engine/memory.py +46 -27
tinygrad/engine/realize.py +82 -41
tinygrad/engine/schedule.py +70 -445
tinygrad/frontend/__init__.py +0 -0
tinygrad/frontend/onnx.py +1253 -0
tinygrad/frontend/torch.py +5 -0
tinygrad/gradient.py +19 -27
tinygrad/helpers.py +95 -47
tinygrad/nn/__init__.py +7 -8
tinygrad/nn/optim.py +72 -41
tinygrad/nn/state.py +37 -23
tinygrad/renderer/__init__.py +40 -60
tinygrad/renderer/cstyle.py +143 -128
tinygrad/renderer/llvmir.py +113 -62
tinygrad/renderer/ptx.py +50 -32
tinygrad/renderer/wgsl.py +27 -23
tinygrad/runtime/autogen/am/am.py +5861 -0
tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
tinygrad/runtime/autogen/comgr.py +35 -9
tinygrad/runtime/autogen/comgr_3.py +906 -0
tinygrad/runtime/autogen/cuda.py +2419 -494
tinygrad/runtime/autogen/hsa.py +57 -16
tinygrad/runtime/autogen/ib.py +7171 -0
tinygrad/runtime/autogen/io_uring.py +917 -118
tinygrad/runtime/autogen/kfd.py +748 -26
tinygrad/runtime/autogen/libc.py +613 -218
tinygrad/runtime/autogen/libusb.py +1643 -0
tinygrad/runtime/autogen/nv/nv.py +8602 -0
tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
tinygrad/runtime/autogen/opencl.py +2 -4
tinygrad/runtime/autogen/sqtt.py +1789 -0
tinygrad/runtime/autogen/vfio.py +3 -3
tinygrad/runtime/autogen/webgpu.py +273 -264
tinygrad/runtime/graph/cuda.py +3 -3
tinygrad/runtime/graph/hcq.py +68 -29
tinygrad/runtime/graph/metal.py +29 -13
tinygrad/runtime/graph/remote.py +114 -0
tinygrad/runtime/ops_amd.py +537 -320
tinygrad/runtime/ops_cpu.py +108 -7
tinygrad/runtime/ops_cuda.py +12 -14
tinygrad/runtime/ops_disk.py +13 -10
tinygrad/runtime/ops_dsp.py +47 -40
tinygrad/runtime/ops_gpu.py +13 -11
tinygrad/runtime/ops_hip.py +6 -9
tinygrad/runtime/ops_llvm.py +35 -15
tinygrad/runtime/ops_metal.py +29 -19
tinygrad/runtime/ops_npy.py +5 -3
tinygrad/runtime/ops_null.py +28 -0
tinygrad/runtime/ops_nv.py +306 -234
tinygrad/runtime/ops_python.py +62 -52
tinygrad/runtime/ops_qcom.py +28 -39
tinygrad/runtime/ops_remote.py +482 -0
tinygrad/runtime/ops_webgpu.py +28 -28
tinygrad/runtime/support/am/amdev.py +114 -249
tinygrad/runtime/support/am/ip.py +211 -172
tinygrad/runtime/support/amd.py +138 -0
tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
tinygrad/runtime/support/compiler_cuda.py +8 -11
tinygrad/runtime/support/elf.py +2 -1
tinygrad/runtime/support/hcq.py +184 -97
tinygrad/runtime/support/ib.py +172 -0
tinygrad/runtime/support/llvm.py +3 -4
tinygrad/runtime/support/memory.py +251 -0
tinygrad/runtime/support/nv/__init__.py +0 -0
tinygrad/runtime/support/nv/ip.py +581 -0
tinygrad/runtime/support/nv/nvdev.py +183 -0
tinygrad/runtime/support/system.py +170 -0
tinygrad/runtime/support/usb.py +268 -0
tinygrad/runtime/support/webgpu.py +18 -0
tinygrad/schedule/__init__.py +0 -0
tinygrad/schedule/grouper.py +119 -0
tinygrad/schedule/kernelize.py +368 -0
tinygrad/schedule/multi.py +231 -0
tinygrad/shape/shapetracker.py +40 -46
tinygrad/shape/view.py +88 -52
tinygrad/tensor.py +968 -542
tinygrad/uop/__init__.py +117 -0
tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
tinygrad/uop/mathtraits.py +169 -0
tinygrad/uop/ops.py +1021 -0
tinygrad/uop/spec.py +228 -0
tinygrad/{codegen → uop}/symbolic.py +239 -216
tinygrad/uop/upat.py +163 -0
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
tinygrad/viz/index.html +203 -403
tinygrad/viz/js/index.js +718 -0
tinygrad/viz/js/worker.js +29 -0
tinygrad/viz/serve.py +224 -102
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
tinygrad-0.11.0.dist-info/RECORD +141 -0
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
tinygrad/codegen/kernel.py +0 -693
tinygrad/engine/multi.py +0 -161
tinygrad/ops.py +0 -1003
tinygrad/runtime/ops_cloud.py +0 -220
tinygrad/runtime/support/allocator.py +0 -94
tinygrad/spec.py +0 -155
tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
tinygrad/viz/perfetto.html +0 -178
tinygrad-0.10.2.dist-info/RECORD +0 -99
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0

tinygrad/codegen/gpudims.py ADDED Viewed

@@ -0,0 +1,89 @@
+import math
+from tinygrad.uop.ops import UOp, Ops, sint, PatternMatcher, UPat, KernelInfo, ssimplify, AxisType
+from tinygrad.helpers import all_int
+from tinygrad.dtype import dtypes
+from tinygrad.shape.view import get_contraction
+from tinygrad.renderer import Renderer
+def _group_dims(dims:tuple[sint, ...], max_sizes:tuple[int, ...]):
+  # TODO: symbolic shape
+  if not all_int(dims): return dims
+  while len(dims) > len(max_sizes) or any(d > m for d,m in zip(dims, max_sizes)):
+    for i,m in enumerate(max_sizes):
+      if i < (len(dims)-1) and dims[i] * dims[i+1] <= m:
+        dims = dims[:i] + (dims[i]*dims[i+1],) + dims[i+2:]
+        break
+    else: return None
+  return dims
+def _split_dims(dims, max_sizes):
+  if all(d <= m for d,m in zip(dims, max_sizes)): return dims
+  _dims = list(dims) + [1]*(3-len(dims))
+  for i in range(len(_dims)):
+    while _dims[i] > max_sizes[i]:
+      div = next((d for d in range(2, math.ceil(math.sqrt(_dims[i])) + 1) if (_dims[i] % d) == 0), 1)
+      if div == 1: raise RuntimeError(f"cannot limit dim {dims=}, {max_sizes=}")
+      _dims[i], _dims[(i+1)%len(_dims)] = _dims[i]//div, _dims[(i+1)%len(_dims)]*div
+  return tuple(_dims[:2] if _dims[2] == 1 else _dims[0] if _dims[1:3] == [1,1] else _dims)
+def get_grouped_dims(prefix, dims:tuple[sint, ...], max_sizes:tuple[int, ...]|None, reverse=False) -> list[UOp]:
+  if reverse: dims = dims[::-1]
+  # try to group first: (a, b, c, d) -> (ab, c, d)
+  limited = (grouped if (grouped := _group_dims(dims, max_sizes)) else dims) if max_sizes is not None else dims
+  # check if grouping failed
+  if max_sizes is not None and len(limited) > len(max_sizes): raise RuntimeError(f"cannot limit dim {dims=}, {max_sizes=}")
+  # try to split up dims: (a,) -> (b, c)
+  if limited == dims: limited = _split_dims(dims, max_sizes) if max_sizes is not None else dims
+  ret = raw_idxs = [UOp(Ops.SPECIAL, dtypes.int, (), (f"{prefix}{i}", s)) for i,s in enumerate(limited)]
+  if len(limited) < len(dims):
+    ret = []
+    if (contraction:=get_contraction(dims, limited)) is None: raise AssertionError(f"get_contraction should not be None {dims=} {limited=}")
+    for idx, contraction_group in zip(raw_idxs, contraction):
+      for c in contraction_group[:-1]:
+        ret.append(idx % dims[c])
+        idx //= dims[c]
+      ret.append(idx)
+  elif len(limited) > len(dims):
+    a, b = len(limited), len(dims)
+    if a == 2 and b == 1: ret = [raw_idxs[0] * limited[1] + raw_idxs[1]]
+    if a == 3 and b == 1: ret = [raw_idxs[0] * (limited[1] * limited[2]) + raw_idxs[1] * limited[2] + raw_idxs[2]]
+    if a == 3 and b == 2: ret = [raw_idxs[0] * limited[1] + raw_idxs[1], raw_idxs[2]]
+  return ret[::-1] if reverse else ret
+def add_gpudims(ctx:Renderer, s:UOp):
+  if s.arg is None: return None
+  ki: KernelInfo = s.arg
+  global_dims = [i for i,x in enumerate(ki.axis_types) if x is AxisType.GLOBAL]
+  local_dims = [i for i,x in enumerate(ki.axis_types) if x in (AxisType.LOCAL, AxisType.GROUP_REDUCE)]
+  if not global_dims and not local_dims: return None
+  s_topo = list(s.toposort())
+  if any(x.op is Ops.SPECIAL for x in s_topo): return None
+  # get global and local shape
+  all_ranges = {x.arg%1000:x for x in s_topo if x.op is Ops.RANGE}
+  ranges = [all_ranges[r] for r in global_dims+local_dims if r in all_ranges]
+  global_shape = tuple([ssimplify(r.src[0]) for r in ranges if r.arg%1000 in global_dims])
+  local_shape = tuple([ssimplify(r.src[0]) for r in ranges if r.arg%1000 in local_dims])
+  # get the idxs
+  if ki.dont_use_locals:
+    assert not local_dims, "can't use locals if there's no local dims"
+    idxs = get_grouped_dims("idx", global_shape, ctx.global_max, reverse=True)
+  else:
+    # define indexes for GPU-like execution
+    idxs = get_grouped_dims("gidx", global_shape, ctx.global_max, reverse=True) + get_grouped_dims("lidx", local_shape, ctx.local_max)
+  # apply to multiple ranges
+  subs = {}
+  for r in s_topo:
+    if r.op is not Ops.RANGE: continue
+    try:
+      ii = (global_dims+local_dims).index(r.arg%1000)
+      if r.arg < 2000 and ki.axis_types[r.arg%1000] == AxisType.GROUP_REDUCE: continue
+      subs[r] = idxs[ii]
+    except ValueError: continue
+  return s.substitute(subs)
+pm_add_gpudims = PatternMatcher([
+  (UPat(Ops.SINK, name="s"), add_gpudims),
+])

tinygrad/codegen/linearize.py CHANGED Viewed

@@ -1,234 +1,236 @@
 from __future__ import annotations
-import collections, heapq
-from dataclasses import dataclass
-from tinygrad.ops import UOp, Ops, PatternMatcher, UPat, graph_rewrite, GroupOp
-from tinygrad.spec import type_verify
-from tinygrad.dtype import dtypes, PtrDType
-from tinygrad.helpers import dedup, flatten, partition
+import heapq
+from collections import defaultdict
+from dataclasses import dataclass, replace
+from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat, GroupOp
+from tinygrad.helpers import dedup, all_same, flatten, getenv
-DONT_PLACE_IN_BLOCK = {Ops.NAME, Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.CONST, *GroupOp.Block}
+# NOTE: any toposort should be valid here, unlike last time this isn't required, it's just for speed
+def block_reorder(lst:list[UOp]) -> list[UOp]:
+  in_this_block = set(lst)
+  local_children: defaultdict[UOp, list[UOp]] = defaultdict(list)
+  in_degree:dict[UOp, int] = {}
+  priorities:dict[UOp, int] = {}
+  # get local children and assign priorities
+  # NOTE: this requires the lst be locally toposorted
+  for u in reversed(lst):
+    in_degree[u] = 0
+    for s in u.src:
+      if s in in_this_block:
+        local_children[s].append(u)
+        in_degree[u] += 1
+    # put loads in the beginning of the block and prevent priority inversion. hack for BARRIER grouping too
+    priority = [0] + [priorities[x] for x in local_children[u]]
+    if u.op is Ops.LOAD: priority.append(-1000)
+    if u.op is Ops.BARRIER: priority.append(-1500)
+    priorities[u] = min(priority)
+  # number the uops in "ideal" order
+  nkey = {u:i for i,u in enumerate(sorted(lst, key=lambda x: (priorities[x],)+x.tuplize))}
+  # then force then to be toposorted in as close to the ideal order as possible
+  heapq.heapify(heap:=[(nkey[u],u) for u in lst if in_degree[u] == 0])
+  newlst = []
+  while heap:
+    newlst.append(u:=heapq.heappop(heap)[1])
+    for v in local_children[u]:
+      in_degree[v] -= 1
+      if in_degree[v] == 0: heapq.heappush(heap, (nkey[v],v))
+  assert len(newlst) == len(lst), f"len mismatch {len(newlst)} != {len(lst)}"
+  return newlst
+# ***** basic block *****
 def disp(y:UOp) -> str:
-  if y.op is Ops.BLOCKSTART: return "w"+disp(y.src[0])
   if y.op is Ops.IF: return f'IF{id(y)}'
   if y.op is Ops.RANGE: return str(y.arg)
   return "<NONE>"
-@dataclass(frozen=True)
+@dataclass(frozen=True, eq=False)
 class BasicBlock:
-  ctx: tuple[UOp, ...]
   lst: tuple[UOp, ...]
+  ctx: tuple[UOp, ...] = ()
   end: UOp|None = None
-  def __lt__(self, o:BasicBlock): return tuple(x.tuplize for x in self.ctx+self.lst) < tuple(x.tuplize for x in o.ctx+o.lst)
+  cnt: int = 0
+  child_ctx: tuple[UOp, ...]|None = None
+  def __lt__(self, _:BasicBlock): raise RuntimeError("no comparing basic blocks")
   def __repr__(self):
-    return f"{(str(disp(self.end))+' ') if self.end is not None else ''}"+\
-           f"{[disp(y) for y in self.ctx]} {len(self.lst)}" + "\n" + '\n'.join([str(x.op) for x in self.lst])
-def append_to_block(ctx:tuple[dict[UOp, tuple[UOp, ...]], dict[UOp, list[UOp]]], x:UOp):
-  block_ctxs, children = ctx
-  in_this_block = set(x.arg.lst)
-  # collections to build
-  new_srcs: list[UOp] = []
-  to_append: list[UOp] = []
-  old_blocks: dict[tuple[UOp, ...], UOp] = {}
-  new_blocks: dict[tuple[UOp, ...], list[UOp]] = {}
-  for u in x.src:
-    if u.op is Ops.BLOCK:
-      # merge sibling blocks. NOTE: blocks must only have one output source
-      assert u.arg.ctx not in old_blocks, "sibling should never have been created"
-      old_blocks[u.arg.ctx] = u
-    elif u.op not in DONT_PLACE_IN_BLOCK and set(children[u]).issubset(in_this_block):
-      # if it can go in blocks and all its children are in the block, we add it to the block
-      if (block_ctx:=block_ctxs[u]) == x.arg.ctx:
-        # if it's the same context, we place the UOp in this block and append the parents to its srcs
-        new_srcs.extend(u.src)
-        to_append.append(u)
+    return f"{(str(disp(self.end))+' ') if self.end is not None else ''}"+f'f{self.cnt} '+\
+           f"{[disp(y) for y in self.ctx]} {[disp(y) for y in self.child_ctx] if self.child_ctx is not None else '-'} "+\
+           f"{len(self.lst)}" + "\n" + '\n'.join([str(x.op) for x in self.lst])
+  def last_ctx(self): return self.child_ctx if self.child_ctx is not None else self.ctx
+def _sort_ctx(inp): return tuple(sorted(dedup(inp), key=lambda x: x.tuplize))
+# ***** block context *****
+@dataclass
+class BlockContext:
+  child_count: dict[UOp, int]
+  block_ctxs: dict[UOp, tuple[UOp, ...]]
+  child_ctxs: dict[UOp, tuple[UOp, ...]]
+  def last_ctx(self, u): return self.child_ctxs.get(u, self.block_ctxs[u])
+  @staticmethod
+  def from_sink(sink:UOp) -> BlockContext:
+    # get children and all block contexts
+    ctx = BlockContext({}, {}, {})
+    for u in sink.toposort():
+      this_block_ctx: list[UOp] = []
+      ctx.child_count[u] = 0
+      # get children and accumulate the last_ctx
+      for s in u.src:
+        # NOTE: if a parent appears multiple times in the src, it counts multiple times as a child
+        ctx.child_count[s] += 1
+        this_block_ctx += ctx.last_ctx(s)
+      # save the block ctx. SINK never has anything
+      ctx.block_ctxs[u] = _sort_ctx(this_block_ctx) if u.op is not Ops.SINK else ()
+      # RANGE/IF add to the next ctx
+      # STORE/ASSIGN subtract from the next ctx
+      if u.op in {Ops.RANGE, Ops.IF}: ctx.child_ctxs[u] = _sort_ctx(ctx.block_ctxs[u] + (u,))
+      elif u.op is Ops.STORE: ctx.child_ctxs[u] = tuple([y for y in ctx.block_ctxs[u] if y not in u.src])
+    return ctx
+# ***** make blocks *****
+DONT_PLACE_IN_BLOCK = {Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL, Ops.DEFINE_REG, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.CONST}
+def add_blockends(base_block:UOp, new_ctx:tuple[UOp, ...], current_ctx:tuple[UOp, ...], cnt:int=1) -> UOp:
+  ends_to_add = [z for z in new_ctx if z not in current_ctx]
+  while len(ends_to_add):
+    r:UOp = ends_to_add.pop(-1)
+    new_ctx = tuple([z for z in new_ctx if z is not r])
+    end_uop = UOp(Ops.ENDIF if r.op is Ops.IF else Ops.ENDRANGE, src=(r,))
+    base_block = UOp(Ops.BLOCKEND, src=(base_block,)*cnt, arg=BasicBlock((end_uop,), tuple(new_ctx), end=r, cnt=cnt))
+  return base_block
+def make_block_bottom_up(ctx:BlockContext, x:UOp):
+  if x.op is Ops.BLOCKSTART:
+    current_ctx, child_ctx = x.arg
+    lst = list(x.src)
+    child_count = 1
+  else:
+    current_ctx, child_count, child_ctx = ctx.block_ctxs[x], ctx.child_count[x], ctx.child_ctxs.get(x, None)
+    lst = [x]
+  # count of times we've seen this block, or a seed for a new block if we can't merge it
+  unmergable: defaultdict[UOp, int] = defaultdict(int)
+  blockseeds = defaultdict(list)
+  # add the srcs of this to the frontier
+  # NOTE: things may be in here multiple times, that's okay
+  frontier_nodes = list(flatten(y.src[::-1] for y in lst))
+  while len(frontier_nodes):
+    u = frontier_nodes.pop(0)
+    if u.op not in DONT_PLACE_IN_BLOCK and ctx.child_count[u] == unmergable[u]+1:
+      # count is correct
+      if (newctx:=ctx.block_ctxs[u]) == current_ctx:
+        # block has same context, merge it, and put the srcs on the frontier
+        lst.append(u)
+        frontier_nodes.extend(u.src[::-1])
       else:
-        # if it's a different context, we create a new block with this UOp
-        new_blocks.setdefault(block_ctx, []).append(u)
+        # block has different context, add it to blockseeds
+        blockseeds[(newctx, ctx.child_ctxs.get(u, None))].append(u)
+      del unmergable[u]
     else:
-      # otherwise, we keep it in the srcs
-      new_srcs.append(u)
-  if len(to_append) == 0 and len(new_blocks) == 0: return None
-  for rng,lst in new_blocks.items():
-    srcs = flatten(y.src for y in lst)
-    if (old_block:=old_blocks.pop(rng, None)) is not None:
-      # NOTE: order shouldn't matter here
-      srcs.extend(old_block.src)
-      lst.extend(old_block.arg.lst)
-    new_block = UOp(Ops.BLOCK, dtypes.void, tuple(dedup(srcs)), BasicBlock(rng, tuple(lst)))
-    lrng = list(rng)
-    for r in rng[::-1]:
-      if r not in x.arg.ctx and r.op is not Ops.BLOCKSTART:
-        lrng.remove(r)
-        new_block = UOp(Ops.BLOCKEND, src=(new_block,),
-                        arg=BasicBlock(tuple(lrng), (UOp(Ops.ENDIF if r.op is Ops.IF else Ops.ENDRANGE, src=(r,)),), r))
-    new_srcs.append(new_block)
-  return UOp(Ops.BLOCK, dtypes.void, tuple(dedup(list(old_blocks.values())+new_srcs)), BasicBlock(x.arg.ctx, tuple(to_append)+x.arg.lst))
-make_basic_blocks = PatternMatcher([
-  (UPat(Ops.SINK, name="x"),
-    lambda x: UOp(Ops.BLOCK, src=x.src+((UOp(Ops.NAME, arg=x.arg.name),) if x.arg is not None else ()), arg=BasicBlock((), (x,)))),
-  (UPat(Ops.BLOCK, name="x"), append_to_block),
-])
-def block_merge(ctx, x:UOp):
-  # ctx is children here
-  if x.op is Ops.BLOCKEND:
-    # if it's a BLOCKEND, see if we are done with placement. if all the children of the range are in here
-    in_this_block = set(x.arg.lst)
-    if len([y for y in ctx[x.arg.end] if y not in in_this_block]) == 0:
-      # find the parent block that has the BLOCKSTART in the ctx
-      parent_blocks = [y for y in x.src if y.op is Ops.BLOCK and UOp(Ops.BLOCKSTART, src=(x.arg.end,)) in y.arg.ctx]
-      assert len(parent_blocks) <= 1, "should never have two parent blocks"
-      if len(parent_blocks) == 1:
-        parent_block = parent_blocks[0]
-        # range needs DEFINE_ACC to be before the range (never in DEFINE_ACC for if)
-        early_ops, late_ops = partition(x.arg.lst, lambda y: y.op is Ops.DEFINE_ACC and x.arg.end in y.src)
-        return UOp(Ops.BLOCK, dtypes.void, tuple(y for y in x.src if y is not parent_block)+parent_block.src,
-                  BasicBlock(tuple(y for y in x.arg.ctx if y is not x.arg.end), tuple(early_ops)+parent_block.arg.lst+tuple(late_ops)))
-  new_srcs: list[UOp] = []
-  to_append: list[UOp] = []
-  new_ctx = x.arg.ctx
-  placed = set()
-  for u in x.src:
-    if u.op is Ops.BLOCK and (tuple(u.arg.ctx) == tuple(x.arg.ctx) or (x.arg.end is not None and x.arg.end in u.arg.ctx)):
-      # NOTE: this can't appear in srcs twice or it would be a BLOCKFORK
-      new_ctx += tuple(y for y in u.arg.ctx if y not in x.arg.ctx)
-      new_srcs.extend(u.src)
-      to_append.extend(u.arg.lst)
-    elif u.op is Ops.BLOCKFORK and x.src.count(u) == u.arg: # block fork appears # of times in srcs
-      if u not in placed:
-        new_srcs.extend(u.src)
-        placed.add(u)
-    else:
-      # keep it in srcs
-      new_srcs.append(u)
-  if len(to_append) == 0 and len(placed) == 0: return None
-  return UOp(x.op, dtypes.void, tuple(new_srcs), BasicBlock(tuple(sorted(new_ctx, key=lambda x: x.tuplize)), tuple(to_append)+x.arg.lst, x.arg.end))
-pm_block_merge = PatternMatcher([(UPat((Ops.BLOCKEND, Ops.BLOCK), name="x"), block_merge),])
-def block_finalize(block:UOp):
-  if len(block.src) == 0: return None
-  _uops = sorted(dedup(block.src), key=lambda x: x.tuplize)
-  assert all(len(x.src) == 0 and x.op not in {Ops.BLOCK, Ops.BLOCKSTART, Ops.BLOCKEND, Ops.BLOCKFORK} for x in _uops)
-  _uops += block.arg.lst
-  # strip the SINK
-  assert _uops[-1].op is Ops.SINK, "doesn't end with SINK"
-  return UOp(Ops.BLOCK, arg=BasicBlock((), tuple(_uops[:-1])))
+      # count is incorrect (or it's DONT_PLACE_IN_BLOCK), add it to unmergable
+      unmergable[u] += 1
-pm_block_finalize = PatternMatcher([(UPat(Ops.BLOCK, name="block"), block_finalize)])
+  # add unmergables to sources
+  srcs = []
+  for u,cnt in unmergable.items(): srcs += [add_blockends(u, ctx.block_ctxs[u], current_ctx, cnt=cnt)]*cnt
-# NOTE: any toposort should be valid here, unlike last time this isn't required, it's just for speed
-def block_reorder(in_block:UOp):
-  in_this_block = set(in_block.arg.lst)
-  local_children: collections.defaultdict[UOp, list[UOp]] = collections.defaultdict(list)
-  in_degree: collections.defaultdict[UOp, int] = collections.defaultdict(int)
-  priorities:dict[UOp, int] = {}
-  # get local children and assign priorities
-  for u in reversed(in_block.arg.lst):
-    for s in u.src:
-      if s in in_this_block:
-        local_children[s].append(u)
-        in_degree[u] += 1
-    # put loads in the beginning of the block and prevent priority inversion
-    priorities[u] = min([-1000 if u.op is Ops.LOAD else 0] + [priorities[x] for x in local_children[u]])
-  # placement queue
-  queue:list[tuple[int, tuple, UOp]] = []
-  def push(u:UOp): heapq.heappush(queue, (priorities[u], u.tuplize, u))
+  # add blockseeds, with blockends as needed
+  for (new_ctx, new_child_ctx), v in blockseeds.items():
+    base_block = UOp(Ops.BLOCKSTART, src=tuple(v), arg=(new_ctx, new_child_ctx))
+    srcs.append(add_blockends(base_block, new_ctx, current_ctx))
-  # place the first ones that don't have deps
-  for u in in_block.arg.lst:
-    if u not in in_degree: push(u)
+  lst = lst[::-1]
+  if getenv("BLOCK_REORDER", 1): lst = block_reorder(lst)
+  bb = BasicBlock(tuple(lst), ctx=current_ctx, cnt=child_count, child_ctx=child_ctx)
+  return UOp(Ops.BLOCK, src=tuple(srcs), arg=bb)
-  newlst = []
-  while queue:
-    _,_,x = heapq.heappop(queue)
-    newlst.append(x)
-    for u in local_children[x]:
-      in_degree[u] -= 1
-      if in_degree[u] == 0: push(u)
-  assert len(newlst) == len(in_block.arg.lst), f"len mismatch {len(newlst)} != {len(in_block.arg.lst)}"
-  return in_block.replace(arg=BasicBlock(in_block.arg.ctx, tuple(newlst)))
-def linearize_uop(sink:UOp, skip_check:bool=not __debug__) -> list[UOp]:
-  assert sink.op is Ops.SINK, f"sink isn't sink, it's {sink.op}"
-  # get children and all block contexts
-  temp_block_ctxs: dict[UOp, list[UOp]] = {}
-  children: dict[UOp, list[UOp]] = {}
-  for u in sink.toposort:
-    this_block_ctx: list[UOp] = []
-    for s in u.src:
-      # save children
-      children.setdefault(s, []).append(u)
-      # compute block ctx
-      if s.op in {Ops.RANGE, Ops.IF}: this_block_ctx.append(s)
-      # don't flow (fully) through assign and store
-      elif s.op is Ops.STORE:
-        # ugh, deal with non-reduce locals. probably wrong
-        if isinstance(s.src[0].dtype, PtrDType) and s.src[0].dtype.local:
-          idx_context, store_context = temp_block_ctxs[s.src[0]], temp_block_ctxs[s]
-          this_block_ctx += [x for x in store_context if x not in idx_context and x.op is Ops.RANGE]
-      elif s.op is Ops.ASSIGN:
-        # flow though assign, but remove the ranges used in the assign
-        assert s.src[0].op is Ops.DEFINE_ACC
-        this_block_ctx += [x for x in temp_block_ctxs[s.src[1]] if x not in s.src[0].src[1:]]
-      else:
-        # flow though everything else
-        this_block_ctx += temp_block_ctxs[s]
-    temp_block_ctxs[u] = sorted(dedup(this_block_ctx), key=lambda x: x.tuplize)
-  # make final block_ctxs, add BLOCKSTART to block_ctxs for IF and RANGE
-  block_ctxs: dict[UOp, tuple[UOp, ...]] = {}
-  for u in sink.toposort:
-    block_ctxs[u] = ((UOp(Ops.BLOCKSTART, src=(u,)),) + tuple(temp_block_ctxs[u])) if u.op in {Ops.IF, Ops.RANGE} else tuple(temp_block_ctxs[u])
-  # TODO: there's probably a clever way to remove this while loop
-  while 1:
-    sink = graph_rewrite(sink, make_basic_blocks, ctx=(block_ctxs, children))
-    # add BLOCKFORK (slow!)
-    block_parent_count = collections.Counter(flatten([x.src for x in sink.toposort if x.op is Ops.BLOCK]))
-    non_block_parents = set(flatten([x.src for x in sink.toposort if x.op is not Ops.BLOCK]))
-    forks = {u:UOp(Ops.BLOCKFORK, src=(UOp(Ops.BLOCK, src=u.src, arg=BasicBlock(block_ctxs[u], (u,))),), arg=child_count)
-      for u,child_count in block_parent_count.items() if u.op not in DONT_PLACE_IN_BLOCK and child_count > 1 and u not in non_block_parents}
+block_create = PatternMatcher([
+  (UPat(GroupOp.All-DONT_PLACE_IN_BLOCK.union({Ops.BLOCK, Ops.BLOCKEND}), name="x"), make_block_bottom_up),
+])
-    if not len(forks): break
-    sink = sink.substitute(forks)
+# ***** blockend merging ****
-  # combine matching BLOCKENDS
+def merge_blockends(sink:UOp) -> UOp|None:
+  # only run on the final BLOCK with the SINK in it
+  if sink.arg.lst[-1].op is not Ops.SINK: return None
+  # combine matching BLOCKENDS, the keys of this dictionary are the RANGE UOps, values are the BLOCKENDs
   blockends_to_arg: dict[UOp, list[UOp]] = {}
-  for be in sink.toposort:
+  for be in sink.toposort():
     if be.op is Ops.BLOCKEND: blockends_to_arg.setdefault(be.arg.end, []).append(be)
   new_forks = {}
   for k,v in blockends_to_arg.items():
     # NOTE: if any BLOCKEND is the parent of any other with the same arg, this algo fails
     if len(v) > 1:
-      out = UOp(Ops.BLOCKFORK, src=(UOp(Ops.BLOCKEND, src=tuple(flatten(x.src for x in v)),
-                                        arg=BasicBlock(tuple(dedup(flatten([y.arg.ctx for y in v]))), v[0].arg.lst, k)),), arg=len(v))
+      bb = BasicBlock(v[0].arg.lst, _sort_ctx(flatten([y.arg.ctx for y in v])), k, cnt=sum(y.arg.cnt for y in v))
+      out = UOp(Ops.BLOCKEND, src=tuple(flatten([x.src for x in v])), arg=bb)
+      # NOTE: bb.ctx != u.arg.ctx can cause problems here
       for u in v: new_forks[u] = out
-  sink = sink.substitute(new_forks)
-  # reorder ops in block for speed
-  sink = sink.substitute({u:newu for u in sink.toposort if u.op is Ops.BLOCK and (newu:=block_reorder(u)) is not u})
+  if len(new_forks) == 0: return None
+  return sink.substitute(new_forks)
+pm_blockend_merge = PatternMatcher([(UPat(Ops.BLOCK, name="sink"), merge_blockends)])
+# ***** block merging ****
+def merge_block(x:UOp):
+  unmergable_blocks, mergable_blocks = [], []
+  mergable_dict: defaultdict[UOp, int] = defaultdict(int)
+  for y in x.src:
+    if y.op is Ops.BLOCK and x.op is Ops.BLOCK and x.arg.ctx == y.arg.ctx: mergable_dict[y] += 1
+    elif y.op is Ops.BLOCK and x.op is Ops.BLOCKEND and x.arg.end in y.arg.ctx: mergable_dict[y] += 1
+    else: unmergable_blocks.append(y)
+  for k,v in mergable_dict.items():
+    if v == k.arg.cnt: mergable_blocks.append(k)
+    else: unmergable_blocks.extend([k]*v)
+  if len(mergable_blocks) == 0: return None
+  del mergable_dict
+  # create the block
+  arg = replace(x.arg, lst=tuple(flatten([y.arg.lst for y in mergable_blocks]))+x.arg.lst)
+  return UOp(x.op, src=tuple(flatten([y.src for y in mergable_blocks])+unmergable_blocks), arg=arg)
+def remove_blockend(x:UOp):
+  # if there's any remaining blocks that need to go in this BLOCKEND, we don't remove it
+  if any(x.arg.end in y.arg.ctx for y in x.src if y.op in {Ops.BLOCK, Ops.BLOCKEND}): return None
+  if (parent_blocks := [y for y in x.src if y.op is Ops.BLOCK and y.arg.child_ctx is not None and x.arg.end in y.arg.child_ctx]):
+    assert all_same(parent_blocks), f"should never have two parent blocks (has {len(parent_blocks)})"
+    parent_block = parent_blocks[0]
+    assert len(parent_blocks) == parent_block.arg.cnt
+    # NOTE: DEFINE_ACC doesn't have to be handled in any special way
+    late_ops = list(x.arg.lst)
+    # NOTE: we have to add a barrier at the start if barrier is used in the range
+    if x.op is Ops.BLOCKEND and any(y.op is Ops.BARRIER for y in late_ops) and late_ops[-1].op is Ops.ENDRANGE:
+      late_ops = [UOp(Ops.BARRIER)] + late_ops
+    # peephole opt, remove any BARRIERs next to each other
+    for i in range(len(late_ops)-1):
+      if late_ops[i].op is Ops.BARRIER and late_ops[i+1].op is Ops.BARRIER: late_ops[i+1] = UOp(Ops.NOOP)
+    arg = BasicBlock(parent_block.arg.lst+tuple(late_ops), tuple([y for y in x.arg.ctx if y is not x.arg.end]), cnt=x.arg.cnt)
+    return UOp(Ops.BLOCK, src=tuple(y for y in x.src if y is not parent_block)+parent_block.src, arg=arg)
+block_merge = PatternMatcher([
+  (UPat((Ops.BLOCK, Ops.BLOCKEND), name="x"), merge_block),
+  (UPat(Ops.BLOCKEND, name="x"), remove_blockend),
+])
-  # final rewrite to merge all blocks into one
-  sink = graph_rewrite(sink, pm_block_merge, ctx=children)
+# ****** finalize ******
-  # there should just be one block left, with a few parents with 0 srcs (now done in a rewriter)
-  sink = graph_rewrite(sink, pm_block_finalize)
+def finalize(sink:UOp) -> UOp:
+  if sink.op is not Ops.BLOCK or not all(x.op in DONT_PLACE_IN_BLOCK for x in sink.src):
+    raise RuntimeError(f"linearize failure {sink.op} {[x.op for x in sink.src if x.op not in DONT_PLACE_IN_BLOCK]}")
-  # sanity checks (NOTE: these can cause things to be skipped in BEAM)
-  if not skip_check: type_verify(sink.arg.lst)
+  # place the early things
+  lst = sorted(dedup(sink.src), key=lambda x: x.tuplize) + list(sink.arg.lst)
+  return UOp(Ops.BLOCKFINAL, arg=BasicBlock(tuple(lst)))
-  # return the list. TODO: refactor to return the UOp
-  return list(sink.arg.lst)
+pm_finalize = PatternMatcher([(UPat(Ops.BLOCK, name="sink"), finalize)])

tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl