tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (72)
  1. tinygrad/codegen/kernel.py +114 -172
  2. tinygrad/codegen/linearize.py +211 -81
  3. tinygrad/codegen/lowerer.py +30 -35
  4. tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
  5. tinygrad/codegen/transcendental.py +12 -13
  6. tinygrad/device.py +170 -47
  7. tinygrad/dtype.py +28 -26
  8. tinygrad/engine/jit.py +80 -63
  9. tinygrad/engine/memory.py +4 -5
  10. tinygrad/engine/multi.py +162 -0
  11. tinygrad/engine/realize.py +58 -107
  12. tinygrad/engine/schedule.py +381 -314
  13. tinygrad/engine/search.py +40 -44
  14. tinygrad/gradient.py +70 -0
  15. tinygrad/helpers.py +77 -58
  16. tinygrad/nn/__init__.py +30 -32
  17. tinygrad/nn/datasets.py +1 -2
  18. tinygrad/nn/optim.py +22 -26
  19. tinygrad/nn/state.py +89 -64
  20. tinygrad/ops.py +562 -446
  21. tinygrad/renderer/__init__.py +79 -36
  22. tinygrad/renderer/cstyle.py +70 -84
  23. tinygrad/renderer/llvmir.py +32 -20
  24. tinygrad/renderer/ptx.py +79 -99
  25. tinygrad/renderer/wgsl.py +87 -0
  26. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  27. tinygrad/runtime/autogen/comgr.py +2 -0
  28. tinygrad/runtime/autogen/kfd.py +4 -3
  29. tinygrad/runtime/autogen/kgsl.py +1 -1
  30. tinygrad/runtime/autogen/libpciaccess.py +2023 -0
  31. tinygrad/runtime/autogen/llvm.py +11379 -0
  32. tinygrad/runtime/autogen/vfio.py +891 -0
  33. tinygrad/runtime/graph/cuda.py +8 -9
  34. tinygrad/runtime/graph/hcq.py +84 -79
  35. tinygrad/runtime/graph/metal.py +19 -21
  36. tinygrad/runtime/ops_amd.py +488 -327
  37. tinygrad/runtime/ops_clang.py +15 -28
  38. tinygrad/runtime/ops_cloud.py +34 -34
  39. tinygrad/runtime/ops_cuda.py +30 -27
  40. tinygrad/runtime/ops_disk.py +62 -63
  41. tinygrad/runtime/ops_dsp.py +129 -38
  42. tinygrad/runtime/ops_gpu.py +30 -30
  43. tinygrad/runtime/ops_hip.py +29 -31
  44. tinygrad/runtime/ops_llvm.py +45 -40
  45. tinygrad/runtime/ops_metal.py +93 -73
  46. tinygrad/runtime/ops_npy.py +2 -2
  47. tinygrad/runtime/ops_nv.py +232 -270
  48. tinygrad/runtime/ops_python.py +51 -46
  49. tinygrad/runtime/ops_qcom.py +129 -157
  50. tinygrad/runtime/ops_webgpu.py +63 -0
  51. tinygrad/runtime/support/allocator.py +94 -0
  52. tinygrad/runtime/support/am/__init__.py +0 -0
  53. tinygrad/runtime/support/am/amdev.py +384 -0
  54. tinygrad/runtime/support/am/ip.py +463 -0
  55. tinygrad/runtime/support/compiler_cuda.py +4 -2
  56. tinygrad/runtime/support/elf.py +26 -4
  57. tinygrad/runtime/support/hcq.py +254 -324
  58. tinygrad/runtime/support/llvm.py +32 -0
  59. tinygrad/shape/shapetracker.py +84 -53
  60. tinygrad/shape/view.py +103 -138
  61. tinygrad/spec.py +154 -0
  62. tinygrad/tensor.py +744 -496
  63. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
  64. tinygrad-0.10.1.dist-info/RECORD +86 -0
  65. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
  66. tinygrad/engine/lazy.py +0 -228
  67. tinygrad/function.py +0 -212
  68. tinygrad/multi.py +0 -177
  69. tinygrad/runtime/graph/clang.py +0 -39
  70. tinygrad-0.10.0.dist-info/RECORD +0 -77
  71. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
  72. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,92 +1,222 @@
-from typing import List, Set, Dict, Tuple
-import functools, heapq
-from tinygrad.ops import type_verify, END_FOR_UOP, UOp, Ops, GroupOp
-from tinygrad.dtype import dtypes
-from tinygrad.helpers import DEBUG
-
-def get_children_dfs(u:UOp, children:Dict[UOp, List[UOp]], srcs:Dict[UOp, Dict[UOp, None]], in_degree:Dict[UOp, int]):
-  if u in children: return srcs[u]
-  srcs[u] = {}
-  children[u] = []
-  for x in u.src:
-    srcs[u].update(get_children_dfs(x, children, srcs, in_degree))
-    if x.op is Ops.RANGE and x.arg[1]: srcs[u][x] = None
-    children[x].append(u)
-  in_degree[u] = len(u.src)
-  return srcs[u]
-
-def linearize_uop(sink:UOp, skip_check:bool=not __debug__) -> List[UOp]:
-  assert sink.op is Ops.SINK, f"sink isn't sink, it's {sink.op}"
-  # filter nodes that don't link to a sink
-  # BFS toposort
-  children: Dict[UOp, List[UOp]] = {}
-  range_srcs: Dict[UOp, Dict[UOp, None]] = {}
-  in_degree: Dict[UOp, int] = {}
-  get_children_dfs(sink, children, range_srcs, in_degree)
-
-  @functools.lru_cache(None)
-  def get_recursive_children(x:UOp, end:Ops, include_self=False) -> Set[UOp]:
-    if x.op is Ops.SINK: return set()
-    return set.union({x} if include_self else set(), *([get_recursive_children(u, end, True) for u in children[x] if x.op is not end]))
-
-  # scope children impact the toposort and END* insertion
-  scope_children = {p:get_recursive_children(p, END_FOR_UOP[p.op][0]) for p in reversed(in_degree) if p.op in END_FOR_UOP}
-  range_phi = {r:[p for p in scope_children[r] if p.op is Ops.ASSIGN] for r in scope_children if r.op is Ops.RANGE}
-
-  # assign priorities
-  def get_priority(u:UOp):
-    priority = 0
-    # prefer ranges that depend on the least number of independent ranges
-    if u.op is Ops.RANGE and u.arg[1]:
-      priority += u.arg[0]
-      for p in range_phi[u]:
-        priority += 10000*len([r for r in range_srcs[p] if not any(i in range_phi[u] for i in range_phi[r])])
-    elif u.op is Ops.CONST:
-      # place consts first here, they don't do anything and it can cause issues with DEFINE_ACC
-      priority -= 100000000000
+from __future__ import annotations
+import collections, heapq
+from dataclasses import dataclass
+from tinygrad.ops import UOp, Ops, PatternMatcher, UPat, graph_rewrite, GroupOp
+from tinygrad.spec import type_verify
+from tinygrad.dtype import dtypes, PtrDType
+from tinygrad.helpers import dedup, flatten, partition
+
+DONT_PLACE_IN_BLOCK = {Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.CONST, *GroupOp.Block}
+
+def disp(y:UOp) -> str:
+  if y.op is Ops.BLOCKSTART: return "w"+disp(y.src[0])
+  if y.op is Ops.IF: return f'IF{id(y)}'
+  if y.op is Ops.RANGE: return str(y.arg)
+  return "<NONE>"
+
+@dataclass(frozen=True)
+class BasicBlock:
+  ctx: tuple[UOp, ...]
+  lst: tuple[UOp, ...]
+  end: UOp|None = None
+  def __lt__(self, o:BasicBlock): return tuple(x.tuplize for x in self.ctx+self.lst) < tuple(x.tuplize for x in o.ctx+o.lst)
+  def __repr__(self):
+    return f"{(str(disp(self.end))+' ') if self.end is not None else ''}"+\
+           f"{[disp(y) for y in self.ctx]} {len(self.lst)}" + "\n" + '\n'.join([str(x.op) for x in self.lst])
+
+def append_to_block(ctx:tuple[dict[UOp, tuple[UOp, ...]], dict[UOp, list[UOp]]], x:UOp):
+  block_ctxs, children = ctx
+  in_this_block = set(x.arg.lst)
+
+  # collections to build
+  new_srcs: list[UOp] = []
+  to_append: list[UOp] = []
+  old_blocks: dict[tuple[UOp, ...], UOp] = {}
+  new_blocks: dict[tuple[UOp, ...], list[UOp]] = {}
+
+  for u in x.src:
+    if u.op is Ops.BLOCK:
+      # merge sibling blocks. NOTE: blocks must only have one output source
+      assert u.arg.ctx not in old_blocks, "sibling should never have been created"
+      old_blocks[u.arg.ctx] = u
+    elif u.op not in DONT_PLACE_IN_BLOCK and set(children[u]).issubset(in_this_block):
+      # if it can go in blocks and all its children are in the block, we add it to the block
+      if (block_ctx:=block_ctxs[u]) == x.arg.ctx:
+        # if it's the same context, we place the UOp in this block and append the parents to its srcs
+        new_srcs.extend(u.src)
+        to_append.append(u)
+      else:
+        # if it's a different context, we create a new block with this UOp
+        new_blocks.setdefault(block_ctx, []).append(u)
     else:
-      # prefer uops that are loop children
-      priority -= sum([(l.arg[0]+1) + 1000*l.arg[1] for l,ss in scope_children.items() if l.op is Ops.RANGE and u in ss])
-    if u.op is Ops.IF and len(u.src) == 1: priority += 10000000 # if penalty
-    return priority
-  priorities:Dict[UOp, int] = {u:get_priority(u) for u in children}
-
-  # prevent priority inversion
-  @functools.lru_cache(None)
-  def fix_priority(u:UOp, lowest_priority):
-    if u.op in {Ops.CAST, Ops.BITCAST, *GroupOp.ALU, Ops.VECTORIZE, Ops.GEP, Ops.SPECIAL, Ops.DEFINE_LOCAL, Ops.LOAD}:
-      priorities[u] = min(priorities[u], lowest_priority)
-      if u.op is Ops.LOAD: priorities[u] += 100 # load penalty (here)
-    for x in u.src: fix_priority(x, priorities[u])
-  fix_priority(sink, 0)
-
-  # NOTE: the compare should never make it all the way to u
-  queue:List[Tuple[int, Tuple, UOp]] = []
+      # otherwise, we keep it in the srcs
+      new_srcs.append(u)
+  if len(to_append) == 0 and len(new_blocks) == 0: return None
+
+  for rng,lst in new_blocks.items():
+    srcs = flatten(y.src for y in lst)
+    if (old_block:=old_blocks.pop(rng, None)) is not None:
+      # NOTE: order shouldn't matter here
+      srcs.extend(old_block.src)
+      lst.extend(old_block.arg.lst)
+    new_block = UOp(Ops.BLOCK, dtypes.void, tuple(dedup(srcs)), BasicBlock(rng, tuple(lst)))
+    lrng = list(rng)
+    for r in rng[::-1]:
+      if r not in x.arg.ctx and r.op is not Ops.BLOCKSTART:
+        lrng.remove(r)
+        new_block = UOp(Ops.BLOCKEND, src=(new_block,),
+                        arg=BasicBlock(tuple(lrng), (UOp(Ops.ENDIF if r.op is Ops.IF else Ops.ENDRANGE, src=(r,)),), r))
+    new_srcs.append(new_block)
+  return UOp(Ops.BLOCK, dtypes.void, tuple(dedup(list(old_blocks.values())+new_srcs)), BasicBlock(x.arg.ctx, tuple(to_append)+x.arg.lst))
+
+make_basic_blocks = PatternMatcher([
+  (UPat(Ops.SINK, name="x"), lambda x: UOp(Ops.BLOCK, src=x.src, arg=BasicBlock((), (x,)))),
+  (UPat(Ops.BLOCK, name="x"), append_to_block),
+])
+
+def block_merge(ctx, x:UOp):
+  # ctx is children here
+  if x.op is Ops.BLOCKEND:
+    # if it's a BLOCKEND, see if we are done with placement. if all the children of the range are in here
+    in_this_block = set(x.arg.lst)
+    if len([y for y in ctx[x.arg.end] if y not in in_this_block]) == 0:
+      # find the parent block that has the BLOCKSTART in the ctx
+      parent_blocks = [y for y in x.src if y.op is Ops.BLOCK and UOp(Ops.BLOCKSTART, src=(x.arg.end,)) in y.arg.ctx]
+      assert len(parent_blocks) <= 1, "should never have two parent blocks"
+      if len(parent_blocks) == 1:
+        parent_block = parent_blocks[0]
+        # range needs DEFINE_ACC to be before the range (never in DEFINE_ACC for if)
+        early_ops, late_ops = partition(x.arg.lst, lambda y: y.op is Ops.DEFINE_ACC and x.arg.end in y.src)
+        return UOp(Ops.BLOCK, dtypes.void, tuple(y for y in x.src if y is not parent_block)+parent_block.src,
+                   BasicBlock(tuple(y for y in x.arg.ctx if y is not x.arg.end), tuple(early_ops)+parent_block.arg.lst+tuple(late_ops)))
+
+  new_srcs: list[UOp] = []
+  to_append: list[UOp] = []
+  new_ctx = x.arg.ctx
+  placed = set()
+  for u in x.src:
+    if u.op is Ops.BLOCK and (tuple(u.arg.ctx) == tuple(x.arg.ctx) or (x.arg.end is not None and x.arg.end in u.arg.ctx)):
+      # NOTE: this can't appear in srcs twice or it would be a BLOCKFORK
+      new_ctx += tuple(y for y in u.arg.ctx if y not in x.arg.ctx)
+      new_srcs.extend(u.src)
+      to_append.extend(u.arg.lst)
+    elif u.op is Ops.BLOCKFORK and x.src.count(u) == u.arg: # block fork appears # of times in srcs
+      if u not in placed:
+        new_srcs.extend(u.src)
+        placed.add(u)
+    else:
+      # keep it in srcs
+      new_srcs.append(u)
+  if len(to_append) == 0 and len(placed) == 0: return None
+  return UOp(x.op, dtypes.void, tuple(new_srcs), BasicBlock(tuple(sorted(new_ctx, key=lambda x: x.tuplize)), tuple(to_append)+x.arg.lst, x.arg.end))
+
+pm_block_merge = PatternMatcher([(UPat((Ops.BLOCKEND, Ops.BLOCK), name="x"), block_merge),])
+
+# NOTE: any toposort should be valid here, unlike last time this isn't required, it's just for speed
+def block_reorder(in_block:UOp):
+  in_this_block = set(in_block.arg.lst)
+  local_children: collections.defaultdict[UOp, list[UOp]] = collections.defaultdict(list)
+  in_degree: collections.defaultdict[UOp, int] = collections.defaultdict(int)
+  priorities:dict[UOp, int] = {}
+
+  # get local children and assign priorities
+  for u in reversed(in_block.arg.lst):
+    for s in u.src:
+      if s in in_this_block:
+        local_children[s].append(u)
+        in_degree[u] += 1
+    # put loads in the beginning of the block and prevent priority inversion
+    priorities[u] = min([-1000 if u.op is Ops.LOAD else 0] + [priorities[x] for x in local_children[u]])
+
+  # placement queue
+  queue:list[tuple[int, tuple, UOp]] = []
   def push(u:UOp): heapq.heappush(queue, (priorities[u], u.tuplize, u))
 
-  for u in children:
-    if in_degree[u] == 0: push(u)
+  # place the first ones that don't have deps
+  for u in in_block.arg.lst:
+    if u not in in_degree: push(u)
 
-  scope_end: Dict[UOp, UOp] = {}
-  _uops: List[UOp] = []
+  newlst = []
   while queue:
-    p,_,x = heapq.heappop(queue)
-    if DEBUG >= 7: print(f"{p:5d}", x.op, x.dtype, x.arg)
-    if x in scope_children: scope_end[x] = x
-    if x.op is Ops.DEFINE_ACC:
-      idx = min([_uops.index(l) for l in x.src if l.op is Ops.RANGE])
-      _uops.insert(idx, x)
-    else: _uops.append(x)
-    for u, ss in scope_children.items():
-      if x in ss:
-        ss.remove(x)
-        if len(ss) == 0: scope_end[u] = x
-    for u in children[x]:
+    _,_,x = heapq.heappop(queue)
+    newlst.append(x)
+    for u in local_children[x]:
       in_degree[u] -= 1
       if in_degree[u] == 0: push(u)
 
-  # end scopes in toposort order
-  for u, x in scope_end.items(): _uops.insert(_uops.index(x)+1, UOp(END_FOR_UOP[u.op][1], dtypes.void, (u,)))
+  assert len(newlst) == len(in_block.arg.lst), f"len mismatch {len(newlst)} != {len(in_block.arg.lst)}"
+  return in_block.replace(arg=BasicBlock(in_block.arg.ctx, tuple(newlst)))
+
+def linearize_uop(sink:UOp, skip_check:bool=not __debug__) -> list[UOp]:
+  assert sink.op is Ops.SINK, f"sink isn't sink, it's {sink.op}"
+
+  # get children and all block contexts
+  temp_block_ctxs: dict[UOp, list[UOp]] = {}
+  children: dict[UOp, list[UOp]] = {}
+  for u in sink.toposort:
+    this_block_ctx: list[UOp] = []
+    for s in u.src:
+      # save children
+      children.setdefault(s, []).append(u)
+      # compute block ctx
+      if s.op in {Ops.RANGE, Ops.IF}: this_block_ctx.append(s)
+      # don't flow (fully) through assign and store
+      elif s.op is Ops.STORE:
+        # ugh, deal with non-reduce locals. probably wrong
+        if isinstance(s.src[0].dtype, PtrDType) and s.src[0].dtype.local:
+          idx_context, store_context = temp_block_ctxs[s.src[0]], temp_block_ctxs[s]
+          this_block_ctx += [x for x in store_context if x not in idx_context and x.op is Ops.RANGE]
+      elif s.op is Ops.ASSIGN:
+        # flow though assign, but remove the ranges used in the assign
+        assert s.src[0].op is Ops.DEFINE_ACC
+        this_block_ctx += [x for x in temp_block_ctxs[s.src[1]] if x not in s.src[0].src[1:]]
+      else:
+        # flow though everything else
+        this_block_ctx += temp_block_ctxs[s]
+    temp_block_ctxs[u] = sorted(dedup(this_block_ctx), key=lambda x: x.tuplize)
+
+  # make final block_ctxs, add BLOCKSTART to block_ctxs for IF and RANGE
+  block_ctxs: dict[UOp, tuple[UOp, ...]] = {}
+  for u in sink.toposort:
+    block_ctxs[u] = ((UOp(Ops.BLOCKSTART, src=(u,)),) + tuple(temp_block_ctxs[u])) if u.op in {Ops.IF, Ops.RANGE} else tuple(temp_block_ctxs[u])
+
+  # TODO: there's probably a clever way to remove this while loop
+  while 1:
+    sink = graph_rewrite(sink, make_basic_blocks, ctx=(block_ctxs, children))
+
+    # add BLOCKFORK (slow!)
+    block_parent_count = collections.Counter(flatten([x.src for x in sink.toposort if x.op is Ops.BLOCK]))
+    non_block_parents = set(flatten([x.src for x in sink.toposort if x.op is not Ops.BLOCK]))
+    forks = {u:UOp(Ops.BLOCKFORK, src=(UOp(Ops.BLOCK, src=u.src, arg=BasicBlock(block_ctxs[u], (u,))),), arg=child_count)
+      for u,child_count in block_parent_count.items() if u.op not in DONT_PLACE_IN_BLOCK and child_count > 1 and u not in non_block_parents}
+
+    if not len(forks): break
+    sink = sink.substitute(forks)
+
+  # combine matching BLOCKENDS
+  blockends_to_arg: dict[UOp, list[UOp]] = {}
+  for be in sink.toposort:
+    if be.op is Ops.BLOCKEND: blockends_to_arg.setdefault(be.arg.end, []).append(be)
+  new_forks = {}
+  for k,v in blockends_to_arg.items():
+    # NOTE: if any BLOCKEND is the parent of any other with the same arg, this algo fails
+    if len(v) > 1:
+      out = UOp(Ops.BLOCKFORK, src=(UOp(Ops.BLOCKEND, src=tuple(flatten(x.src for x in v)),
+                                        arg=BasicBlock(tuple(dedup(flatten([y.arg.ctx for y in v]))), v[0].arg.lst, k)),), arg=len(v))
+      for u in v: new_forks[u] = out
+  sink = sink.substitute(new_forks)
+
+  # reorder ops in block for speed
+  sink = sink.substitute({u:newu for u in sink.toposort if u.op is Ops.BLOCK and (newu:=block_reorder(u)) is not u})
+
+  # final rewrite to merge all blocks into one
+  sink = graph_rewrite(sink, pm_block_merge, ctx=children)
+
+  # there should just be one block left, with a few parents with 0 srcs
+  assert sink.op is Ops.BLOCK
+  _uops = sorted(dedup(sink.src), key=lambda x: x.tuplize)
+  assert all(len(x.src) == 0 and x.op not in {Ops.BLOCK, Ops.BLOCKSTART, Ops.BLOCKEND, Ops.BLOCKFORK} for x in _uops)
+  _uops += sink.arg.lst
 
   # sanity checks (NOTE: these can cause things to be skipped in BEAM)
   if not skip_check: type_verify(_uops)
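
Note: the new block_reorder above replaces the old global get_priority/fix_priority scheme with a per-block heap-driven topological sort: loads get a negative priority, and each node inherits the minimum priority of its children so a cheap parent cannot delay an expensive subtree. A minimal standalone sketch of that idea, using plain strings instead of UOps and the same illustrative -1000 load bias (reorder, edges and is_load are made-up names, not tinygrad APIs):

import heapq
from collections import defaultdict

def reorder(nodes, edges, is_load):
  # nodes is given in a valid topological order; edges are (src, dst) dependency pairs
  children, in_degree = defaultdict(list), defaultdict(int)
  for s, d in edges:
    children[s].append(d)
    in_degree[d] += 1
  # priority = min over the node itself and its children, loads biased to the front
  priority = {}
  for n in reversed(nodes):
    priority[n] = min([-1000 if is_load(n) else 0] + [priority[c] for c in children[n]])
  queue = [(priority[n], n) for n in nodes if in_degree[n] == 0]
  heapq.heapify(queue)
  out = []
  while queue:
    _, n = heapq.heappop(queue)
    out.append(n)
    for c in children[n]:
      in_degree[c] -= 1
      if in_degree[c] == 0: heapq.heappush(queue, (priority[c], c))
  assert len(out) == len(nodes)
  return out

# toy graph: a const "a" and a load "ld" both feed "add", which feeds "store"
print(reorder(["a", "ld", "add", "store"], [("a", "add"), ("ld", "add"), ("add", "store")], lambda n: n == "ld"))

Running it prints ['ld', 'a', 'add', 'store']: the load is pulled ahead of the independent const while the dependency order is preserved.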
@@ -1,17 +1,14 @@
 # the job of the lowerer is to do indexing
-from __future__ import annotations
 import functools, itertools, operator
 from dataclasses import dataclass
-from typing import List, Tuple, cast, Optional
-from tinygrad.shape.shapetracker import ShapeTracker
-from tinygrad.shape.view import variable_to_uop
+from typing import cast
 from tinygrad.dtype import dtypes, PtrDType
-from tinygrad.ops import KernelInfo, UOp, Ops, graph_rewrite, PatternMatcher, UPat, sint, identity_element
+from tinygrad.ops import KernelInfo, UOp, Ops, graph_rewrite, PatternMatcher, UPat, sint, identity_element, sint_to_uop
 from tinygrad.renderer import Renderer
-from tinygrad.helpers import all_int, prod, partition, flatten
+from tinygrad.helpers import all_int, prod, partition, flatten, unwrap
 
 # returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
-def get_contraction(old_shape:Tuple[sint, ...], new_shape:Tuple[sint, ...]) -> Optional[List[List[int]]]:
+def get_contraction(old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) -> list[list[int]]|None:
   acc_old, acc_new = list(itertools.accumulate(old_shape, operator.mul)), list(itertools.accumulate(new_shape, operator.mul))
   try: split = [acc_old.index(acc)+1 if acc != 1 else 0 for acc in acc_new]
   except ValueError: return None
@@ -19,7 +16,7 @@ def get_contraction(old_shape:Tuple[sint, ...], new_shape:Tuple[sint, ...]) -> O
 
 # ***** indexing *****
 
-def _limit_dims(dims:Tuple[sint, ...], max_sizes:Tuple[int, ...]):
+def _limit_dims(dims:tuple[sint, ...], max_sizes:tuple[int, ...]):
   # TODO: symbolic shape
   if not all_int(dims): return dims
   while len(dims) > len(max_sizes) or any(d > m for d,m in zip(dims, max_sizes)):
@@ -30,25 +27,24 @@ def _limit_dims(dims:Tuple[sint, ...], max_sizes:Tuple[int, ...]):
     else: raise RuntimeError(f"cannot limit dim {dims=}, {max_sizes=}")
   return dims
 
-def get_grouped_dims(prefix, dims:Tuple[sint, ...], max_sizes:Optional[Tuple[int, ...]], reverse=False) -> List[UOp]:
+def get_grouped_dims(prefix, dims:tuple[sint, ...], max_sizes:tuple[int, ...]|None, reverse=False) -> list[UOp]:
   if reverse: dims = dims[::-1]
   limited = _limit_dims(dims, max_sizes) if max_sizes is not None else dims
   ret = raw_idxs = [UOp(Ops.SPECIAL, dtypes.int, (), (f"{prefix}{i}", s)) for i,s in enumerate(limited)]
   if limited != dims:
     ret = []
-    # cast for mypy, get_contraction won't be None
-    for idx, contraction in zip(raw_idxs, cast(List[List[int]], get_contraction(dims, limited))):
-      if len(contraction) == 1: ret.append(idx)
-      else:
-        for c in contraction:
-          ret.append(idx % dims[c])
-          idx //= dims[c]
+    if (contraction:=get_contraction(dims, limited)) is None: raise AssertionError(f"get_contraction should not be None {dims=} {limited=}")
+    for idx, contraction_group in zip(raw_idxs, contraction):
+      for c in contraction_group[:-1]:
+        ret.append(idx % dims[c])
+        idx //= dims[c]
+      ret.append(idx)
   return ret[::-1] if reverse else ret
 
 @dataclass
 class IndexContext:
-  idxs: List[UOp]
-  ridxs: List[UOp]
+  idxs: list[UOp]
+  ridxs: list[UOp]
   acc_num: int = 0
 
 def get_index(ast:UOp, opts:Renderer) -> IndexContext:
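
Note: in the rewritten get_grouped_dims above, a merged index is split back into the collapsed dims by innermost-first mod/floordiv peeling, with the final quotient kept as the last component. A small self-contained check of that arithmetic using plain ints in place of UOps (split_index is an illustrative helper, not a tinygrad function):

def split_index(flat, sizes):
  # peel off each leading dim with mod/floordiv, keep the final quotient (mirrors the loop above)
  comps = []
  for s in sizes[:-1]:
    comps.append(flat % s)
    flat //= s
  comps.append(flat)
  return comps

# dims (4, 5, 6) collapsed into a single index of size 120: re-flattening the
# recovered components must reproduce the original index
for flat in range(4*5*6):
  a, b, c = split_index(flat, (4, 5, 6))
  assert flat == a + 4*(b + 5*c)
print("ok")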
@@ -56,14 +52,11 @@ def get_index(ast:UOp, opts:Renderer) -> IndexContext:
   # NOTE: assumes the shape is <global dims> <local dims> <group_for_reduces> <reduces> <upcasts/unrolls>
   full_shape = ast.full_shape
   first_upcasted = len(full_shape)-ki.upcasted
-  first_output_st: ShapeTracker = ast.src[0].st_arg
   # if there's no reduce, this is first_upcasted. assumes reduces are at the end
-  first_reduce = min([first_upcasted]+flatten(x.axis_arg for x in ast.sparents if x.op is Ops.REDUCE_AXIS))
-  local_loads = [x for x in ast.parents if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL]
+  first_reduce = min([first_upcasted]+flatten(x.axis_arg for x in ast.toposort if x.op is Ops.REDUCE_AXIS))
+  local_loads = [x for x in ast.toposort if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL]
   # NOTE: sum up the reduced axes looking across all local loads, yields the number of grouped reduces
-  group_for_reduces = sum([any(j!=y for j in x) for x,y in zip(
-    [[l.st_arg.shape[i] for l in local_loads] for i in range(first_reduce,first_upcasted)],
-    first_output_st.shape[first_reduce:first_upcasted])]) if local_loads else 0
+  group_for_reduces = sum([any(l.st_arg.shape[i]!=ast.src[0].st_arg.shape[i] for l in local_loads) for i in range(first_reduce,first_upcasted)])
   global_dims = first_reduce-ki.local_dims
 
   if opts.has_local:
@@ -76,22 +69,21 @@ def get_index(ast:UOp, opts:Renderer) -> IndexContext:
              get_grouped_dims("lidx", full_shape[global_dims:first_reduce+group_for_reduces], opts.local_max)
   else:
     # all loops are RANGES
-    idxs = [UOp(Ops.RANGE, dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(g)), (i, False))
-      for i,g in enumerate(full_shape[:first_reduce])]
+    idxs = [UOp(Ops.RANGE, dtypes.int, (sint_to_uop(0), sint_to_uop(g)), i) for i,g in enumerate(full_shape[:first_reduce])]
 
   # reduce loops
-  idxs += [UOp(Ops.RANGE, dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(g)), (i, True))
+  idxs += [UOp(Ops.RANGE, dtypes.int, (sint_to_uop(0), sint_to_uop(g)), i)
     for i,g in enumerate(full_shape[first_reduce+group_for_reduces:first_upcasted], start=first_reduce+group_for_reduces)]
 
   # upcast loops
   for i,g in enumerate(full_shape[first_upcasted:], start=first_upcasted):
     assert isinstance(g, int), "needs to be int to upcast/unroll"
-    idxs.append(UOp(Ops.EXPAND, dtypes.int, (UOp.const(dtypes.int.vec(g), tuple(range(g))),), ((i,g),)))
+    idxs.append(UOp(Ops.UNROLL, dtypes.int, (UOp.const(dtypes.int.vec(g), tuple(range(g))),), ((i,g),)))
 
   # late indexes (group for reduce)
   ridxs = idxs[:]
   for a in range(first_reduce, first_reduce+group_for_reduces):
-    ridxs[a] = UOp(Ops.RANGE, dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(full_shape[a])), (1000+a, True))
+    ridxs[a] = UOp(Ops.RANGE, dtypes.int, (sint_to_uop(0), sint_to_uop(full_shape[a])), 1000+a)
 
   return IndexContext(idxs, ridxs)
 
@@ -100,7 +92,7 @@ def get_index(ast:UOp, opts:Renderer) -> IndexContext:
 def lower_reduce_axis(ctx: IndexContext, x: UOp):
   # NOTE: always using ridxs is fine here
   reduce_range, reduce_expand = partition([ctx.ridxs[i] for i in x.axis_arg], lambda y: y.op is Ops.RANGE)
-  assert all(x.op is Ops.EXPAND for x in reduce_expand), f"not all EXPANDS in {reduce_expand} for {x.axis_arg}"
+  assert all(x.op is Ops.UNROLL for x in reduce_expand), f"not all UNROLLS in {reduce_expand} for {x.axis_arg}"
   alu_op: Ops = x.arg[0]
   ret = x.src[0]
   if len(contract_axis:=flatten(x.arg for x in reduce_expand)):
@@ -114,12 +106,10 @@ def lower_reduce_axis(ctx: IndexContext, x: UOp):
 
 def lower_load_store(ctx: IndexContext, x: UOp):
   idx, valid = x.st_arg.to_indexed_uops(ctx.ridxs if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL else ctx.idxs)
-  # TODO: check has_valid in UPat, not here
-  has_valid = valid.op is not Ops.CONST or valid.arg is not True
   buf = x.src[0]
   if x.op is Ops.LOAD:
     barrier = (UOp(Ops.BARRIER, dtypes.void, (x.src[2],)),) if x.src[0].op is Ops.DEFINE_LOCAL else ()
-    return UOp(Ops.LOAD, x.dtype, (buf.index(idx, valid if has_valid else None),) + barrier)
+    return UOp(Ops.LOAD, x.dtype, (buf.index(idx, valid),) + barrier)
   # NOTE: only store the local reduceop in the threads that are actually doing the reduce
   if cast(PtrDType, x.src[0].dtype).local and x.src[2].op is Ops.ASSIGN:
     reduce_input = x.src[2].src[1].src[1] if x.src[2].src[1].src[1] is not x.src[2].src[0] else x.src[2].src[1].src[0]
@@ -130,14 +120,19 @@ def lower_load_store(ctx: IndexContext, x: UOp):
   if (not cast(PtrDType, x.src[0].dtype).local) or store_back:
     for oidx, ridx in zip(ctx.idxs, ctx.ridxs):
       if oidx is not ridx: valid = valid * oidx.eq(0)
-    has_valid = valid.op is not Ops.CONST or valid.arg is not True
-  return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid if has_valid else None), x.src[2]))
+  return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[2]))
+
+def lower_const(x:UOp):
+  assert all(v.mask is None for v in unwrap(x.st).views), f"VIEW in CONST/DEFINE_VAR source must be unmasked, got {x.st}"
+  return x.replace(src=())
 
 pm_lowerer = PatternMatcher([
   (UPat(Ops.REDUCE_AXIS, name="x"), lower_reduce_axis),
+  (UPat((Ops.CONST, Ops.DEFINE_VAR), src=(UPat(Ops.VIEW),), name="x"), lower_const),
   (UPat(Ops.VALID, src=(UPat(Ops.VIEW),), name="x"), lambda ctx,x: x.st_arg.to_indexed_uops(ctx.idxs)[1]),
   # rewrite LOAD/STORE VIEW to LOAD/STORE with indexed
   (UPat((Ops.LOAD, Ops.STORE), src=(UPat(), UPat(Ops.VIEW)), allow_any_len=True, name="x"), lower_load_store),
+  (UPat(Ops.INDEX, src=(UPat.var("b"), UPat.var("idx"), UPat.const(dtypes.bool, True))), lambda b, idx: b.index(idx)),
 ])
 
 def rewrite_shapetracker_with_index(ast:UOp, opts:Renderer) -> UOp: return graph_rewrite(ast, pm_lowerer, ctx=get_index(ast, opts))
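
Note: the last hunk drops the has_valid special case; lower_load_store now always passes valid to buf.index, and the new Ops.INDEX pattern strips the gate in one place when it is the constant True. A tiny sketch of that "always gate, simplify later" idea with plain tuples standing in for UOps (nothing here is tinygrad API):

def strip_true_gate(node):
  # node: ("INDEX", buf, idx, gate); drop the gate only when it is literally const True
  if node[0] == "INDEX" and len(node) == 4 and node[3] == ("CONST", True):
    return ("INDEX", node[1], node[2])
  return node

assert strip_true_gate(("INDEX", "gbuf", "i", ("CONST", True))) == ("INDEX", "gbuf", "i")
assert strip_true_gate(("INDEX", "gbuf", "i", ("CONST", False)))[-1] == ("CONST", False)
print("ok")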