tinygrad 0.10.0-py3-none-any.whl → 0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/codegen/devectorizer.py (new file)
@@ -0,0 +1,247 @@
+ from typing import Optional, Any, Callable
+ import functools, operator
+ from collections import defaultdict
+ from tinygrad.dtype import dtypes, ImageDType, PtrDType
+ from tinygrad.ops import UOp, Ops, UPat, PatternMatcher, resolve
+ from tinygrad.ops import graph_rewrite, GroupOp
+ from tinygrad.codegen.symbolic import symbolic_simple, split_uop, uop_given_valid, parse_valid, simplify_valid, sym, mulacc_unrolled
+ from tinygrad.helpers import getenv, flatten, dedup, TRANSCENDENTAL, AMX, prod, DEVECTORIZE
+ from tinygrad.codegen.transcendental import xexp2, xlog2, xsin, xpow, TRANSCENDENTAL_SUPPORTED_DTYPES
+ from tinygrad.renderer import Renderer
+
+ # ***** float4/image store handling *****
+
+ def fold_expanded(ex, buf):
+   new_srcs = dedup(list(ex.src))
+   old_new_srcs = new_srcs[:]
+   is_load, is_image = new_srcs[0].op is Ops.LOAD, isinstance(buf.dtype, ImageDType)
+
+   # TODO: get the device from the buffer somehow
+   # NOTE: this can't be Device.DEFAULT because it opens devices
+   if buf.dtype.base != dtypes.float and buf.dtype.base != dtypes.half and not isinstance(buf.dtype, ImageDType): return None
+   lengths = [4] if is_image else ([8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2]))
+
+   # first, extract all the relevant offsets
+   offsets_rootsrc: defaultdict[Any, dict] = defaultdict(dict)
+   for i,s in enumerate(new_srcs):
+     idx = s.src[0].src[1]
+     if s.dtype.count != 1 or (is_image and idx.dtype.count == 2): continue
+     if idx.op is Ops.ADD and idx.src[1].op is Ops.CONST: root_src, arg = idx.src[0], idx.src[1].arg
+     elif idx.op is Ops.CONST: root_src, arg = "CONST", idx.arg
+     else: root_src, arg = idx, 0
+     # add gates for gated
+     if len(s.src[0].src) == 3: root_src = (s.src[0].src[2], root_src)
+     assert arg not in offsets_rootsrc[root_src], f"{offsets_rootsrc[root_src][arg]} != {i} with {len(s.src)} sources"
+     offsets_rootsrc[root_src][arg] = i
+
+   # then rewrite everything we can
+   used: set[tuple[UOp, UOp]] = set()
+   for rootsrc, offsets in offsets_rootsrc.items():
+     for o in offsets:
+       for fold_length in lengths:
+         if all((rootsrc,o+i) not in used and o+i in offsets for i in range(fold_length)):
+           load_1 = new_srcs[offsets[o]]
+           new_src = list(load_1.src)
+           oidx = new_src[0].src[1]
+           if oidx.divides(fold_length) is None: continue
+           if is_image:
+             # for images, we rewrite the index. it must evenly divide 4 from the above check
+             new_src[0] = buf.index(
+               UOp(Ops.VECTORIZE, dtypes.int.vec(2), ((oidx // 4) % buf.dtype.shape[1], (oidx // (4*buf.dtype.shape[1])))),
+               rootsrc[0] if isinstance(rootsrc, tuple) else None)
+           else:
+             # for non image, we upcast the index pointer
+             new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size, local=new_src[0].dtype.local))
+           # generate the folded new_srcs
+           if is_load:
+             new_load = UOp(Ops.LOAD, load_1.dtype.vec(fold_length), tuple(new_src))
+             for i in range(fold_length): new_srcs[offsets[o+i]] = new_load.gep(i)
+           else: # vectorize the store
+             new_src[1] = UOp(Ops.VECTORIZE, new_src[1].dtype.vec(fold_length), tuple(new_srcs[offsets[o+i]].src[1] for i in range(fold_length)))
+             for i in range(fold_length): new_srcs[offsets[o+i]] = UOp(Ops.STORE, dtypes.void, tuple(new_src)) if i == 0 else None
+           used.update((rootsrc,o+i) for i in range(fold_length))
+
+   # dedup expand for LOAD
+   if is_load and len(old_new_srcs) != len(ex.src): new_srcs = [new_srcs[old_new_srcs.index(s)] for s in ex.src]
+   # remove Nones for STORE
+   return UOp(ex.op, ex.dtype, tuple(x for x in new_srcs if x is not None), ex.arg) if len(used) else None
+
+ def fix_unfoldable_image_load(load:UOp, buf:UOp):
+   if not isinstance(buf.dtype, ImageDType) or (oidx:=load.src[0].src[1]).dtype.count == 2: return None
+   id4 = oidx % 4
+   new_src = list(load.src)
+   # TODO: copied logic from above
+   new_src[0] = load.src[0].src[0].index(
+     UOp(Ops.VECTORIZE, dtypes.int.vec(2), ((oidx // 4) % buf.dtype.shape[1], (oidx // (4*buf.dtype.shape[1])))),
+     load.src[0].src[2] if len(load.src[0].src) == 3 else None)
+   vec_load = UOp(Ops.LOAD, load.dtype.vec(4), tuple(new_src))
+   return functools.reduce(lambda ret, i: id4.ne(i).where(ret, vec_load.gep(i)), range(4), load.const_like(float('nan')))
+
+ buf_idx_pat = UPat(Ops.INDEX, src=(UPat.var("buf"),), allow_any_len=True)
+ float4_folding = PatternMatcher([
+   (UPat(Ops.VECTORIZE, src=UPat(Ops.LOAD, src=(buf_idx_pat,), allow_any_len=True), name="ex"), fold_expanded),
+   (UPat((Ops.BARRIER, Ops.SINK), src=UPat(Ops.STORE, src=(buf_idx_pat,), allow_any_len=True), name="ex"), fold_expanded),
+ ])
+
+ # ***** image load valid simplification *****
+
+ def simplify_valid_load(buf:UOp, start_idx:UOp, valid:UOp) -> UOp|None:
+   if (idx:=uop_given_valid(valid, start_idx)) is None: return buf.const_like(0)
+   if not isinstance(buf.dtype, ImageDType): return None if idx is start_idx else buf.index(idx, valid)
+
+   # wait for it to be image indexed before running simplification
+   if start_idx.dtype.count != 2: return None
+
+   # can drop valid if idx is out of bound when valid is False
+   drop_stmt = []
+   for stmt in split_uop(valid, Ops.AND):
+     X, is_upper_bound, c = parse_valid(stmt)
+
+     # for X0 + X1 + ... >= 1, check if it's out of bound when Xi = 0 for all i
+     if not is_upper_bound and c == 1 and all(u.op in GroupOp.Irreducible and u.vmin == 0 for u in split_uop(X, Ops.ADD)):
+       testidx = functools.reduce(lambda nowidx,u: nowidx.substitute({u:u.const_like(0)}), split_uop(X, Ops.ADD), idx)
+       testidx = testidx.simplify()
+       if testidx.gep(0).vmax < 0 or testidx.gep(1).vmax < 0:
+         drop_stmt.append(stmt)
+         continue
+
+     # if X <= c, check if it's out of bound when X = c+1
+     # if X >= c, check if it's out of bound when X = c-1
+     test_value = c + 1 if is_upper_bound else c - 1
+     for i,b in zip(idx.src, (buf.dtype.shape[1], buf.dtype.shape[0])):
+       if i.is_increasing():
+         rw = i.substitute({X:X.const_like(test_value)}).simplify()
+         if rw.vmin >= b or rw.vmax < 0:
+           drop_stmt.append(stmt)
+           break
+
+   if not drop_stmt and idx is start_idx: return None
+   new_valid = functools.reduce(operator.and_, ss) if (ss:=[s for s in split_uop(valid, Ops.AND) if s not in drop_stmt]) else None
+   return buf.index(idx, new_valid)
+
+ # ***** optional patterns *****
+
+ powers_of_two = {2**i:i for i in range(64)}
+ @functools.lru_cache(None)
+ def get_late_rewrite_patterns(ops, force_transcendental=False):
+   pat: list[tuple[UPat, Callable]] = [(UPat(op, dtype=TRANSCENDENTAL_SUPPORTED_DTYPES, src=(UPat.var("d"),)), f) for op,f in \
+     ((Ops.EXP2, xexp2), (Ops.LOG2, xlog2), (Ops.SIN, xsin)) if op not in ops or force_transcendental]
+   # rewrite SQRT to xpow 0.5
+   if Ops.SQRT not in ops: pat.append((UPat(Ops.SQRT, src=UPat.var("d")), lambda d: xpow(d, d.const_like(0.5))))
+   # rewrite MOD to AND (which should always be supported, but not for generic in tests): x % (2**y) -> x & (2**y-1)
+   if Ops.AND in ops: pat += [(UPat.var("x", dtypes.ints)%UPat.cvar("c"), lambda x,c: x & (c.arg-1) if c.arg in powers_of_two else None)]
+   # rewrite MUL/IDIV to SHL+SHR: x*(2**y) -> shl(x,y) and x//(2**y) -> shr(x,y)
+   if Ops.SHL in ops: pat += [(UPat.var("x", dtypes.ints)*UPat.cvar("c"), lambda c,x: x << v if (v:=powers_of_two.get(c.arg, 0)) else None)]
+   if Ops.SHR in ops:
+     pat += [(UPat.var("x", dtypes.ints)//UPat.cvar("c"), lambda x,c: x >> v if (v:=powers_of_two.get(c.arg, 0)) and resolve(x>=0,False) else None)]
+   if Ops.NEG in ops:
+     pat += [(UPat.var('x')*-1, lambda x: x.alu(Ops.NEG))]
+     if Ops.SUB in ops: pat += [(UPat.var('x')+UPat.var('y').alu(Ops.NEG), lambda x,y: x.alu(Ops.SUB, y))]
+   if Ops.MULACC in ops: pat += [(UPat.var('a')*UPat.var('b')+UPat.var('c'), lambda a,b,c: a.alu(Ops.MULACC, b, c))]
+   return PatternMatcher(pat)
+
+
+ # *** uop expander ***
+
+ # TODO: there's a lot shared with gep_through_wmma here
+ def no_vectorized_wmma(wmma:UOp):
+   out_sz = prod(x[1] for x in wmma.arg[6][-1])
+   if wmma.dtype.count == out_sz: return None
+   tsrcs = []
+   for s,sz in zip(wmma.src, wmma.arg[6]):
+     ssz = prod(x[1] for x in sz)
+     tsrcs.append([s.gep(tuple(range(grp, grp+ssz))) for grp in range(0, s.dtype.count, ssz)])
+   wmmas = [UOp(Ops.WMMA, wmma.dtype.scalar().vec(out_sz), tsrc, wmma.arg) for tsrc in zip(*tsrcs)]
+   wmma_ex = flatten([[e.gep(i) for i in range(out_sz)] for e in wmmas])
+   return UOp(Ops.VECTORIZE, wmma.dtype, tuple(wmma_ex))
+
+ def no_vectorized_alu(alu):
+   if alu.dtype.vcount == 1: return None
+   alus = tuple(UOp(alu.op, alu.dtype.scalar(), tuple(s.gep(i) for s in alu.src), alu.arg) for i in range(alu.dtype.vcount))
+   return UOp(Ops.VECTORIZE, alu.dtype, alus)
+
+ def no_vectorized_load_store(ls:UOp):
+   idx = ls.src[0]
+   assert isinstance(idx.dtype, PtrDType)
+   if idx.dtype.v == 1: return None
+   tv = [UOp(ls.op, ls.dtype.scalar(), tuple(j.gep(i) for j in ls.src)) for i in range(idx.dtype.v)]
+   return UOp(Ops.VECTORIZE, ls.dtype, tuple(tv))
+
+ def no_vectorized_acc(acc:UOp):
+   if acc.dtype.count == 1: return None
+   alus = tuple(UOp(acc.op, acc.dtype.scalar(),
+     tuple(s.gep(i) if j == 0 else s for j,s in enumerate(acc.src)), acc.arg+(i,)) for i in range(acc.dtype.count))
+   return UOp(Ops.VECTORIZE, acc.dtype, alus)
+
+ devectorize = PatternMatcher([
+   # no ALU on vectorized dtypes
+   (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.ASSIGN, Ops.INDEX), name="alu"), no_vectorized_alu),
+   (UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
+   (UPat(Ops.DEFINE_ACC, name="acc"), no_vectorized_acc),
+   (UPat((Ops.LOAD, Ops.STORE), name="ls"), no_vectorized_load_store),
+ ])
+
+ devectorize_load_store = PatternMatcher([
+   # TODO: add vectorized support to transcendental
+   (UPat((Ops.INDEX, Ops.EXP2, Ops.LOG2, Ops.SIN), name="alu"), no_vectorized_alu),
+   (UPat((Ops.LOAD, Ops.STORE), name="ls"), no_vectorized_load_store),
+ ])
+
+ def delete_redundant_gates(buf:UOp, idx:UOp, val:UOp, store_gate:UOp, cast:UOp|None=None) -> UOp|None:
+   if store_gate not in [gate.src[0] for gate in val.toposort if gate.op is Ops.IF]: return None
+   # remove the gate from the index
+   return UOp.store(buf.index(idx).cast(cast.dtype) if cast is not None else buf.index(idx), val)
+
+ load_store_indexing = PatternMatcher([
+   # late fixup of unfoldable image loads
+   (UPat(Ops.LOAD, src=(UPat.var("buf"), UPat()), allow_any_len=True, name="load"), fix_unfoldable_image_load),
+   # simplify valid
+   (UPat(Ops.AND, name="valid"), simplify_valid),
+   # image load valid idx simplification
+   (UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("start_idx"), UPat.var("valid"))), simplify_valid_load),
+   # delete_redundant_gates (after expand)
+   (UPat(Ops.STORE, src=(UPat.any(stidx:=UPat.var("buf").index(UPat.var("idx"), UPat.var("store_gate")), stidx.cast().named("cast")),
+     UPat.var("val"))), delete_redundant_gates),
+ ])
+
+ def move_mask(x:UOp, buf:UOp, idx:UOp, mask:UOp, cast:UOp|None=None) -> UOp:
+   # this moves the mask from the indexing to the load/store op for rendering
+   nidx = buf.index(idx).cast(cast.dtype) if cast is not None else buf.index(idx)
+   return UOp.load(nidx, x.const_like(0), mask, *x.src[1:], dtype=x.dtype) if x.op is Ops.LOAD else UOp.store(nidx, x.src[1], mask, *x.src[2:])
+
+ pm_render = PatternMatcher([
+   # for rendering, we use explicit VECTORIZE
+   (UPat(Ops.CONST, name='c'),
+     lambda c: UOp(Ops.VECTORIZE, c.dtype, (UOp.const(c.dtype.scalar(), c.arg),)*c.dtype.vcount) if c.dtype.vcount > 1 else None),
+   (UPat(Ops.VCONST, name='c'), lambda c: UOp(Ops.VECTORIZE, c.dtype, tuple(UOp.const(c.dtype.scalar(), x) for x in c.arg))),
+   (UPat(Ops.GEP, name='gep'), lambda gep: UOp(Ops.VECTORIZE, gep.dtype, tuple(gep.src[0].gep(x) for x in gep.arg)) if len(gep.arg) > 1 else None),
+   (UPat(Ops.VECTORIZE, src=(UPat(name='x'),)), lambda x: x),
+   # move masks of loads/stores
+   (UPat((Ops.LOAD, Ops.STORE), src=(UPat.any(masked_index:=UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("idx"), UPat.var("mask"))),
+     masked_index.cast(None).named("cast")),), allow_any_len=True, name="x"), move_mask),
+   # gate any stores that aren't gated with ifs
+   (UPat(Ops.STORE, dtype=dtypes.void, src=(UPat(), UPat(), UPat(dtype=dtypes.bool)), name="store"),
+     lambda store: UOp(Ops.STORE, src=store.src[:2]+(UOp(Ops.IF, src=(store.src[2],)),))),
+ ])
+
+ # *** uop graph ***
+
+ def full_graph_rewrite(sink:UOp, opts:Optional[Renderer]=None) -> UOp:
+   assert sink.op is Ops.SINK, f"sink isn't sink, it's {sink.op}"
+   supported_ops = tuple(opts.code_for_op.keys()) if opts is not None else ()
+   extra_matcher = opts.extra_matcher if opts is not None and opts.extra_matcher is not None else PatternMatcher([])
+
+   if DEVECTORIZE:
+     # devectorize + load_store_indexing + mulacc_unrolled, mulacc_unrolled must be last because it can break loop_collapse
+     sink = graph_rewrite(sink, sym+(devectorize+float4_folding if opts is not None and opts.supports_float4 else devectorize)+load_store_indexing+
+       mulacc_unrolled)
+   else:
+     # new devectorize only for load/store
+     sink = graph_rewrite(sink, sym+devectorize_load_store+mulacc_unrolled)
+
+   # optional pre matcher
+   if opts is not None and opts.pre_matcher is not None: sink = graph_rewrite(sink, opts.pre_matcher)
+
+   # final rules for the renderer (without sym)
+   sink = graph_rewrite(sink, symbolic_simple+get_late_rewrite_patterns(supported_ops, TRANSCENDENTAL>=2)+pm_render+extra_matcher)
+   return sink
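
Note on the "optional patterns" above: get_late_rewrite_patterns rewrites MOD/MUL/IDIV on power-of-two constants into AND/SHL/SHR (the IDIV case only fires when the dividend is provably non-negative, via resolve(x>=0,False)). A minimal standalone check of the integer identities those rewrites rely on, in plain Python with no tinygrad imports:

  # sanity check of the power-of-two identities behind get_late_rewrite_patterns;
  # x stays non-negative because the IDIV->SHR rewrite only applies when x >= 0
  for x in range(256):
    for y in range(8):
      assert x % (2**y) == x & (2**y - 1)   # MOD  -> AND
      assert x * (2**y) == x << y           # MUL  -> SHL
      assert x // (2**y) == x >> y          # IDIV -> SHR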
tinygrad/codegen/expander.py (new file)
@@ -0,0 +1,121 @@
+ # this converts a lowerer program into a vectorized program
+
+ import functools, itertools, operator
+ from tinygrad.helpers import AMX, dedup, flatten, all_same, prod
+ from tinygrad.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, graph_rewrite
+ from tinygrad.codegen.symbolic import sym
+
+ def _expand_arg_to_idx(args:tuple[tuple[int, int], ...], rpk:dict[int, int]) -> int:
+   idx, mul = 0, 1
+   for axis,m in args[::-1]:
+     idx += rpk[axis] * mul
+     mul *= m
+   return idx
+
+ def _choices_from_args(args:tuple[tuple[int, int], ...]) -> list[dict[int, int]]:
+   return [dict(x) for x in itertools.product(*[zip(itertools.repeat(axis), range(m)) for axis,m in args])]
+
+ @functools.lru_cache(None)
+ def _swizzle_args(cargs:tuple[tuple[int, int], ...], eargs:tuple[tuple[int, int], ...], exclude_args:tuple[int, ...]) -> list[int]:
+   return [_expand_arg_to_idx(eargs, {**rpk, **{x:0 for x in exclude_args}} if exclude_args else rpk) for rpk in _choices_from_args(cargs)]
+
+ def do_expand(root:UOp):
+   expands = [x for x in root.src if x.op is Ops.UNROLL]
+   if len(expands) == 0: return None
+   # NOTE: we 0 out the reduce axis for WMMA. in theory they should all be the same, but is this always correct?
+   exclude_args = tuple(dedup(root.arg[-1] + tuple(y[0] for y in flatten(root.arg[-2])))) if root.op is Ops.WMMA else ()
+   if all_same(expands_args:=[x.arg for x in expands]) and len(exclude_args) == 0:
+     # if there's only one expand arg, it's okay to use it (optimization)
+     expand_args = expands[0].arg
+   else:
+     # otherwise, we sort them and GEP
+     expand_args = tuple(x for x in sorted(dedup(flatten(expands_args))) if x[0] not in exclude_args)
+   expand_sz = prod([x[1] for x in expand_args])
+   new_srcs = []
+   for i,src in enumerate(root.src):
+     if src.op is Ops.UNROLL:
+       if root.op is Ops.IF and i == 0:
+         # IF means OR on first arg to IF
+         new_srcs.append(functools.reduce(operator.__or__, [src.src[0].gep(i) for i in range(expand_sz)]))
+       elif expand_args == src.arg:
+         # just remove the expand
+         new_srcs.append(src.src[0])
+       else:
+         lst = _swizzle_args(expand_args, src.arg, exclude_args)
+         # if the base dtype is > 1, put those at the end
+         if src.dtype.count > 1: lst = flatten([[i*src.dtype.count+j for j in range(src.dtype.count)] for i in lst])
+         new_srcs.append(src.src[0].gep(tuple(lst)))
+     else:
+       # non-UNROLL input
+       if root.op is Ops.IF:
+         # for the first arg of IF, just pass them through ignoring UNROLLS
+         new_srcs.append(src)
+       elif src.dtype.count > 1:
+         # put any input dtype > 1 grouped together
+         new_srcs.append(UOp(Ops.CAT, src.dtype.scalar().vec(expand_sz*src.dtype.count), (src,)*expand_sz))
+       else:
+         # repeat the arg
+         new_srcs.append(src.broadcast(expand_sz))
+
+   new_arg = root.arg
+   if root.op is Ops.GEP:
+     assert root.dtype.count == 1
+     # is this right?
+     new_arg = tuple(range(root.arg[0], new_srcs[0].dtype.count, new_srcs[0].dtype.count // expand_sz))
+   nsrc = UOp(root.op, root.dtype.scalar().vec(root.dtype.count*expand_sz), tuple(new_srcs), new_arg)
+   return UOp(Ops.UNROLL, root.dtype, (nsrc,), expand_args)
+
+ def do_contract(con:UOp):
+   ex = con.src[0]
+   # CONTRACT without UNROLL repeats the element VECTORIZED
+   if ex.op is not Ops.UNROLL: return UOp(Ops.VECTORIZE, con.dtype, con.src*con.dtype.count)
+   # CONTRACT may remove several axes from UNROLL
+   assert con.dtype.count == prod([x[1] for x in con.arg]), "dtype is wrong"
+   idxs = []
+   for rpk in _choices_from_args(new_ex_args:=tuple(x for x in ex.arg if x not in con.arg)):
+     idxs += [_expand_arg_to_idx(ex.arg, {**rpk, **lrpk}) for lrpk in _choices_from_args(con.arg)]
+   return UOp(Ops.UNROLL, con.dtype, (ex.src[0].gep(tuple(idxs)),), new_ex_args)
+
+ expander = PatternMatcher([
+   # double expand
+   (UPat(Ops.UNROLL, name="outer", src=(UPat(Ops.UNROLL, name="inner"),)),
+     lambda outer, inner: UOp(Ops.UNROLL, outer.dtype, (inner.src[0],), inner.arg+outer.arg)),
+   # do expansion
+   (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.GEP, Ops.WMMA, Ops.LOAD, Ops.STORE, Ops.INDEX, Ops.ASSIGN,
+     Ops.VECTORIZE, Ops.IF), name="root", custom_early_reject=set([Ops.UNROLL])), do_expand),
+   (UPat(Ops.CONTRACT, name="con"), do_contract),
+   # vectorize DEFINE_ACC
+   (UPat(Ops.VECTORIZE, src=UPat(Ops.DEFINE_ACC, name="acc"), name="v"),
+     lambda acc,v: acc.replace(dtype=v.dtype, src=(acc.src[0].broadcast(v.dtype.count),)+acc.src[1:])),
+   # BARRIERs aren't actually expanded
+   (UPat(Ops.BARRIER, src=(UPat(Ops.UNROLL, name="ex"),)),
+     lambda ex: UOp(Ops.UNROLL, src=(UOp(Ops.BARRIER, src=ex.src),)*len(ex.src), arg=ex.arg)),
+   # empty UNROLL is NOOP
+   (UPat(Ops.UNROLL, src=(UPat.var('x'),), arg=()), lambda x: x),
+   # UNROLL GEP (needed for WMMA, generalize this) -> vectorized ALU
+   (UPat(Ops.UNROLL, name="ex", src=tuple(UPat.var('x').gep(i)+UPat.var('y').gep(i) for i in range(256 if AMX else 8))),
+     lambda ex,x,y: UOp(Ops.UNROLL, ex.dtype, tuple((x+y).gep(i) for i in range(256 if AMX else 8)), ex.arg)),
+ ])
+
+ def create_gate(root:UOp) -> UOp|None:
+   @functools.lru_cache(None)
+   def _gate_srcs(u:UOp, gate:UOp) -> UOp:
+     if u.op is Ops.BARRIER: return u
+     if u.op is Ops.LOAD and u.src[-1].op is Ops.BARRIER:
+       return UOp(u.op, u.dtype, u.src[:-1]+(UOp(Ops.IF, src=(gate, u.src[-1])),), arg=u.arg)
+     return u if (replace_source:=tuple(_gate_srcs(x, gate) for x in u.src)) == u.src else UOp(u.op, u.dtype, replace_source, u.arg)
+   idx = root.src[0]
+   if idx.op is Ops.CAST: idx = idx.src[0]
+   return None if idx.op is not Ops.INDEX or len(idx.src) == 2 or (ret:=_gate_srcs(root, idx.src[2])) is root else ret
+
+ migrate_indexing = PatternMatcher([
+   # create gate MUST BE BEFORE expander
+   (UPat(Ops.STORE, name="root"), create_gate),
+ ])
+
+ def expand_rewrite(sink:UOp) -> UOp:
+   # initial symbolic + migrate indexing (remove this)
+   sink = graph_rewrite(sink, sym+migrate_indexing)
+
+   # expand
+   return graph_rewrite(sink, sym+expander)
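
The UNROLL bookkeeping in this file is driven by two pure helpers: _choices_from_args enumerates every point of the unrolled (axis, length) grid, and _expand_arg_to_idx flattens one such point back to a linear index with the first axis outermost. A small sketch of their behavior (assumes tinygrad 0.10.2 is installed so the new module is importable):

  from tinygrad.codegen.expander import _choices_from_args, _expand_arg_to_idx

  args = ((0, 2), (1, 3))               # two unrolled axes: axis 0 of length 2, axis 1 of length 3
  choices = _choices_from_args(args)    # 2*3 = 6 assignments of the unrolled axes
  assert len(choices) == 6 and choices[0] == {0: 0, 1: 0}
  assert _expand_arg_to_idx(args, {0: 1, 1: 2}) == 1*3 + 2   # row-major flattening -> 5
  # walking the choices in order recovers the linear indices 0..5
  assert [_expand_arg_to_idx(args, rpk) for rpk in choices] == list(range(6))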