tinygrad-0.10.1-py3-none-any.whl → tinygrad-0.10.2-py3-none-any.whl

This diff shows the contents of two package versions that were publicly released to a supported registry, exactly as they appear in that registry. It is provided for informational purposes only.
Files changed (62)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +35 -37
  4. tinygrad/codegen/linearize.py +19 -10
  5. tinygrad/codegen/lowerer.py +31 -8
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +10 -0
  8. tinygrad/device.py +28 -11
  9. tinygrad/dtype.py +12 -3
  10. tinygrad/engine/jit.py +3 -2
  11. tinygrad/engine/multi.py +0 -1
  12. tinygrad/engine/realize.py +7 -4
  13. tinygrad/engine/schedule.py +227 -255
  14. tinygrad/engine/search.py +20 -27
  15. tinygrad/gradient.py +3 -0
  16. tinygrad/helpers.py +7 -4
  17. tinygrad/nn/state.py +2 -2
  18. tinygrad/ops.py +64 -329
  19. tinygrad/renderer/__init__.py +19 -3
  20. tinygrad/renderer/cstyle.py +39 -18
  21. tinygrad/renderer/llvmir.py +55 -18
  22. tinygrad/renderer/ptx.py +6 -2
  23. tinygrad/renderer/wgsl.py +20 -12
  24. tinygrad/runtime/autogen/libc.py +404 -71
  25. tinygrad/runtime/autogen/{libpciaccess.py → pci.py} +25 -715
  26. tinygrad/runtime/autogen/webgpu.py +6985 -0
  27. tinygrad/runtime/graph/metal.py +28 -29
  28. tinygrad/runtime/ops_amd.py +37 -34
  29. tinygrad/runtime/{ops_clang.py → ops_cpu.py} +4 -2
  30. tinygrad/runtime/ops_disk.py +1 -1
  31. tinygrad/runtime/ops_dsp.py +59 -33
  32. tinygrad/runtime/ops_llvm.py +14 -12
  33. tinygrad/runtime/ops_metal.py +78 -62
  34. tinygrad/runtime/ops_nv.py +9 -6
  35. tinygrad/runtime/ops_python.py +5 -5
  36. tinygrad/runtime/ops_webgpu.py +200 -38
  37. tinygrad/runtime/support/am/amdev.py +23 -11
  38. tinygrad/runtime/support/am/ip.py +10 -10
  39. tinygrad/runtime/support/elf.py +2 -0
  40. tinygrad/runtime/support/hcq.py +7 -5
  41. tinygrad/runtime/support/llvm.py +8 -14
  42. tinygrad/shape/shapetracker.py +3 -2
  43. tinygrad/shape/view.py +2 -3
  44. tinygrad/spec.py +21 -20
  45. tinygrad/tensor.py +150 -90
  46. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  47. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  48. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  49. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  50. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  51. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  52. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  53. tinygrad/viz/index.html +544 -0
  54. tinygrad/viz/perfetto.html +178 -0
  55. tinygrad/viz/serve.py +205 -0
  56. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/METADATA +20 -8
  57. tinygrad-0.10.2.dist-info/RECORD +99 -0
  58. tinygrad/codegen/rewriter.py +0 -516
  59. tinygrad-0.10.1.dist-info/RECORD +0 -86
  60. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  61. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +0 -0
  62. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
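
Both artifacts compared above are standard Python wheels, which are plain zip archives, so a listing like this can be reproduced locally. Below is a minimal sketch, assuming both wheel files have already been downloaded into the working directory; it is not the tool that generated this page, just an illustration of where the per-file +N -M counts come from (rename detection, as in entries 25, 29, and 33, is left out):

import difflib, zipfile

def wheel_diff(old_whl: str, new_whl: str) -> None:
    # wheels are zip files; compare every member present in either archive
    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_names, new_names = set(old.namelist()), set(new.namelist())
        for name in sorted(old_names | new_names):
            a = old.read(name).decode(errors="replace").splitlines() if name in old_names else []
            b = new.read(name).decode(errors="replace").splitlines() if name in new_names else []
            diff = list(difflib.unified_diff(a, b, fromfile=name, tofile=name, lineterm=""))
            adds = sum(1 for l in diff if l.startswith("+") and not l.startswith("+++"))
            dels = sum(1 for l in diff if l.startswith("-") and not l.startswith("---"))
            if adds or dels: print(f"{name} +{adds} -{dels}")

# wheel_diff("tinygrad-0.10.1-py3-none-any.whl", "tinygrad-0.10.2-py3-none-any.whl")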
tinygrad/codegen/rewriter.py
@@ -1,516 +0,0 @@
- from __future__ import annotations
- from typing import Optional, Any, Callable
- import functools, itertools, operator
- from collections import defaultdict
- from tinygrad.dtype import dtypes, ImageDType, PtrDType
- from tinygrad.ops import UOp, Ops, UPat, PatternMatcher, symbolic_flat, symbolic_simple, resolve
- from tinygrad.ops import graph_rewrite, split_uop, uop_given_valid, parse_valid, is_increasing, simplify_valid, GroupOp
- from tinygrad.helpers import DEBUG, getenv, flatten, dedup, TRANSCENDENTAL, AMX, prod, partition, all_same
- from tinygrad.codegen.transcendental import xexp2, xlog2, xsin, TRANSCENDENTAL_SUPPORTED_DTYPES
- from tinygrad.renderer import Renderer
-
- # ***** float4/image store handling *****
-
- def fold_expanded(ex, buf):
-   if buf.dtype.base != dtypes.float and buf.dtype.base != dtypes.half and not isinstance(buf.dtype, ImageDType): return None
-   new_srcs = dedup(list(ex.src))
-   old_new_srcs = new_srcs[:]
-   is_load, is_image = new_srcs[0].op is Ops.LOAD, isinstance(buf.dtype, ImageDType)
-
-   # first, extract all the relevant offsets
-   offsets_rootsrc: defaultdict[Any, dict] = defaultdict(dict)
-   for i,s in enumerate(new_srcs):
-     idx = s.src[0].src[1]
-     if s.dtype.count != 1 or (is_image and idx.dtype.count == 2): continue
-     if idx.op is Ops.ADD and idx.src[1].op is Ops.CONST: root_src, arg = idx.src[0], idx.src[1].arg
-     elif idx.op is Ops.CONST: root_src, arg = "CONST", idx.arg
-     else: root_src, arg = idx, 0
-     # add gates for gated
-     if len(s.src[0].src) == 3: root_src = (s.src[0].src[2], root_src)
-     assert arg not in offsets_rootsrc[root_src], f"{offsets_rootsrc[root_src][arg]} != {i} with {len(s.src)} sources"
-     offsets_rootsrc[root_src][arg] = i
-
-   # then rewrite everything we can
-   lengths = [4] if is_image else ([8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2]))
-   used: set[tuple[UOp, UOp]] = set()
-   for rootsrc, offsets in offsets_rootsrc.items():
-     for o in offsets:
-       for fold_length in lengths:
-         if all((rootsrc,o+i) not in used and o+i in offsets for i in range(fold_length)):
-           load_1 = new_srcs[offsets[o]]
-           new_src = list(load_1.src)
-           oidx = new_src[0].src[1]
-           if oidx.divides(fold_length) is None: continue
-           if is_image:
-             # for images, we rewrite the index. it must evenly divide 4 from the above check
-             new_src[0] = buf.index(
-               UOp(Ops.VECTORIZE, dtypes.int.vec(2), ((oidx // 4) % buf.dtype.shape[1], (oidx // (4*buf.dtype.shape[1])))),
-               rootsrc[0] if isinstance(rootsrc, tuple) else None)
-           else:
-             # for non image, we upcast the index pointer
-             new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size//fold_length,
-                                                                                     local=new_src[0].dtype.local))
-           # generate the folded new_srcs
-           if is_load:
-             new_load = UOp(Ops.LOAD, load_1.dtype.vec(fold_length), tuple(new_src))
-             for i in range(fold_length): new_srcs[offsets[o+i]] = new_load.gep(i)
-           else: # vectorize the store
-             new_src[1] = UOp(Ops.VECTORIZE, new_src[1].dtype.vec(fold_length), tuple(new_srcs[offsets[o+i]].src[1] for i in range(fold_length)))
-             for i in range(fold_length): new_srcs[offsets[o+i]] = UOp(Ops.STORE, dtypes.void, tuple(new_src)) if i == 0 else None
-           used.update((rootsrc,o+i) for i in range(fold_length))
-
-   # dedup expand for LOAD
-   if is_load and len(old_new_srcs) != len(ex.src): new_srcs = [new_srcs[old_new_srcs.index(s)] for s in ex.src]
-   # remove Nones for STORE
-   return UOp(ex.op, ex.dtype, tuple(x for x in new_srcs if x is not None), ex.arg) if len(used) else None
-
- def fix_unfoldable_image_load(load:UOp, buf:UOp):
-   if not isinstance(buf.dtype, ImageDType) or (oidx:=load.src[0].src[1]).dtype.count == 2: return None
-   id4 = oidx % 4
-   new_src = list(load.src)
-   # TODO: copied logic from above
-   new_src[0] = load.src[0].src[0].index(
-     UOp(Ops.VECTORIZE, dtypes.int.vec(2), ((oidx // 4) % buf.dtype.shape[1], (oidx // (4*buf.dtype.shape[1])))),
-     load.src[0].src[2] if len(load.src[0].src) == 3 else None)
-   vec_load = UOp(Ops.LOAD, load.dtype.vec(4), tuple(new_src))
-   return functools.reduce(lambda ret, i: id4.ne(i).where(ret, vec_load.gep(i)), range(4), load.const_like(float('nan')))
-
- buf_idx_pat = UPat(Ops.INDEX, src=(UPat.var("buf"),), allow_any_len=True)
- float4_folding = PatternMatcher([
-   (UPat(Ops.VECTORIZE, src=UPat(Ops.LOAD, src=(buf_idx_pat,), allow_any_len=True), name="ex"), fold_expanded),
-   (UPat((Ops.BARRIER, Ops.SINK), src=UPat(Ops.STORE, src=(buf_idx_pat,), allow_any_len=True), name="ex"), fold_expanded),
- ])
-
- # ***** image load valid simplification *****
-
- def simplify_valid_load(buf:UOp, start_idx:UOp, valid:UOp) -> UOp|None:
-   if (idx:=uop_given_valid(valid, start_idx)) is None: return buf.const_like(0)
-   if not isinstance(buf.dtype, ImageDType): return None if idx is start_idx else buf.index(idx, valid)
-
-   # wait for it to be image indexed before running simplification
-   if start_idx.dtype.count != 2: return None
-
-   # can drop valid if idx is out of bound when valid is False
-   drop_stmt = []
-   for stmt in split_uop(valid, Ops.AND):
-     X, is_upper_bound, c = parse_valid(stmt)
-
-     # for X0 + X1 + ... >= 1, check if it's out of bound when Xi = 0 for all i
-     if not is_upper_bound and c == 1 and all(u.op in GroupOp.Irreducible and u.vmin == 0 for u in split_uop(X, Ops.ADD)):
-       testidx = functools.reduce(lambda nowidx,u: nowidx.substitute({u:u.const_like(0)}), split_uop(X, Ops.ADD), idx)
-       testidx = testidx.simplify()
-       if testidx.gep(0).vmax < 0 or testidx.gep(1).vmax < 0:
-         drop_stmt.append(stmt)
-         continue
-
-     # if X <= c, check if it's out of bound when X = c+1
-     # if X >= c, check if it's out of bound when X = c-1
-     test_value = c + 1 if is_upper_bound else c - 1
-     for i,b in zip(idx.src, (buf.dtype.shape[1], buf.dtype.shape[0])):
-       if is_increasing(i):
-         rw = i.substitute({X:X.const_like(test_value)}).simplify()
-         if rw.vmin >= b or rw.vmax < 0:
-           drop_stmt.append(stmt)
-           break
-
-   if not drop_stmt and idx is start_idx: return None
-   new_valid = functools.reduce(operator.and_, ss) if (ss:=[s for s in split_uop(valid, Ops.AND) if s not in drop_stmt]) else None
-   return buf.index(idx, new_valid)
-
- # ***** optional patterns *****
-
- powers_of_two = {2**i:i for i in range(64)}
- @functools.lru_cache(None)
- def get_late_rewrite_patterns(ops, force_transcendental=False):
-   pat: list[tuple[UPat, Callable]] = [(UPat(op, dtype=TRANSCENDENTAL_SUPPORTED_DTYPES, src=(UPat.var("d"),)), f) for op,f in \
-     ((Ops.EXP2, xexp2), (Ops.LOG2, xlog2), (Ops.SIN, xsin)) if op not in ops or force_transcendental]
-   # rewrite MOD to AND (which should always be supported, but not for generic in tests): x % (2**y) -> x & (2**y-1)
-   if Ops.AND in ops:
-     pat += [(UPat.var("x", dtypes.ints)%UPat.cvar("c"), lambda x,c: x & (c.arg-1) if c.arg in powers_of_two else None)]
-   # rewrite MUL/IDIV to SHL+SHR: x*(2**y) -> shl(x,y) and x//(2**y) -> shr(x,y)
-   if Ops.SHL in ops and Ops.SHR in ops:
-     pat += [
-       (UPat.var("x", dtypes.ints)*UPat.cvar("c"), lambda c,x: x << powers_of_two[c.arg] if c.arg in powers_of_two else None),
-       (UPat.var("x", dtypes.ints)//UPat.cvar("c"), lambda x,c: x >> powers_of_two[c.arg] if c.arg in powers_of_two and resolve(x>=0,False) else None)
-     ]
-   if Ops.NEG in ops:
-     pat += [(UPat.var('x')*-1, lambda x: x.alu(Ops.NEG))]
-     if Ops.SUB in ops: pat += [(UPat.var('x')+UPat.var('y').alu(Ops.NEG), lambda x,y: x.alu(Ops.SUB, y))]
-   if Ops.MULACC in ops:
-     pat += [(UPat.var('a')*UPat.var('b')+UPat.var('c'), lambda a,b,c: a.alu(Ops.MULACC, b, c))]
-   return PatternMatcher(pat)
-
- # ***** threefry *****
-
- def threefry2x32(x: UOp, key: UOp):
-   # split x into two uint32, since x in a uint64
-   x0, x1 = (x & 0xffffffff).cast(dtypes.uint32), ((x // 2**32) & 0xffffffff).cast(dtypes.uint32)
-
-   rotations = [[13, 15, 26, 6], [17, 29, 16, 24]]
-   key0, key1 = (key & 0xffffffff).cast(dtypes.uint32), ((key // 2**32) & 0xffffffff).cast(dtypes.uint32)
-   ks = [key1, key0 ^ key1 ^ 0x1BD11BDA, key0]
-   xr = [x0 + ks[-1], x1 + ks[0]]
-   for i in range(5):
-     for r in rotations[i % 2]: xr[0], xr[1] = (x0 := xr[0] + xr[1]), x0 ^ ((xr[1] * 2**r) + (xr[1] // 2**(32 - r)))
-     xr = [(xr[0] + ks[i % 3]), (xr[1] + ks[(i + 1) % 3] + i + 1)]
-
-   return xr[1].cast(dtypes.uint64) * 2**32 | xr[0].cast(dtypes.uint64)
-
- # ***** main rewriter *****
-
- def loop_collapse(compval, multconst, rng:UOp, acc:UOp, idx2=None,idx3=None,extra=None,vec=None,ne=None,
-                   add=UOp.const(dtypes.int, 0), mul:UOp=UOp.const(dtypes.int, 1)):
-   if getenv("DISABLE_LOOP_COLLAPSE") or rng not in acc.src: return None  # must be the right REDUCE
-   loop_start, loop_end = rng.src
-   if loop_start.arg != 0:
-     # TODO: support and test this with other mul and loop_starts
-     if DEBUG >= 1: print(f"WARNING, NOT FOLDING: mul:{mul.arg} loop_start:{loop_start.arg}")
-     return None
-   if idx2 is not None: add = add + idx2
-   if idx3 is not None: add = add + idx3
-   if vec is not None:
-     # add, mul, loop_start, loop_end
-     def dvec(x:UOp):
-       if x.op is Ops.CONST: return UOp.const(x.dtype.vec(vec.dtype.count), x.arg)
-       return UOp(Ops.VECTORIZE, x.dtype.vec(vec.dtype.count), src=(x,)*vec.dtype.count)
-     add, mul, loop_start, loop_end = dvec(add), dvec(mul), dvec(loop_start), dvec(loop_end)
-   if mul.vmin > 0 and ne is not None:
-     comprange = UOp.minimum(loop_end, UOp.maximum((add-compval)//mul + (loop_end-loop_start), loop_start))
-   elif mul.vmax < 0 and ne is None:
-     comprange = UOp.minimum(loop_end, UOp.maximum((add-compval-mul)//mul + (loop_end-loop_start), loop_start))
-   else:
-     return None
-   new_reduce_op = comprange.cast(multconst.dtype) * multconst
-   # TODO: what does it mean to have the same numbered DEFINE_ACC with different ranges?
-   new_acc = acc.replace(src=acc.src[0:1]+tuple(x for x in acc.src[1:] if x is not rng))
-   ret = new_acc.assign(new_acc+new_reduce_op)
-   if extra is not None: ret = ret + acc.assign(acc+extra)
-   return ret
-
- def index_collapse(idx:UOp,rng:UOp,buf:UOp,ld:UOp,acc:UOp,add=UOp.const(dtypes.int, 0),mul=UOp.const(dtypes.int, 1)):
-   if rng not in acc.src: return None
-   new_load = UOp.load(buf.index(add+mul*idx, (idx >= rng.src[0]) & (idx < rng.src[1])), dtype=ld.dtype)
-   new_acc = acc.replace(src=acc.src[0:1]+tuple(x for x in acc.src[1:] if x is not rng))
-   return new_acc.assign(new_acc+new_load)
-
- # TODO: there's a lot shared with no_vectorized_wmma here
- def gep_through_wmma(gep:UOp, wmma:UOp):
-   out_sz = prod(x[1] for x in wmma.arg[6][-1])
-   wmma_idxs = gep.arg[::out_sz]
-   for i in range(out_sz):
-     if tuple(x-i for x in gep.arg[i::out_sz]) != wmma_idxs: return None
-   tsrcs = []
-   for s,sz in zip(wmma.src, wmma.arg[6]):
-     src_args = []
-     ssz = prod(x[1] for x in sz)
-     for w in wmma_idxs: src_args += list(range((w//out_sz)*ssz, (w//out_sz)*ssz + ssz))
-     tsrcs.append(s.gep(tuple(src_args)))
-   return UOp(Ops.WMMA, gep.dtype, tuple(tsrcs), wmma.arg)
-
- def no_vectorized_wmma(wmma:UOp):
-   out_sz = prod(x[1] for x in wmma.arg[6][-1])
-   if wmma.dtype.count == out_sz: return None
-   tsrcs = []
-   for s,sz in zip(wmma.src, wmma.arg[6]):
-     ssz = prod(x[1] for x in sz)
-     tsrcs.append([s.gep(tuple(range(grp, grp+ssz))) for grp in range(0, s.dtype.count, ssz)])
-   wmmas = [UOp(Ops.WMMA, wmma.dtype.scalar().vec(out_sz), tsrc, wmma.arg) for tsrc in zip(*tsrcs)]
-   wmma_ex = flatten([[e.gep(i) for i in range(out_sz)] for e in wmmas])
-   return UOp(Ops.VECTORIZE, wmma.dtype, tuple(wmma_ex))
-
- def reduce_collapse(acc:UOp, ret:UOp, alu:UOp):
-   reduce_parented, reduce_unparented = partition(acc.src[1:], lambda x: x in ret.toposort)
-   if len(reduce_unparented) == 0: return None
-   new_acc = acc.replace(src=acc.src[0:1]+tuple(reduce_parented))
-   ret = new_acc.assign(new_acc.alu(alu.op, ret))
-   if alu.op is Ops.ADD:
-     for r in reduce_unparented: ret = ret * (r.src[1]-r.src[0]).cast(ret.dtype.scalar()).broadcast(ret.dtype.count)
-   return ret
-
- acc_pat, rng_pat = UPat(Ops.DEFINE_ACC, name="acc"), UPat(Ops.RANGE, name="rng")
- rng_aug = UPat.any(rng_pat, UPat.var("add")+rng_pat, UPat.var("mul")*rng_pat, UPat.var("add")+UPat.var("mul")*rng_pat)
-
- index_load = UPat.var("buf").index(rng_aug).load(name="ld")
-
- arange_augrng = UPat.any(rng_aug, rng_aug+UPat.var("idx2"), rng_aug+UPat.var("idx2")+UPat.var("idx3"), UPat(Ops.VECTORIZE, name="vec", src=rng_aug))
- arange_m = ((arange_augrng<UPat.cvar("compval"))!=UPat(Ops.CONST, name="ne", arg=True)).where(UPat.cvar("multconst"), UPat.const(None, 0))
-
- # this moves the accumulation variable down an unrolled add chain which allows for more efficient accumulation using mulacc
- mulacc_unrolled = PatternMatcher([(UPat.var("x")+UPat.var("y")+acc_pat, lambda x,y,acc: (acc+x)+y if y.op is not Ops.DEFINE_ACC else None)])
-
- # this is symbolic 2.0
- sym = symbolic_flat+PatternMatcher([
-   # self ASSIGN is just self
-   (UPat(Ops.ASSIGN, src=(UPat.var('x'), UPat.var('x'))), lambda x: x),
-   # VECTORIZE/CONST, VECTORIZE/GEP
-   (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"), lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src))),
-   (UPat(Ops.VECTORIZE, src=UPat(Ops.GEP, src=(UPat.var("x"),)), name="vec"), lambda vec,x: x.gep(tuple(y.arg[0] for y in vec.src))),
-   # reorder ALU/VECTORIZE
-   (UPat(GroupOp.ALU, src=(UPat(Ops.VECTORIZE, src=UPat(name='x')), UPat(Ops.VECTORIZE, src=UPat(name='y'))), name='alu'),
-    lambda x,y,alu: UOp(Ops.VECTORIZE, alu.dtype, (UOp(alu.op, alu.dtype.scalar(), (x,y)),)*alu.dtype.count)),
-   # VECTORIZE of a single element is just that element
-   (UPat(Ops.VECTORIZE, src=(UPat(name='x'),)), lambda x: x),
-   # VECTORIZE void is SINK
-   (UPat(Ops.VECTORIZE, dtype=dtypes.void, src=UPat(Ops.BARRIER, name='b')), lambda b: b),
-   (UPat(Ops.VECTORIZE, dtype=dtypes.void, name='x'), lambda x: UOp(Ops.SINK, dtypes.void, x.src)),
-   # GEP/VECTORIZE, GEP/GEP, GEP/CONST, GEP/VCONST
-   (UPat(Ops.GEP, src=(UPat(Ops.GEP, name='g2'),), name='g1'),
-    lambda g1, g2: g2.src[0].gep(tuple(g2.arg[g1.arg[i]] for i in range(g1.dtype.count)))),
-   (UPat(Ops.GEP, src=(UPat(Ops.VECTORIZE, name="vec"),), name="gep"),
-    lambda gep, vec: UOp(Ops.VECTORIZE, gep.dtype, tuple(vec.src[i] for i in gep.arg)) if len(gep.arg) > 1 else vec.src[gep.arg[0]]),
-   (UPat(Ops.GEP, src=(UPat.cvar("c", vec=False),), name="gep"), lambda gep, c: gep.const_like(c.arg)),
-   (UPat(Ops.GEP, src=(UPat(Ops.VCONST, name="c"),), name="gep"), lambda gep, c: gep.const_like(tuple(c.arg[x] for x in gep.arg))),
-   # push all GEPs through ALUs (fix arange stuff)
-   (UPat(Ops.GEP, src=(UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name='alu'),), name='gep'),
-    lambda gep,alu: UOp(alu.op, alu.dtype.scalar().vec(gep.dtype.count), tuple(x.gep(gep.arg) for x in alu.src), alu.arg)),
-   # push some GEPs through WMMAs
-   (UPat(Ops.GEP, src=(UPat(Ops.WMMA, name="wmma"),), name="gep"), gep_through_wmma),
-   # tensor core with a 0 input is acc
-   (UPat(Ops.WMMA, src=(UPat.const(None, 0.0), UPat.var(), UPat.var("acc"))), lambda acc: acc),
-   (UPat(Ops.WMMA, src=(UPat.var(), UPat.const(None, 0.0), UPat.var("acc"))), lambda acc: acc),
-   # tensor core cleanups
-   (UPat.var("add") + UPat(Ops.WMMA, name="wmma"),
-    lambda add, wmma: UOp(wmma.op, wmma.dtype, (wmma.src[0], wmma.src[1], wmma.src[2]+add), wmma.arg)),
-   # threefry + remove longs
-   (UPat(Ops.THREEFRY, dtype=dtypes.uint64, src=(UPat.var("x"), UPat.var("key"))), threefry2x32),
-   (UPat.var('x', dtypes.uint32).cast(dtypes.uint64).cast(dtypes.uint32), lambda x: x), # cast there and back is noop (TODO: genericize)
-   ((UPat.var('x', dtypes.uint64)&0xFFFFFFFF).cast(dtypes.uint32), lambda x: x.cast(dtypes.uint32)), # cast does truncation
-   (((UPat.var(None, dtypes.uint64)*(1<<32)) | UPat.var('y', dtypes.uint32).cast(dtypes.uint64)).cast(dtypes.uint32), lambda y: y),
-   (((UPat.var('x', dtypes.uint64)*(1<<32)) | UPat.var(None, dtypes.uint32).cast(dtypes.uint64))//(1<<32), lambda x: x),
-   # hacks for threefry long removal when padded (TODO: genericize)
-   (UPat.var('x', dtypes.uint32).cast(dtypes.uint64) * UPat.var('y').where(UPat.const(dtypes.uint64, 1<<32), UPat.const(dtypes.uint64, 0)),
-    lambda x,y: y.where(x, UOp.const(dtypes.uint32, 0)).cast(dtypes.uint64) * (1<<32)),
-   ((UPat.var('x', dtypes.uint64)&(UPat.var('y').where(UPat.const(dtypes.uint64, 0xFFFFFFFF), UPat.const(dtypes.uint64, 0)))).cast(dtypes.uint32),
-    lambda x,y: y.where(x.cast(dtypes.uint32), UOp.const(dtypes.uint32, 0))),
-   # arange loop folding
-   (acc_pat.assign(UPat.any(arange_m, arange_m+UPat.var("extra"))+acc_pat), loop_collapse),
-   # indexing, with cast or where
-   (acc_pat.assign(UPat.var("idx").eq(UPat(Ops.RANGE, name="rng")).cast()*index_load+acc_pat), index_collapse),
-   (acc_pat.assign(UPat.var("idx").eq(UPat(Ops.RANGE, name="rng")).where(index_load, UPat.const(None, 0.0))+acc_pat), index_collapse),
-   # parentless reduce # TODO: add MUL
-   (acc_pat.assign(UPat((Ops.ADD, Ops.MAX), src=[acc_pat, UPat.var("ret")], name="alu")), reduce_collapse),
-   # ** self folding **
-   (UPat(Ops.DEFINE_ACC, src=(UPat.var("x"),)), lambda x: x), # a DEFINE_ACC without ranges is a CONST
-   (UPat(Ops.ASSIGN, src=(UPat.cvar(),UPat.var("x"))), lambda x: x), # an ASSIGN to a const is a NOOP
-   # x!=0 -> (bool)x
-   (UPat.var("x")!=0, lambda x: x.cast(dtypes.bool.vec(x.dtype.count))),
-   # ** where **
-   # push cast to branches
-   (UPat.var("s").where(UPat.var("a"), UPat.var("b")).cast().named("cast"), lambda s,a,b,cast: s.where(a.cast(cast.dtype), b.cast(cast.dtype))),
-   # ** load/store folding **
-   (UPat.store(UPat(Ops.INDEX, name="index"), UPat.load(UPat(Ops.INDEX, name="index"))), lambda index: UOp(Ops.NOOP)),
-   (UPat.store(UPat(Ops.INDEX, name="index"), UPat.var("gate").where(UPat.var("alt"), UPat.load(UPat(Ops.INDEX, name="index")))),
-    lambda index, gate, alt: UOp.store(index.src[0].index(index.src[1], gate), alt)),
-   # fold gated LOAD/STORE
-   (UPat().index(UPat(), UPat.const(dtypes.bool, True)).named("idx"), lambda idx: idx.replace(src=idx.src[0:2])), # remove True
-   (UPat().index(UPat(), UPat.const(dtypes.bool, False)).named("idx"), lambda idx: idx.const_like(0)), # False -> NULL pointer
-   (UPat(Ops.LOAD, src=(UPat.const(None, 0),), allow_any_len=True, name="x"), lambda x: x.const_like(0)), # NULL pointer load loads 0
-   (UPat(Ops.STORE, src=(UPat.const(None, 0),), allow_any_len=True), lambda: UOp(Ops.NOOP)), # NULL pointer store does nothing
-   # remove NOOPs from SINK
-   (UPat(Ops.SINK, name="root"),
-    lambda root: UOp(Ops.SINK, root.dtype, a, root.arg) if len(a:=tuple(x for x in root.src if x.op is not Ops.NOOP)) != len(root.src) else None),
-   # remove VECTORIZE from SINK/BARRIER
-   (UPat(Ops.BARRIER, src=(UPat((Ops.VECTORIZE, Ops.SINK), name='sink'),)), lambda sink: UOp(Ops.BARRIER, dtypes.void, sink.src)),
-   (UPat(Ops.SINK, name="root"),
-    lambda root: UOp(Ops.SINK, root.dtype, tuple(flatten(x.src if x.op in {Ops.SINK, Ops.UNROLL} else (x,) for x in root.src)), root.arg)
-     if any(x.op in {Ops.SINK, Ops.UNROLL} for x in root.src) else None),
-   ((UPat.var("x") * UPat.var("x")).reciprocal(), lambda x: x.reciprocal()*x.reciprocal()), # 1/(x^c) -> (1/x)^c
-   ((UPat.var("x") * UPat.var("x") * UPat.var("x")).reciprocal(), lambda x: x.reciprocal()*x.reciprocal()*x.reciprocal()),
-   (UPat.var("x") * ((1+UPat.var("x")).reciprocal().named("d")), lambda x,d: 1-d), # x*/(1+x) -> 1-1/(1+x)
-   (UPat.var("x") * ((1+UPat.var("x")).reciprocal().named("d")*UPat.var("y")), lambda x,y,d: y*(1-d)),
-   (UPat.var("x") * ((1+UPat.var("x")).reciprocal().named("d")+UPat.var("y")), lambda x,y,d: (1-d)+x*y),
- ])
-
- # *** uop expander ***
-
- def _expand_arg_to_idx(args:tuple[tuple[int, int], ...], rpk:dict[int, int]) -> int:
-   idx, mul = 0, 1
-   for axis,m in args[::-1]:
-     idx += rpk[axis] * mul
-     mul *= m
-   return idx
-
- def _choices_from_args(args:tuple[tuple[int, int], ...]) -> list[dict[int, int]]:
-   return [dict(x) for x in itertools.product(*[zip(itertools.repeat(axis), range(m)) for axis,m in args])]
-
- @functools.lru_cache(None)
- def _swizzle_args(cargs:tuple[tuple[int, int], ...], eargs:tuple[tuple[int, int], ...], exclude_args:tuple[int, ...]) -> list[int]:
-   return [_expand_arg_to_idx(eargs, {**rpk, **{x:0 for x in exclude_args}} if exclude_args else rpk) for rpk in _choices_from_args(cargs)]
-
- def do_expand(root:UOp):
-   expands = [x for x in root.src if x.op is Ops.UNROLL]
-   if len(expands) == 0: return None
-   # NOTE: we 0 out the reduce axis for WMMA. in theory they should all be the same, but is this always correct?
-   exclude_args = tuple(dedup(root.arg[-1] + tuple(y[0] for y in flatten(root.arg[-2])))) if root.op is Ops.WMMA else ()
-   if all_same(expands_args:=[x.arg for x in expands]) and len(exclude_args) == 0:
-     # if there's only one expand arg, it's okay to use it (optimization)
-     expand_args = expands[0].arg
-   else:
-     # otherwise, we sort them and GEP
-     expand_args = tuple(x for x in sorted(dedup(flatten(expands_args))) if x[0] not in exclude_args)
-   expand_sz = prod([x[1] for x in expand_args])
-   new_srcs = []
-   for i,src in enumerate(root.src):
-     if src.op is Ops.UNROLL:
-       if root.op is Ops.IF and i == 0:
-         # IF means OR on first arg to IF
-         new_srcs.append(functools.reduce(operator.__or__, [src.src[0].gep(i) for i in range(expand_sz)]))
-       elif expand_args == src.arg:
-         # just remove the expand
-         new_srcs.append(src.src[0])
-       else:
-         lst = _swizzle_args(expand_args, src.arg, exclude_args)
-         # if the base dtype is > 1, put those at the end
-         if src.dtype.count > 1: lst = flatten([[i*src.dtype.count+j for j in range(src.dtype.count)] for i in lst])
-         new_srcs.append(src.src[0].gep(tuple(lst)))
-     else:
-       # non-UNROLL input
-       if root.op is Ops.IF:
-         # for the first arg of IF, just pass them through ignoring UNROLLS
-         new_srcs.append(src)
-       elif src.dtype.count > 1:
-         # put any input dtype > 1 grouped together
-         new_srcs.append(UOp(Ops.VECTORIZE,
-                             src.dtype.scalar().vec(expand_sz*src.dtype.count), tuple(src.gep(i) for i in range(src.dtype.count))*expand_sz))
-       else:
-         # repeat the arg
-         new_srcs.append(src.broadcast(expand_sz))
-
-   new_arg = root.arg
-   if root.op is Ops.GEP:
-     assert root.dtype.count == 1
-     # is this right?
-     new_arg = tuple(range(root.arg[0], new_srcs[0].dtype.count, new_srcs[0].dtype.count // expand_sz))
-   nsrc = UOp(root.op, root.dtype.scalar().vec(root.dtype.count*expand_sz), tuple(new_srcs), new_arg)
-   return UOp(Ops.UNROLL, root.dtype, (nsrc,), expand_args)
-
- def do_contract(con:UOp):
-   ex = con.src[0]
-   # CONTRACT without UNROLL repeats the element VECTORIZED
-   if ex.op is not Ops.UNROLL: return UOp(Ops.VECTORIZE, con.dtype, con.src*con.dtype.count)
-   # CONTRACT may remove several axes from UNROLL
-   assert con.dtype.count == prod([x[1] for x in con.arg]), "dtype is wrong"
-   idxs = []
-   for rpk in _choices_from_args(new_ex_args:=tuple(x for x in ex.arg if x not in con.arg)):
-     idxs += [_expand_arg_to_idx(ex.arg, {**rpk, **lrpk}) for lrpk in _choices_from_args(con.arg)]
-   return UOp(Ops.UNROLL, con.dtype, (ex.src[0].gep(tuple(idxs)),), new_ex_args)
-
- def no_vectorized_alu(alu):
-   if alu.dtype.vcount == 1: return None
-   alus = tuple(UOp(alu.op, alu.dtype.scalar(), tuple(s.gep(i) for s in alu.src), alu.arg) for i in range(alu.dtype.vcount))
-   return UOp(Ops.VECTORIZE, alu.dtype, alus)
-
- def create_gate(root:UOp) -> UOp|None:
-   @functools.lru_cache(None)
-   def _gate_srcs(u:UOp, gate:UOp) -> UOp:
-     if u.op is Ops.BARRIER: return u
-     if u.op is Ops.LOAD and u.src[-1].op is Ops.BARRIER:
-       return UOp(u.op, u.dtype, u.src[:-1]+(UOp(Ops.IF, dtypes.void, (gate, u.src[-1])),), u.arg)
-     return u if (replace_source:=tuple(_gate_srcs(x, gate) for x in u.src)) == u.src else UOp(u.op, u.dtype, replace_source, u.arg)
-   idx = root.src[0]
-   if idx.op is Ops.CAST: idx = idx.src[0]
-   return None if idx.op is not Ops.INDEX or len(idx.src) == 2 or (ret:=_gate_srcs(root, idx.src[2])) is root else ret
-
- expander = PatternMatcher([
-   # double expand
-   (UPat(Ops.UNROLL, name="outer", src=(UPat(Ops.UNROLL, name="inner"),)),
-    lambda outer, inner: UOp(Ops.UNROLL, outer.dtype, (inner.src[0],), inner.arg+outer.arg)),
-   # do expansion
-   (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.GEP, Ops.WMMA, Ops.LOAD, Ops.STORE, Ops.INDEX, Ops.ASSIGN,
-          Ops.VECTORIZE, Ops.IF), name="root", custom_early_reject=set([Ops.UNROLL])), do_expand),
-   (UPat(Ops.CONTRACT, name="con"), do_contract),
-   # vectorize DEFINE_ACC
-   (UPat(Ops.VECTORIZE, src=UPat(Ops.DEFINE_ACC, name="acc"), name="v"), lambda acc,v: acc.replace(dtype=v.dtype)),
-   # BARRIERs aren't actually expanded
-   (UPat(Ops.BARRIER, src=(UPat(Ops.UNROLL, name="ex"),)),
-    lambda ex: UOp(Ops.UNROLL, dtypes.void, (UOp(Ops.BARRIER, dtypes.void, ex.src),)*len(ex.src), ex.arg)),
-   # empty UNROLL is NOOP
-   (UPat(Ops.UNROLL, src=(UPat.var('x'),), arg=()), lambda x: x),
-   # UNROLL GEP (needed for WMMA, generalize this) -> vectorized ALU
-   (UPat(Ops.UNROLL, name="ex", src=tuple(UPat.var('x').gep(i)+UPat.var('y').gep(i) for i in range(256 if AMX else 8))),
-    lambda ex,x,y: UOp(Ops.UNROLL, ex.dtype, tuple((x+y).gep(i) for i in range(256 if AMX else 8)), ex.arg)),
- ])
-
- def no_vectorized_load_store(ls:UOp):
-   idx = ls.src[0]
-   assert isinstance(idx.dtype, PtrDType)
-   if idx.dtype.v == 1: return None
-   tv = [UOp(ls.op, ls.dtype.scalar(), tuple(j.gep(i) for j in ls.src)) for i in range(idx.dtype.v)]
-   return UOp(Ops.VECTORIZE, ls.dtype, tuple(tv))
-
- def no_vectorized_acc(acc:UOp):
-   if acc.dtype.count == 1: return None
-   alus = tuple(UOp(acc.op, acc.dtype.scalar(),
-                    tuple(s.gep(i) if j == 0 else s for j,s in enumerate(acc.src)), acc.arg+(i,)) for i in range(acc.dtype.count))
-   return UOp(Ops.VECTORIZE, acc.dtype, alus)
-
- devectorize = PatternMatcher([
-   # no ALU on vectorized dtypes
-   (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.ASSIGN, Ops.INDEX), name="alu"), no_vectorized_alu),
-   (UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
-   (UPat(Ops.DEFINE_ACC, name="acc"), no_vectorized_acc),
-   (UPat((Ops.LOAD, Ops.STORE), name="ls"), no_vectorized_load_store),
- ])
-
- def delete_redundant_gates(buf:UOp, idx:UOp, val:UOp, store_gate:UOp, cast:UOp|None=None) -> UOp|None:
-   if store_gate not in [gate.src[0] for gate in val.toposort if gate.op is Ops.IF]: return None
-   # remove the gate from the index
-   return UOp.store(buf.index(idx).cast(cast.dtype) if cast is not None else buf.index(idx), val)
-
- load_store_indexing = PatternMatcher([
-   # late fixup of unfoldable image loads
-   (UPat(Ops.LOAD, src=(UPat.var("buf"), UPat()), allow_any_len=True, name="load"), fix_unfoldable_image_load),
-   # simplify valid
-   (UPat(Ops.AND, name="valid"), simplify_valid),
-   # image load valid idx simplification
-   (UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("start_idx"), UPat.var("valid"))), simplify_valid_load),
-   # delete_redundant_gates (after expand)
-   (UPat(Ops.STORE, src=(UPat.any(stidx:=UPat.var("buf").index(UPat.var("idx"), UPat.var("store_gate")), stidx.cast().named("cast")),
-                         UPat.var("val"))), delete_redundant_gates),
- ])
-
- migrate_indexing = PatternMatcher([
-   # create gate MUST BE BEFORE expander
-   (UPat(Ops.STORE, name="root"), create_gate),
- ])
-
- def move_mask(x:UOp, buf:UOp, idx:UOp, mask:UOp, cast:UOp|None=None) -> UOp:
-   # this moves the mask from the indexing to the load/store op for rendering
-   nidx = buf.index(idx).cast(cast.dtype) if cast is not None else buf.index(idx)
-   return UOp.load(nidx, x.const_like(0), mask, *x.src[1:], dtype=x.dtype) if x.op is Ops.LOAD else UOp.store(nidx, x.src[1], mask, *x.src[2:])
-
- pm_render = PatternMatcher([
-   # for rendering, we use explicit VECTORIZE
-   (UPat(Ops.CONST, name='c'),
-    lambda c: UOp(Ops.VECTORIZE, c.dtype, (UOp.const(c.dtype.scalar(), c.arg),)*c.dtype.vcount) if c.dtype.vcount > 1 else None),
-   (UPat(Ops.VCONST, name='c'), lambda c: UOp(Ops.VECTORIZE, c.dtype, tuple(UOp.const(c.dtype.scalar(), x) for x in c.arg))),
-   (UPat(Ops.GEP, name='gep'), lambda gep: UOp(Ops.VECTORIZE, gep.dtype, tuple(gep.src[0].gep(x) for x in gep.arg)) if len(gep.arg) > 1 else None),
-   (UPat(Ops.VECTORIZE, src=(UPat(name='x'),)), lambda x: x),
-   # move masks of loads/stores
-   (UPat((Ops.LOAD, Ops.STORE), src=(UPat.any(masked_index:=UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var("idx"), UPat.var("mask"))),
-                                              masked_index.cast(None).named("cast")),), allow_any_len=True, name="x"), move_mask),
-   # gate any stores that aren't gated with ifs
-   (UPat(Ops.STORE, dtype=dtypes.void, src=(UPat(), UPat(), UPat(dtype=dtypes.bool)), name="store"),
-    lambda store: UOp(Ops.STORE, src=store.src[:2]+(UOp(Ops.IF, src=(store.src[2],)),))),
- ])
-
- # *** uop graph ***
-
- def full_graph_rewrite(sink:UOp, opts:Optional[Renderer]=None) -> UOp:
-   assert sink.op is Ops.SINK, f"sink isn't sink, it's {sink.op}"
-   supported_ops = tuple(opts.code_for_op.keys()) if opts is not None else ()
-   extra_matcher = opts.extra_matcher if opts is not None and opts.extra_matcher is not None else PatternMatcher([])
-
-   # initial symbolic + migrate indexing (remove this)
-   sink = graph_rewrite(sink, sym+migrate_indexing)
-
-   # expand
-   sink = graph_rewrite(sink, sym+expander)
-
-   # devectorize + load_store_indexing + mulacc_unrolled, mulacc_unrolled must be last because it can break loop_collapse
-   sink = graph_rewrite(sink, sym+(devectorize+float4_folding if opts is not None and opts.supports_float4 else devectorize)+load_store_indexing+
-     mulacc_unrolled)
-
-   # final rules for the renderer (without sym)
-   sink = graph_rewrite(sink, symbolic_simple+get_late_rewrite_patterns(supported_ops, TRANSCENDENTAL>=2)+pm_render+extra_matcher)
-   return sink
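
In 0.10.2 the file above is gone; judging from the file list, its contents were split across the new codegen/devectorizer.py, codegen/expander.py, and codegen/symbolic.py. One detail worth calling out from the deleted code: threefry2x32 writes its 32-bit rotations as (x * 2**r) + (x // 2**(32 - r)) rather than with shift ops, presumably so get_late_rewrite_patterns can later lower the power-of-two mul/div to SHL/SHR only on backends that advertise them. A standalone plain-Python check of that identity (no tinygrad imports, just the arithmetic):

def rotl32(x: int, r: int) -> int:
    # same trick as threefry2x32: multiply brings low bits up, floor-divide wraps high bits around
    return ((x * 2**r) + (x // 2**(32 - r))) & 0xffffffff

# high bit of 0x80000001 wraps around to bit 0: rotate-left by 1 gives 0x00000003
assert rotl32(0x80000001, 1) == 0x00000003
# agrees with the usual shift/or formulation for every rotation constant used above
assert all(rotl32(x, r) == ((x << r) | (x >> (32 - r))) & 0xffffffff
           for x in (0, 1, 0xdeadbeef, 0xffffffff) for r in (13, 15, 26, 6, 17, 29, 16, 24))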
tinygrad-0.10.1.dist-info/RECORD
@@ -1,86 +0,0 @@
- tinygrad/__init__.py,sha256=2Jhg7NSWlegCi4OAfGW0iHRVHeqMx09f7446rwAmc60,587
- tinygrad/device.py,sha256=mUrxoZJfqBJepDeqbmS2Y8UbOB_UPERqP9zBN7sCBk8,18968
- tinygrad/dtype.py,sha256=010zGuqXwUoyrhe23nJQ5oHmJt_6CRL7Hcl0kWjvKbo,9843
- tinygrad/gradient.py,sha256=hVyRMnwzjtWwKfF0NMGQovE2I7_2GdKFyR9pWJn4eE4,4280
- tinygrad/helpers.py,sha256=DIofGx-mg-umNKjhNpINcrxZExUwo6JIKHgmFuKNLUM,19203
- tinygrad/ops.py,sha256=T1gmR1ywWPnGEcw-MPAW5SGEvNS_789PtnFrR8rz5H4,70614
- tinygrad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/spec.py,sha256=Wi8PdgeIHzv6WCTwKT9zGP5_53Bt_QtmvGTjIzTuwi0,8953
- tinygrad/tensor.py,sha256=-DVucOwq8HmM7IHqW7ZNABy7-vYZI-YfF0Vzdo0rs70,181559
- tinygrad/codegen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/codegen/kernel.py,sha256=4TcuMvvGODd0zFBuhK9KzvQhwzknRgeCL_n9WrOpljQ,42432
- tinygrad/codegen/linearize.py,sha256=X9aOMhvimDTvMq2Vnj3zfGNIRhJnAG5mLcq6EeUfvvU,10382
- tinygrad/codegen/lowerer.py,sha256=DqFg53oALqWG7RygOjXEioyGk52BnGbCzvQSqiOIKGw,7527
- tinygrad/codegen/rewriter.py,sha256=T3tOn-T1IFqiP90GCqfmkr0V5i0knr2tjQfKPVr6zzM,29948
- tinygrad/codegen/transcendental.py,sha256=0qRaEtIoJKDfjPqvQWexShZW3F__wtzfjhU__BqiMD8,13112
- tinygrad/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/engine/jit.py,sha256=6D8pCpRPZNANwFEHAsAMUgUaEwaCf4mB_1oem6oZZ7o,16630
- tinygrad/engine/memory.py,sha256=UyiNYIoUjtcJ1SX6ApoCnbrSKZpbBbhfwr13TIaiqEM,3231
- tinygrad/engine/multi.py,sha256=FnyCvwFTdBXiydXB0dD5Y63KnerCurZS_o8WOU5fHFM,10371
- tinygrad/engine/realize.py,sha256=vOQ33sTn_P2oT0GRFU5tKYLfiRmxrIIdVs27v0_a40g,9690
- tinygrad/engine/schedule.py,sha256=9JywWlmVo912JmbIF9r5ajr5-Sesqw1EhE-CYnk5LHI,27407
- tinygrad/engine/search.py,sha256=VOsUOPaI7zWtDgL2SJDiKSPaTs3MhjgZOJBZ5_ISeiY,12093
- tinygrad/nn/__init__.py,sha256=BAxMz-g7v-v1A32KaBzmGiaEnvQ_dU9d3KoPdYYwLDQ,15156
- tinygrad/nn/datasets.py,sha256=wcT0Qrlpw_RzM7uBy8uphzKAjGT-ZE48fiP-0g3WvI0,1042
- tinygrad/nn/optim.py,sha256=qfdYKi_ssX5O_DU6h8GJ0WCzBzAZLyyS3p_946PJNsQ,6816
- tinygrad/nn/state.py,sha256=zXFMwAw7sf35C9x3iAxYiPsmhk7_S6qPcX3wXxqp6Bw,16030
- tinygrad/renderer/__init__.py,sha256=KvZ3y7MnqKDHOKoTcxghT95FhTBJuzs3IFEBM077jw8,6989
- tinygrad/renderer/cstyle.py,sha256=TrmxtioR7Lgw_8CK-Ay9EA0HjS1TXRf-RaMsCsuHszw,29852
- tinygrad/renderer/llvmir.py,sha256=iHFjE9-GtH2sGD_feCGe7aGECPHJ5qGOyuuyCDxSEXU,8529
- tinygrad/renderer/ptx.py,sha256=is4PMkdbDDgKsTy67DtkugatdhbtVMijKE9u1f6-0ag,14899
- tinygrad/renderer/wgsl.py,sha256=3AGZp2jvymxLZSJHjzD7zlwxvds36-lSu2ZkntrW_ww,6697
- tinygrad/runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/runtime/ops_amd.py,sha256=zRictmcgUPw6d8iykz6QWqCmiPirOh-hTACT-GYxso8,37282
- tinygrad/runtime/ops_clang.py,sha256=InJPVVhY10bxRXOaDQRUBlWafSZRdo72Y0VVW9O6dBw,1310
- tinygrad/runtime/ops_cloud.py,sha256=1umLappzYKg0ECC9C_EN1lzXquAQVqY7wj-mS0vTcSk,10362
- tinygrad/runtime/ops_cuda.py,sha256=26ko2bTLT_jDmU2i8_xiL4BomD9krzvlz6wY5FE-e5c,7138
- tinygrad/runtime/ops_disk.py,sha256=3eTpJSVQQH32KBa7tRLBm2pPSkCaIC1adPc748UkUeg,6619
- tinygrad/runtime/ops_dsp.py,sha256=JCSehSDLzaa8Nhd6Xtas0d5vOCgwQGY4U3awHOLThEI,16900
- tinygrad/runtime/ops_gpu.py,sha256=VrY9iM5i44OQwTJE8sgUavCtr1TOpE96I5-JmqUFz-E,8972
- tinygrad/runtime/ops_hip.py,sha256=MbR4depxgHcaGpOJGvUCiLq5tdbpaDiqs-Xj41rY2xQ,3730
- tinygrad/runtime/ops_llvm.py,sha256=yjW5wVMJdmayeeTQwnQ60mtOWiiYEqvtb9Rt5cDj2rU,3239
- tinygrad/runtime/ops_metal.py,sha256=rz3fiaiB1by_RuP7VFVBgGA1mye43_-6PIfMfeQ3YmU,13229
- tinygrad/runtime/ops_npy.py,sha256=8VNf1S5M_MRk9d3GxSsTPbfEz7I_aOwl7QMZ1mUG3As,370
- tinygrad/runtime/ops_nv.py,sha256=8NvxtEc89dbJ5xgVhGs9_zeGMztp9HRZHLwPqylDsdQ,34541
- tinygrad/runtime/ops_python.py,sha256=5CZek-7fMhq6w07lGDM0pXltZzzMMRtuU2x5PeWMyWY,11681
- tinygrad/runtime/ops_qcom.py,sha256=Dt4hgAd6o13CxsOFRSt7lHY6bCOTLvtQpOr_jx_lYbc,22565
- tinygrad/runtime/ops_webgpu.py,sha256=-uBAWhVZ7T_9zG2v6PWjkJLpovrClqb_XAPk-l83ryc,4345
- tinygrad/runtime/autogen/adreno.py,sha256=u7VxIomPAlW3nFUs4gSTe-6ijam_ywkvDM9OuTLF-j8,897915
- tinygrad/runtime/autogen/amd_gpu.py,sha256=Iasq-zYiv8bvT43dtvPO1W5jaLEQ3d6hP0CoFVhSsak,3977783
- tinygrad/runtime/autogen/comgr.py,sha256=3pp3XyqEJDBLa9XtGx2-Gc1iJgBbbgIq4pdFEpYXT44,39874
- tinygrad/runtime/autogen/cuda.py,sha256=N0QyaMvQumr_HZh7fusCHM1d4o4mYti3Wq1MN7JSKr8,243920
- tinygrad/runtime/autogen/hip.py,sha256=1yUHDCwL3KkD15if2Q1Ud3GbJiR7DxsNorKZTCINw54,245532
- tinygrad/runtime/autogen/hsa.py,sha256=7Hsrn17HmChyeFOSX_3Fnzl9c0COtq2Z2ExqGu5FNiU,277716
- tinygrad/runtime/autogen/io_uring.py,sha256=ZIZ2YnQkLr8WIHMieBw9Dv-NZ1ar9TwdP4YBv3gJm28,59786
- tinygrad/runtime/autogen/kfd.py,sha256=VdhuG4qec0EgM-jJmWcdTS-8WrmywNkcjSX7ibbmvdk,30866
- tinygrad/runtime/autogen/kgsl.py,sha256=2EgJ5Kst4oRUv81hsV2srgwPvWpY-weaSB4E2lGMAyc,50656
- tinygrad/runtime/autogen/libc.py,sha256=xKJk2hCzVpauJSc8wCQis5x3SwcXnDli7_HyRUqEGRc,197318
- tinygrad/runtime/autogen/libpciaccess.py,sha256=zaVmkoUHVTEQcPQwkFpgMCgX4HtX-BL-MGMbi8XtgCI,84194
- tinygrad/runtime/autogen/llvm.py,sha256=aeVd_ByohxbGRyqXzShPOupI2xtcdk34I6_OIBrMQHg,467606
- tinygrad/runtime/autogen/nv_gpu.py,sha256=9X2tPdv2E5JmXGZeT8i9jL19YJ4ETTsYwfU_Wn9mTwc,1679326
- tinygrad/runtime/autogen/nvrtc.py,sha256=19te2-TW5suFy85KnJox3CPOmeeml5YxqIDeL-Bx_m4,23132
- tinygrad/runtime/autogen/opencl.py,sha256=NL6fa8P3KC_McNZ8g2babdr3b8vrY-bFK0qzNAtL-rE,82656
- tinygrad/runtime/autogen/qcom_dsp.py,sha256=jx36-zC6reTuWgfbHCrKVjOZcF4Q9fBnq3CuTbxztQk,61848
- tinygrad/runtime/autogen/vfio.py,sha256=IJV1eeWWllU6b9LAX_IH0bUW5NDzfhPQy_YzXGhD9-8,32431
- tinygrad/runtime/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/runtime/graph/cuda.py,sha256=vLjT_c93G6ia_1MsbYWP5Uq96Aeko0AOskRkwT5-MUI,4818
- tinygrad/runtime/graph/hcq.py,sha256=kKu2YnjAZU40XMACSbbcxJSi2xdTg3OYLO2zcPLyAf0,12600
- tinygrad/runtime/graph/metal.py,sha256=6JN7WJr9w1pIvQGYr0Rsnyg-SA-slK61jKV__KmHnXg,6001
- tinygrad/runtime/support/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/runtime/support/allocator.py,sha256=INW6TaJxMi4cwMEDtkta2wFH3kJ87wy3SyFFu3_jJ9w,4721
- tinygrad/runtime/support/compiler_cuda.py,sha256=6cU1OMMW3aOUFNVALpDYWKXh-zFc5q81PSSQhRK9fLw,5471
- tinygrad/runtime/support/compiler_hip.py,sha256=fbRP82UdG4T-KCRYH_H2hEXlMFeHIJntSnY35ZWE5JY,4398
- tinygrad/runtime/support/elf.py,sha256=AxWyaAVEe4xdSTiISqIf80oaHRwUZwb5b1V_Q3874s8,3857
- tinygrad/runtime/support/hcq.py,sha256=JWLRwoGPZCgRBryNb6uq0qCcfkVfMmNnlnHmCzPFxFI,21901
- tinygrad/runtime/support/llvm.py,sha256=-qI8NRmSx2dBZnG-OQCTWfmy0XySj_T8kHNRHULTyG4,2174
- tinygrad/runtime/support/am/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/runtime/support/am/amdev.py,sha256=LD80Bc9DHdl99bxrfwn2c43m1NdDD0mjPnSOJkEfkgU,20433
- tinygrad/runtime/support/am/ip.py,sha256=WnSIZWSG9IvSjVYLJv-VShW_X6vcziynj__lvTOt4yQ,24531
- tinygrad/shape/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tinygrad/shape/shapetracker.py,sha256=clrYjN-zEZHSh_biDy3jnHL4Fq9HyWCdH_CRpwLKki0,7612
- tinygrad/shape/view.py,sha256=7KJwP2lS1YWhMq80Ka6_h_qavyxSEm9jWloVgHYRx-k,18110
- tinygrad-0.10.1.dist-info/LICENSE,sha256=ABRhUPEILzINYIukgazD-_rPipkUNUwslrb0RxnV6Xc,1058
- tinygrad-0.10.1.dist-info/METADATA,sha256=JMb7PpBcZqsf0tbmNJzvBqUrH6U1JER_0mz20UMo0LA,11241
- tinygrad-0.10.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- tinygrad-0.10.1.dist-info/top_level.txt,sha256=vDABMCWBFQnx2kn9Azueu88FP-1klQdePoHikQhHymc,9
- tinygrad-0.10.1.dist-info/RECORD,,
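
For reference, each RECORD entry above is path,hash,size as defined by the wheel spec: the hash field is "sha256=" plus the unpadded urlsafe-base64 SHA-256 digest of the file, and the RECORD file itself is listed with empty hash and size fields (the last line). A minimal sketch of how one such line is formed:

import base64, hashlib

def record_line(path: str, data: bytes) -> str:
    # unpadded urlsafe base64 of the sha256 digest, per the wheel RECORD format
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

# a zero-byte file hashes to 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
# matching every empty __init__.py and the py.typed entry in the listing above
assert record_line("tinygrad/py.typed", b"") == \
    "tinygrad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0"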