PyPI - tinygrad - Versions diffs - 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

tinygrad 0.9.1 (py3-none-any.whl) → 0.10.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

tinygrad/__init__.py +11 -6
tinygrad/codegen/kernel.py +308 -175
tinygrad/codegen/linearize.py +95 -0
tinygrad/codegen/lowerer.py +143 -0
tinygrad/codegen/transcendental.py +257 -0
tinygrad/codegen/uopgraph.py +506 -0
tinygrad/device.py +72 -171
tinygrad/dtype.py +122 -47
tinygrad/engine/jit.py +184 -87
tinygrad/{lazy.py → engine/lazy.py} +74 -66
tinygrad/engine/memory.py +51 -0
tinygrad/engine/realize.py +86 -61
tinygrad/engine/schedule.py +366 -317
tinygrad/engine/search.py +58 -47
tinygrad/function.py +59 -58
tinygrad/helpers.py +120 -102
tinygrad/multi.py +82 -78
tinygrad/nn/__init__.py +116 -67
tinygrad/nn/datasets.py +12 -5
tinygrad/nn/optim.py +1 -1
tinygrad/nn/state.py +91 -6
tinygrad/ops.py +1126 -143
tinygrad/renderer/__init__.py +47 -23
tinygrad/renderer/cstyle.py +338 -265
tinygrad/renderer/llvmir.py +125 -143
tinygrad/renderer/ptx.py +225 -0
tinygrad/runtime/autogen/adreno.py +17904 -0
tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
tinygrad/runtime/autogen/cuda.py +6 -162
tinygrad/runtime/autogen/io_uring.py +97 -63
tinygrad/runtime/autogen/kfd.py +60 -47
tinygrad/runtime/autogen/kgsl.py +1386 -0
tinygrad/runtime/autogen/libc.py +5462 -0
tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
tinygrad/runtime/autogen/nvrtc.py +579 -0
tinygrad/runtime/autogen/opencl.py +11 -11
tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
tinygrad/runtime/graph/clang.py +3 -3
tinygrad/runtime/graph/cuda.py +11 -15
tinygrad/runtime/graph/hcq.py +120 -107
tinygrad/runtime/graph/metal.py +71 -43
tinygrad/runtime/ops_amd.py +244 -323
tinygrad/runtime/ops_clang.py +12 -5
tinygrad/runtime/ops_cloud.py +220 -0
tinygrad/runtime/ops_cuda.py +42 -99
tinygrad/runtime/ops_disk.py +25 -26
tinygrad/runtime/ops_dsp.py +181 -0
tinygrad/runtime/ops_gpu.py +29 -16
tinygrad/runtime/ops_hip.py +68 -0
tinygrad/runtime/ops_llvm.py +15 -10
tinygrad/runtime/ops_metal.py +147 -64
tinygrad/runtime/ops_nv.py +356 -397
tinygrad/runtime/ops_python.py +78 -79
tinygrad/runtime/ops_qcom.py +405 -0
tinygrad/runtime/support/__init__.py +0 -0
tinygrad/runtime/support/compiler_cuda.py +77 -0
tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
tinygrad/runtime/support/elf.py +38 -0
tinygrad/runtime/support/hcq.py +539 -0
tinygrad/shape/shapetracker.py +40 -50
tinygrad/shape/view.py +102 -63
tinygrad/tensor.py +1109 -365
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
tinygrad-0.10.0.dist-info/RECORD +77 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
tinygrad/codegen/linearizer.py +0 -528
tinygrad/codegen/uops.py +0 -451
tinygrad/engine/graph.py +0 -100
tinygrad/renderer/assembly.py +0 -269
tinygrad/shape/symbolic.py +0 -327
tinygrad-0.9.1.dist-info/RECORD +0 -63
/tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
{tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0

tinygrad/renderer/assembly.py DELETED Viewed

@@ -1,269 +0,0 @@
-from typing import DefaultDict, Dict, List, Union, Optional, cast, Callable
-import struct, math
-from collections import defaultdict
-from tinygrad.helpers import DEBUG
-from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps, Op
-from tinygrad.dtype import dtypes, DType, PtrDType, ConstType
-from tinygrad.codegen.uops import UOps, UOp, UOpGraph, PatternMatcher, UPat
-from tinygrad.renderer import Renderer, TensorCore
-def render_val(x, dtype):
-  if dtypes.is_float(dtype):
-    if dtype == dtypes.double: return "0d%02X%02X%02X%02X%02X%02X%02X%02X" % tuple(struct.pack("d",x)[::-1])
-    if dtype == dtypes.half: return "0x%02X%02X" % tuple(struct.pack("e",x)[::-1])
-    return "0f%02X%02X%02X%02X" % tuple(struct.pack("f",x)[::-1])
-  return str(int(x)) + ("U" if dtypes.is_unsigned(dtype) else "")
-class PTXRenderer(Renderer):
-  device = "CUDA"
-  suffix = "PTX"
-  global_max = (2147483647, 65535, 65535)
-  local_max = (1024, 1024, 64)
-  shared_max = 49152
-  tensor_cores = [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)])] # noqa: E501
-  def __init__(self, arch:str): self.tensor_cores = PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else []
-  # language options
-  kernel_prefix = """.version VERSION
-.target TARGET
-.address_size 64
-.visible .entry"""
-  barrier = "bar.sync\t0;"
-  gid = [f'%ctaid.{chr(120+i)}' for i in range(3)]
-  gdim = [f'%nctaid.{chr(120+i)}' for i in range(3)]
-  lid = [f'%tid.{chr(120+i)}' for i in range(3)]
-  asm_for_op: Dict[Op, Callable] = {
-    UnaryOps.NEG: lambda d,a,dt,name: f"not.pred {d}, {a};" if name == "pred" else f"sub.{name} {d}, 0, {a};" if dtypes.is_unsigned(dt) \
-      else f"neg.{name} {d}, {a};",
-    UnaryOps.RECIP: lambda d,a,dt,name: f"rcp{'.approx' if dtypes.is_float(dt) else ''}.{name} {d}, {a};",
-    UnaryOps.EXP2: lambda d,a,dt,name: f"ex2.approx.{name} {d}, {a};", UnaryOps.LOG2: lambda d,a,dt,name: f"lg2.approx.{name} {d}, {a};",
-    UnaryOps.SIN: lambda d,a,dt,name: f"sin.approx.{name} {d}, {a};", UnaryOps.SQRT: lambda d,a,dt,name: f"sqrt.approx.{name} {d}, {a};",
-    BinaryOps.SHR: lambda d,a,b,dt,name: f"shr.{name} {d}, {a}, {b};", BinaryOps.SHL: lambda d,a,b,dt,name: f"shl.b{name[1:]} {d}, {a}, {b};",
-    BinaryOps.ADD: lambda d,a,b,dt,name: f"{'or' if name == 'pred' else 'add'}.{name} {d}, {a}, {b};",
-    BinaryOps.MUL: lambda d,a,b,dt,name: ('and' if dt == dtypes.bool else 'mul') + f"{'.lo' if dtypes.is_int(dt) else ''}.{name} {d}, {a}, {b};",
-    BinaryOps.XOR: lambda d,a,b,dt,name: f"xor.pred {d}, {a}, {b};" if name == "pred" else f"xor.b{name[1:]} {d}, {a}, {b};",
-    BinaryOps.IDIV: lambda d,a,b,dt,name: f"div.{name} {d}, {a}, {b};",
-    BinaryOps.MAX: lambda d,a,b,dt,name: f"max.{name} {d}, {a}, {b};", BinaryOps.MOD: lambda d,a,b,dt,name: f"rem.{name} {d}, {a}, {b};",
-    BinaryOps.CMPLT: lambda d,a,b,dt,name: f"setp.lt.{name} {d}, {a}, {b};",
-    BinaryOps.CMPNE: lambda d,a,b,dt,name: f"setp.ne.{name} {d}, {a}, {b};",
-    TernaryOps.MULACC: lambda d,a,b,c,dt,name: f"{'fma.rn' if dtypes.is_float(dt) else 'mad.lo'}.{name} {d}, {a}, {b}, {c};",
-    TernaryOps.WHERE: lambda d,a,b,c,dt,name:
-      f"@{a} mov.{name} {d}, {b};\n@!{a} mov.{name} {d}, {c};" if name == "pred" else f"selp.{'b16' if name == 'f16' else name} {d}, {b}, {c}, {a};"
-  }
-  supports_half: List[Op] = [UnaryOps.NEG, UnaryOps.EXP2, BinaryOps.ADD, BinaryOps.MUL, BinaryOps.MAX, BinaryOps.CMPLT,
-                             TernaryOps.WHERE]
-  # HACK: Use s16 and u16 for int8 and uint8 buffers. This can be wrong in cast.
-  types: Dict[DType, str] = { dtypes.int8: "s16", dtypes.int16: "s16", dtypes.int32: "s32", dtypes.int64: "s64",
-                              dtypes.uint8: "u16", dtypes.uint16: "u16", dtypes.uint32: "u32", dtypes.uint64: "u64",
-                              dtypes.float16: "f16", dtypes.float32: "f32", dtypes.float64: "f64", dtypes.bool: "pred" }
-  mem_types: Dict[DType, str] =  types.copy()
-  mem_types.update({dtypes.int8: "s8", dtypes.uint8: "u8", dtypes.bool: "u8", dtypes.float16: "b16"})
-  const_requires_mov: List[DType] = [dtypes.half, dtypes.bool]
-  def render_const(self, x:ConstType, dtype:DType, mov=None) -> Union[List[str], str]:
-    val = render_val(x, dtype)
-    if dtype == dtypes.bool: return [f"setp.ne.s16 {mov}, {val}, 0;"]
-    return [f"mov.b{self.types[dtype][1:]} {mov}, {val};"] if mov else val
-  def render_local(self, dest, name, size, dtype) -> List[str]:
-    return [f".shared .align 4 .b8 {name}[{size*dtype.itemsize}];", f"mov.u64 {dest}, {name}[0];"]
-  def render_loop(self, idx, start, label, acc=None) -> List[str]: return [f"mov.u32 {idx}, {start};", f"{label}:"]
-  def render_bra(self, b1, pred=None) -> List[str]: return [f"@{pred} bra {b1};"] if pred else [f"bra {b1};"]
-  def render_load(self, loc, dest, dtype, gate=None, alt=None, ss="", offset=0) -> List[str]:
-    assert dtype != dtypes.bool
-    if gate: return [f"@{gate} ld{ss}.{self.mem_types[dtype]} {dest}, [{loc}+{offset}];", f"@!{gate} mov.b{self.types[dtype][1:]} {dest}, {alt};"]
-    return [f"ld{ss}.{self.mem_types[dtype]} {dest}, [{loc}+{offset}];"]
-  def render_store(self, loc, val, dtype, gate=None, ss="", offset=0) -> List[str]:
-    return [(f"@{gate} " if gate else "") + f"st{ss}.{self.mem_types[dtype]} [{loc}+{offset}], {val};"]
-  def render_cast(self, d:str, a:str, dtype:DType, atype:DType, bitcast=False, pred=False) -> List[str]:
-    if bitcast: return [f"mov.b{self.types[dtype][1:]} {d}, {a};"]
-    if atype == dtypes.bool: return[f"selp.b{self.types[dtype][1:]} {d}, {render_val(1, dtype)}, {render_val(0, dtype)}, {a};"]
-    if dtype == dtypes.bool: return [f"setp.ne.b{self.types[atype][1:]} {d}, {a}, {self.render_const(0, atype)};"]
-    rnd = ('.rzi' if dtypes.is_int(dtype) and dtypes.is_float(atype) else
-           '.rn' if dtypes.is_float(dtype) and (dtype.itemsize < atype.itemsize or dtypes.is_int(atype) or atype == dtypes.bool) else '')
-    return [f"cvt{rnd}.{self.types[dtype]}.{self.types[atype]} {d}, {a};"]
-  def render_kernel(self, kernel, function_name, bufs, regs) -> str:
-    kernel = [f".reg .{reg.split('_')[-2]} %{reg}<{cnt}>;" for reg,cnt in regs] + kernel + ["ret;"]
-    def fmt(line): return line if line[0]=="$" else "\t" + line.replace(" ", "\t" if len(line.split(" ")[0]) > 7 else "\t\t", 1)
-    return (f"{self.kernel_prefix} {function_name}(\n\t" +
-            ',\n\t'.join([f".param .{'u64' if dtype.__class__ == PtrDType else self.types[dtype]} {name}" for name,dtype in bufs]) + "\n)\n{\n" +
-            '\n'.join([fmt(line) for op in kernel for line in op.splitlines()]) +
-            "\n}")
-  def render(self, name:str, uops:UOpGraph) -> str:
-    kernel:List[str] = []
-    bufs = []
-    uops.linearize(ptx_matcher)
-    if DEBUG >= 4: uops.print()
-    def kk(*s: str): kernel.append("\n".join(s))
-    c: DefaultDict[str, int] = defaultdict(int)
-    r: Dict[UOp, Union[List[str], str]] = {}
-    def ssa(prefix:str, u:Optional[UOp]=None, dtype:Optional[str]=None) -> str:
-      nonlocal c, r
-      prefix += f"_{dtype if dtype is not None else self.types[cast(DType, cast(UOp, u).dtype)]}_"
-      c[prefix] += 1
-      if u is not None: r[u] = f"%{prefix}{c[prefix]-1}"
-      return f"%{prefix}{c[prefix]-1}"
-    def const(x:ConstType, dtype:DType, mov=False):
-      if mov or dtype in self.const_requires_mov:
-        kk(*self.render_const(x, dtype, mov=(out:=ssa('const', dtype=self.types[dtype]))))
-        return out
-      return self.render_const(x, dtype)
-    def _cast(a, dtype:DType, atype:DType, bitcast=False, u=None, pred=False):
-      if atype == dtype or isinstance(atype, PtrDType):
-        if u: r[u] = a
-        return a
-      kk(*self.render_cast((ret:=ssa('cast', u, self.types[dtype])), a, dtype, atype, bitcast))
-      return ret
-    for u in uops:
-      uop,dtype,src,args = u.op,u.dtype,u.src,u.arg
-      if uop is UOps.IF:
-        assert src[0].dtype is not None
-        kk(*self.render_bra(f"IF_{r[src[0]][1:]}_{cast(List, uops._uops).index(u)}", _cast(r[src[0]], dtypes.bool, src[0].dtype, u=u, pred=True)))
-      elif uop is UOps.BARRIER and self.barrier: kk(self.barrier)
-      elif uop is UOps.ENDRANGE:
-        kk(self.asm_for_op[BinaryOps.ADD](r[src[0]], r[src[0]], "1", dtypes.int, self.types[dtypes.int]),
-            self.asm_for_op[BinaryOps.CMPLT](pred:=ssa("pred", dtype="pred"), r[src[0]], r[src[0].src[1]], dtypes.int, self.types[dtypes.int]))
-        kk(*self.render_bra(f"LOOP_{r[src[0]][1:]}", pred))
-      elif uop is UOps.ENDIF:
-        kk(f"IF_{r[src[0].src[0]][1:]}_{cast(List, uops._uops).index(src[0])}:")
-      elif uop is UOps.STORE:
-        assert src[0].dtype is not None and src[2].dtype is not None
-        assert src[0].dtype == dtypes.int64, "store isn't int64"
-        assert src[1].op is UOps.CONST, f"store isn't const {u}"
-        mem_type = '.shared' if src[0].op is UOps.DEFINE_LOCAL or any(x.op is UOps.DEFINE_LOCAL for x in src[0].parents) else '.global'
-        if src[2].dtype.count > 1:
-          kk((f"@{r[src[3]]} " if len(src)>3 else "") + \
-              f"st{mem_type}.v{src[2].dtype.count}.{self.mem_types[src[2].dtype.scalar()]} [{r[src[0]]}+{src[1].arg}], {{{', '.join(r[src[2]])}}};")
-        else:
-          kk(*self.render_store(r[src[0]], r[src[2]], src[2].dtype, gate=r[src[3]] if len(src)>3 else None, ss=mem_type, offset=src[1].arg))
-      else:
-        assert dtype is not None, f"None dtype for uop {uop}"
-        if uop is UOps.RANGE: kk(*self.render_loop(loop:=ssa('ridx', u), r[src[0]], "LOOP_"+loop[1:]))
-        elif uop is UOps.ALU:
-          assert src[0].dtype is not None
-          if args is BinaryOps.CMPLT or args is BinaryOps.CMPNE:
-            # pass in the other dtype here
-            kk(self.asm_for_op[args](ssa("alu", u), *[r[x] for x in src], src[0].dtype, self.types[src[0].dtype]))
-          else:
-            kk(self.asm_for_op[args](ssa("alu", u), *[r[x] for x in src], dtype, self.types[dtype]))
-        elif uop is UOps.DEFINE_ACC:
-          if dtype.count > 1:
-            r[u] = [ssa('acc', dtype=self.types[dtype.scalar()]) for _ in range(dtype.count)]
-            for uu in r[u]: kk(f"mov.b{self.types[dtype.scalar()][1:]} {uu}, {const(src[0].arg, dtype.scalar())};")
-          else: kk(f"mov.b{self.types[dtype][1:]} {ssa('acc', u)}, {const(src[0].arg, dtype)};")
-        elif uop is UOps.SPECIAL:
-          assert args[1][0] != "i", "idx not supported"
-          kk(f"mov.u32 %{args[1]}, {(self.gid if args[1][0] == 'g' else self.lid)[args[0]]};")
-          r[u] = "%" + args[1]
-          kernel = [f".reg .u32 %{args[1]};"] + kernel
-        elif uop is UOps.CONST:
-          if dtype.count > 1: r[u] = [const(args, dtype.scalar(), mov=True) for _ in range(dtype.count)]
-          else: r[u] = const(args, dtype, mov=True)
-        elif uop is UOps.GEP: r[u] = r[src[0]][u.arg]
-        elif uop is UOps.LOAD:
-          assert src[0].dtype == dtypes.int64, "load isn't int64"
-          assert src[1].op is UOps.CONST, f"load isn't const {u}"
-          mem_type = '.shared' if src[0].op is UOps.DEFINE_LOCAL or any(x.op is UOps.DEFINE_LOCAL for x in src[0].parents) else '.global'
-          if dtype.count > 1:
-            r[u] = [ssa('val', dtype=self.types[dtype.scalar()]) for _ in range(dtype.count)]
-            if(len(src)>3):
-              for v in r[u]: kk(f"mov.{self.mem_types[dtype.scalar()]} {v}, {render_val(0, dtype.scalar())};")
-            kk((f"@{r[src[2]]}"if len(src) > 3 else "")
-              + f" ld{mem_type}.v{dtype.count}.{self.mem_types[dtype.scalar()]} {{{', '.join(r[u])}}}, [{r[src[0]]}+{src[1].arg}];")
-          else:
-            kk(*self.render_load(r[src[0]], ssa('val', u), dtype, gate=r[src[2]] if len(src) > 3 else None,
-                                alt=r[src[3]] if len(src) > 3 else None, ss=mem_type, offset=src[1].arg))
-        elif uop is UOps.PHI:
-          if dtype.count > 1:
-            for x0, x1 in zip(r[src[0]], r[src[1]]): kk(f"mov.b{self.types[dtype.scalar()][1:]} {x0}, {x1};")
-          else:
-            kk(f"mov.b{self.types[dtype][1:]} {r[src[0]]}, {r[src[1]]};")
-          r[u] = r[src[0]]
-        elif uop in {UOps.CAST, UOps.BITCAST}:
-          assert src[0].dtype is not None
-          if dtype.count>1: r[u] = [r[x] for x in src] # type: ignore
-          else: _cast(r[src[0]], dtype, src[0].dtype, bitcast=uop is UOps.BITCAST, u=u)
-        elif uop is UOps.DEFINE_LOCAL:
-          # TODO: we should sum these, and fetch 0xC000 from somewhere
-          assert args[1]*dtype.itemsize <= 0xC000, "too large local"
-          kk(*self.render_local(ssa('local', u, self.types[dtypes.ulong]), args[0], args[1], dtype))
-        elif uop is UOps.DEFINE_VAR:
-          bufs.append((args.expr, dtype))
-          r[u] = f"%{args.expr}"
-          kk(*self.render_load(args.expr, ssa('dat', u, self.types[dtype]), dtype, ss=".param"))
-        elif uop is UOps.DEFINE_GLOBAL:
-          bufs.append((nm:=f"data{args[0]}", dtype))
-          r[u] = f"%{nm}"
-          dt = dtypes.ulong if dtype.__class__ == PtrDType else dtype
-          kk(*self.render_load(nm, ssa('dat', u, self.types[dt]), dt, ss=".param"))
-        elif uop is UOps.WMMA:
-          wmma = []
-          for vv in src[:2]:
-            for i in range(0, len(r[vv]), 2):
-              wmma.append(ssa("wmma", dtype="b32"))
-              kk(f'mov.b32 {wmma[-1]}, {{{", ".join(r[vv][i:i+2])}}};')
-          r[u] = [ssa("wmma", dtype=self.types[dtype.scalar()]) for _ in range(dtype.count)]
-          kk(f'mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32\
-            {{{", ".join(r[u])}}}, {{{", ".join(wmma[:4])}}}, {{{", ".join(wmma[4:])}}}, {{{", ".join(r[src[2]])}}};')
-        else: raise NotImplementedError(f"no code for {uop}")
-    return self.render_kernel(kernel, name, bufs, c.items())
-ptx_matcher = PatternMatcher([
-  (UPat(UOps.ALU, BinaryOps.MUL, name="root", dtype=set([dt for dt in dtypes.fields().values() if dtypes.is_int(dt)]),
-      src=[UPat(UOps.CONST, set([2**i for i in range(64)]), name="const"), UPat(name="mul")]),
-    lambda root, mul, const: UOp(UOps.ALU, root.dtype, (mul, UOp.const(dtypes.int, int(math.log2(const.arg)))), BinaryOps.SHL)),
-  (UPat(UOps.ALU, BinaryOps.IDIV, name="root", dtype=set([dt for dt in dtypes.fields().values() if dtypes.is_int(dt)]),
-      src=[UPat(UOps.CONST, set([2**i for i in range(64)]), name="const"), UPat(name="div")]),
-    lambda root, div, const: UOp(UOps.ALU, root.dtype, (div, UOp.const(dtypes.int, int(math.log2(const.arg)))), BinaryOps.SHR)),
-  (UPat(UOps.ALU, BinaryOps.CMPNE, (UPat(dtype=dtypes.bool),UPat()), "root"), lambda root: UOp(root.op, root.dtype, root.src, BinaryOps.XOR)),
-  (UPat(UOps.ALU, BinaryOps.CMPLT, (UPat(name="x", dtype=dtypes.bool),UPat(name="y")), "root"),
-    lambda root,x,y: UOp(root.op, root.dtype, (UOp(UOps.ALU, dtypes.bool, (x,), UnaryOps.NEG), y), BinaryOps.MUL)),
-  (UPat(UOps.ALU, BinaryOps.ADD,
-    [UPat(name="non_muls"), UPat(UOps.ALU, BinaryOps.MUL, name="muls")], "root"),
-    lambda root, muls, non_muls: UOp(UOps.ALU, root.dtype, muls.src + (non_muls,), TernaryOps.MULACC)),
-  *[(UPat(UOps.ALU, op, dtype=dtypes.half, name="x"),
-    lambda x: UOp(UOps.CAST, dtypes.half, (UOp(x.op, dtypes.float32, tuple([UOp(UOps.CAST, dtypes.float32, (vv,)) for vv in x.src]), x.arg),)))
-    for op in PTXRenderer.asm_for_op.keys() if op not in PTXRenderer.supports_half],
-  (UPat(UOps.LOAD, name="root", dtype=dtypes.bool, src=(UPat(name="x"),UPat(name="y"),UPat(name="z"),UPat(name="k"))),
-    lambda root,x,y,z,k: UOp(UOps.CAST, dtypes.bool, (UOp(root.op, dtypes.int8, (x,y,z,UOp(UOps.CAST, dtypes.uint8, (k,)))),), root.arg)),
-  (UPat(UOps.LOAD, name="root", dtype=dtypes.bool, src=(UPat(),UPat())),
-    lambda root: UOp(UOps.CAST, dtypes.bool, (UOp(root.op, dtypes.uint8, root.src, root.arg),))),
-  (UPat(UOps.STORE, name="root", src=(UPat(),UPat(),UPat(name="z",dtype=dtypes.bool), UPat())),
-    lambda root,z: UOp(root.op, root.dtype, root.src[:2] + (UOp(UOps.CAST, dtypes.uint8, (z,)),), root.arg)),
-  (UPat(UOps.STORE, name="root", src=(UPat(),UPat(),UPat(name="z",dtype=dtypes.bool))),
-    lambda root,z: UOp(root.op, root.dtype, root.src[:2] + (UOp(UOps.CAST, dtypes.uint8, (z,)),), root.arg)),
-  (UPat(UOps.STORE, name="root", src=(UPat(),UPat(),UPat(),UPat(name="g", dtype=dtypes.int))),
-    lambda root,g: UOp(root.op, root.dtype, root.src[:3] + (UOp(UOps.CAST, dtypes.bool, (g,)),), root.arg)),
-  # ptr_ar (load/store)
-  (UPat({UOps.LOAD, UOps.STORE}, name="root", allow_len={2,3,4,5}, src=(UPat({UOps.DEFINE_LOCAL,UOps.DEFINE_GLOBAL}),
-                               UPat(UOps.ALU, BinaryOps.ADD, src=[UPat(name="alu"), UPat(UOps.CONST, name="const")]))),
-    lambda root, alu, const: UOp(root.op, root.dtype,
-      (alu.cast(dtypes.int64)*UOp.const(dtypes.int64, root.src[0].dtype.itemsize)+root.src[0].cast(dtypes.int64),
-       UOp.const(const.dtype, root.src[0].dtype.itemsize)*const)+root.src[2:])),
-  (UPat({UOps.LOAD, UOps.STORE}, name="root", allow_len={2,3,4,5}, src=(UPat({UOps.DEFINE_LOCAL,UOps.DEFINE_GLOBAL}),
-                                                                              UPat(UOps.CONST, name="const"))),
-    lambda root, const: UOp(root.op, root.dtype, (root.src[0].cast(dtypes.int64),
-                                UOp.const(dtypes.int64, const.arg * root.src[0].dtype.itemsize),
-                                                  )+root.src[2:])),
-  (UPat({UOps.LOAD, UOps.STORE}, name="root", allow_len={2,3,4,5}, src=(UPat({UOps.DEFINE_LOCAL,UOps.DEFINE_GLOBAL}),
-                                                                              UPat(name="alu"))),  # no const here
-    lambda root, alu: UOp(root.op, root.dtype,
-      (alu.cast(dtypes.int64)*UOp.const(dtypes.int64, root.src[0].dtype.itemsize)+root.src[0].cast(dtypes.int64),
-        UOp.const(dtypes.int64, 0))+root.src[2:])),
-])

tinygrad/shape/symbolic.py DELETED Viewed

@@ -1,327 +0,0 @@
-from __future__ import annotations
-import functools
-from math import gcd
-from tinygrad.helpers import partition
-from typing import List, Dict, Callable, Tuple, Type, Union, Optional, Any, Set, Mapping
-# NOTE: Python has different behavior for negative mod and floor div than c
-# symbolic matches the Python behavior, but the code output is agnostic, and will never have negative numbers in div or mod
-class Node:
-  b: Union[Node, int]
-  min: int
-  max: sint
-  def render(self, ops=None, ctx=None) -> Any:
-    if ops is None: ops = render_python
-    assert self.__class__ in (Variable, NumNode) or self.min != self.max
-    return ops[type(self)](self, ops, ctx)
-  def vars(self) -> Set[Variable]: return set()
-  # substitute Variables with the values in var_vals
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: raise RuntimeError(self.__class__.__name__)
-  def unbind(self) -> Tuple[Node, Optional[int]]: return self.substitute({v: v.unbind()[0] for v in self.vars() if v.val is not None}), None
-  @functools.cached_property
-  def key(self) -> str: return self.render(ctx="DEBUG")
-  def __repr__(self): return self.render(ctx="REPR")
-  def __str__(self): return "<"+self.key+">"
-  def __hash__(self): return hash(self.key)
-  def __bool__(self): return not (self.max == self.min == 0)
-  def __eq__(self, other:object) -> bool:
-    if not isinstance(other, Node): return NotImplemented
-    return self.key == other.key
-  def __neg__(self): return self*-1
-  def __add__(self, b:Union[Node,int]): return Node.sum([self, NumNode(b) if isinstance(b, int) else b])
-  def __radd__(self, b:int): return self+b
-  def __sub__(self, b:Union[Node,int]): return self+-b
-  def __rsub__(self, b:int): return -self+b
-  def __le__(self, b:Union[Node,int]): return self < (b+1)
-  def __gt__(self, b:Union[Node,int]): return (-self) < (-b)
-  def __ge__(self, b:Union[Node,int]): return (-self) < (-b+1)
-  def __lt__(self, b:Union[Node,int]): return create_node(LtNode(self, b))
-  def __mul__(self, b:Union[Node, int]):
-    if b == 0: return NumNode(0)
-    if b == 1: return self
-    return create_node(MulNode(self, b.b)) if isinstance(b, NumNode) else create_node(MulNode(self, b))
-  def __rmul__(self, b:int): return self*b
-  # *** complex ops ***
-  def __rfloordiv__(self, b:int): return NumNode(b) // self
-  def __floordiv__(self, b:Union[Node,int], factoring_allowed=True):
-    if isinstance(b, Node):
-      if b.__class__ is NumNode: return self.__floordiv__(b.b, factoring_allowed)
-      if self == b: return NumNode(1)
-      if (b - self).min > 0 and self.min >= 0: return NumNode(0) # b - self simplifies the node
-      raise RuntimeError(f"not supported: {self} // {b}")
-    assert b != 0
-    if b < 0: return (self*-1).__floordiv__(-b, factoring_allowed)
-    if b == 1: return self
-    # the numerator of div is not allowed to be negative
-    if self.min < 0:
-      offset = self.min//b
-      # factor out an "offset" to make the numerator positive. don't allowing factoring again
-      return (self + -offset*b).__floordiv__(b, factoring_allowed=False) + offset
-    return create_node(DivNode(self, b))
-  def __rmod__(self, b:int): return NumNode(b) % self
-  def __mod__(self, b:Union[Node,int]):
-    if isinstance(b, Node):
-      if b.__class__ is NumNode: return self % b.b
-      if self == b: return NumNode(0)
-      if (b - self).min > 0 and self.min >= 0: return self # b - self simplifies the node
-      raise RuntimeError(f"not supported: {self} % {b}")
-    assert b > 0
-    if b == 1: return NumNode(0)
-    if isinstance(self.max, int) and isinstance(self.min, int):
-      if self.min >= 0 and self.max < b: return self
-      if (self.min//b) == (self.max//b): return self - (b*(self.min//b))
-      if self.min < 0: return (self - ((self.min//b)*b)) % b
-    return create_node(ModNode(self, b))
-  @staticmethod
-  def sum(nodes:List[Node]) -> Node:
-    nodes = [x for x in nodes if x.max or x.min]
-    if not nodes: return NumNode(0)
-    if len(nodes) == 1: return nodes[0]
-    mul_groups: Dict[Node, int] = {}
-    num_node_sum = 0
-    for node in SumNode(nodes).flat_components:
-      if node.__class__ is NumNode: num_node_sum += node.b
-      elif node.__class__ is MulNode: mul_groups[node.a] = mul_groups.get(node.a, 0) + node.b
-      else: mul_groups[node] = mul_groups.get(node, 0) + 1
-    new_nodes = [MulNode(a, b_sum) if b_sum != 1 else a for a, b_sum in mul_groups.items() if b_sum != 0]
-    if num_node_sum: new_nodes.append(NumNode(num_node_sum))
-    return create_node(SumNode(new_nodes)) if len(new_nodes) > 1 else new_nodes[0] if len(new_nodes) == 1 else NumNode(0)
-  @staticmethod
-  def ands(nodes:List[Node]) -> Node:
-    if not nodes: return NumNode(1)
-    if len(nodes) == 1: return nodes[0]
-    if any(not x for x in nodes): return NumNode(0)
-    # filter 1s
-    nodes = [x for x in nodes if x.min != x.max]
-    return create_node(AndNode(nodes)) if len(nodes) > 1 else (nodes[0] if len(nodes) == 1 else NumNode(1))
-# 4 basic node types
-class Variable(Node):
-  def __new__(cls, *args):
-    expr, nmin, nmax = args
-    assert nmin >= 0 and nmin <= nmax, f"invalid Variable {expr=} {nmin=} {nmax=}"
-    if nmin == nmax: return NumNode(nmin)
-    return super().__new__(cls)
-  def __getnewargs__(self): return (self.expr, self.min, self.max)  # args passed to __new__ when unpickling
-  def __init__(self, expr:str, nmin:int, nmax:sint):
-    self.expr, self.min, self.max = expr, nmin, nmax
-    self._val: Optional[int] = None
-  @property
-  def val(self):
-    assert self._val is not None, f"Variable isn't bound, can't access val of {self}"
-    return self._val
-  def bind(self, val):
-    assert self._val is None and self.min<=val<=self.max, f"cannot bind {val} to {self}"
-    self._val = val
-    return self
-  def unbind(self) -> Tuple[Variable, int]:
-    assert self.val is not None, f"cannot unbind {self}"
-    return Variable(self.expr, self.min, self.max), self.val
-  def vars(self): return {self}
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return var_vals.get(self, self)
-class NumNode(Node):
-  def __init__(self, num:int):
-    assert isinstance(num, int), f"{num} is not an int"
-    self.b:int = num
-    self.min, self.max = num, num
-  def bind(self, val):
-    assert self.b == val, f"cannot bind {val} to {self}"
-    return self
-  def __mul__(self, b:Union[Node,int]): return NumNode(self.b*b) if isinstance(b, int) else b*self.b
-  def __eq__(self, other): return self.b == other
-  def __hash__(self): return hash(self.b)  # needed with __eq__ override
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self
-def create_node(ret:Node):
-  assert ret.min <= ret.max, f"min greater than max! {ret.min} {ret.max} when creating {type(ret)} {ret}"
-  if ret.min == ret.max: return NumNode(ret.min)
-  return ret
-def create_lt_node(lhs:Node, b:Union[Node, int]):
-  if isinstance(lhs, SumNode):
-    if isinstance(b, int):
-      new_sum = []
-      for x in lhs.nodes:
-        # TODO: should we just force the last one to always be the number
-        if isinstance(x, NumNode): b -= x.b
-        else: new_sum.append(x)
-      lhs = Node.sum(new_sum)
-      nodes = lhs.nodes if isinstance(lhs, SumNode) else [lhs]
-      assert all(not isinstance(node, MulNode) or isinstance(node.b, int) for node in nodes), "not supported"
-      muls, others = partition(nodes, lambda x: isinstance(x, MulNode) and x.b > 0 and x.max >= b)
-      if muls:
-        # NOTE: gcd in python 3.8 takes exactly 2 args
-        mul_gcd = b
-        for x in muls: mul_gcd = gcd(mul_gcd, x.b)  # type: ignore  # mypy cannot tell that x.b is int here due to assert above
-        all_others = Node.sum(others)
-        if all_others.min >= 0 and all_others.max < mul_gcd:
-          lhs, b = Node.sum([mul//mul_gcd for mul in muls]), b//mul_gcd
-    return create_node(LtNode(lhs, b)) if isinstance(lhs, SumNode) else create_lt_node(lhs, b)
-  if isinstance(lhs, MulNode):
-    if isinstance(b, Node) or isinstance(lhs.b, Node) or lhs.b == -1: return create_node(LtNode(lhs, b))
-    sgn = 1 if lhs.b > 0 else -1
-    return create_node(LtNode(lhs.a*sgn, (b + abs(lhs.b) - 1)//abs(lhs.b)))
-  return create_node(LtNode(lhs, b))
-def create_ge_node(lhs:Node, b:Union[Node, int]): return create_lt_node(-lhs, -b+1)
-class OpNode(Node):
-  def __init__(self, a:Node, b:Union[Node, int]):
-    self.a, self.b = a, b
-    self.min, self.max = self.get_bounds()
-  def vars(self): return self.a.vars() | (self.b.vars() if isinstance(self.b, Node) else set())
-  def get_bounds(self) -> Tuple[int, sint]: raise NotImplementedError("must be implemented")
-class LtNode(OpNode):
-  def get_bounds(self) -> Tuple[int, int]:
-    if self.a == self.b: return (0, 0)
-    if isinstance(self.b, int): return (1, 1) if self.a.max < self.b else (0, 0) if self.a.min >= self.b else (0, 1)
-    return (1, 1) if self.a.max < self.b.min else (0, 0) if self.a.min >= self.b.max else (0, 1)
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
-    return create_lt_node(self.a.substitute(var_vals), (self.b if isinstance(self.b, int) else self.b.substitute(var_vals)))
-class MulNode(OpNode):
-  def __mul__(self, b: Union[Node, int]): return self.a*(self.b*b) # two muls in one mul
-  def __floordiv__(self, b: Union[Node, int], factoring_allowed=False): # NOTE: mod negative isn't handled right
-    if self.b % b == 0: return self.a*(self.b//b)
-    if b % self.b == 0 and self.b > 0: return self.a//(b//self.b)
-    return Node.__floordiv__(self, b, factoring_allowed)
-  def __mod__(self, b: Union[Node, int]): return Node.__mod__(self.a * (self.b%b), b)
-  def get_bounds(self) -> Tuple[int, sint]:
-    assert self.a.min >= 0
-    if isinstance(self.b, int): return (self.a.min*self.b, self.a.max*self.b) if self.b >= 0 else (self.a.max*self.b, self.a.min*self.b)
-    return (self.a.min*self.b.min, self.a.max*self.b.max) if self.b.min >= 0 else (self.a.max*self.b.min, self.a.min*self.b.max)
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
-    return self.a.substitute(var_vals) * (self.b if isinstance(self.b, int) else self.b.substitute(var_vals))
-class DivNode(OpNode):
-  def __floordiv__(self, b: Union[Node, int], _=False): return self.a//(self.b*b) # two divs is one div
-  def get_bounds(self) -> Tuple[int, sint]:
-    assert self.a.min >= 0 and isinstance(self.b, int)
-    return self.a.min//self.b, self.a.max//self.b
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self.a.substitute(var_vals) // self.b
-class ModNode(OpNode):
-  def __mod__(self, b: Union[Node, int]):
-    if isinstance(b, int) and isinstance(self.b, int) and self.b % b == 0: return self.a % b
-    return Node.__mod__(self, b)
-  def __floordiv__(self, b: Union[Node, int], factoring_allowed=True):
-    return (self.a//b) % (self.b//b) if self.b % b == 0 else Node.__floordiv__(self, b, factoring_allowed)
-  def get_bounds(self) -> Tuple[int, sint]:
-    assert self.a.min >= 0 and isinstance(self.b, int)
-    if self.a.max - self.a.min >= self.b or (self.a.min != self.a.max and self.a.min%self.b >= self.a.max%self.b): return (0, self.b-1)
-    return (self.a.min%self.b, self.a.max%self.b)
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node: return self.a.substitute(var_vals) % self.b
-class RedNode(Node):
-  def __init__(self, nodes:List[Node]):
-    self.nodes = nodes
-    self.min, self.max = self.get_bounds()
-  def vars(self) -> Set[Variable]: return set.union(*[x.vars() for x in self.nodes], set())
-  def get_bounds(self) -> Tuple[int, sint]: raise NotImplementedError("must be implemented")
-class SumNode(RedNode):
-  def get_bounds(self) -> Tuple[int, sint]: return sum([x.min for x in self.nodes]), sum([x.max for x in self.nodes])
-  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def __mul__(self, b: Union[Node, int]): return Node.sum([x*b for x in self.nodes]) # distribute mul into sum
-  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def __floordiv__(self, b: Union[Node, sint], factoring_allowed=True):
-    if self == b: return NumNode(1)
-    fully_divided: List[Node] = []
-    rest: List[Node] = []
-    if isinstance(b, Node):
-      for x in self.flat_components:
-        if x % b == 0: fully_divided.append(x // b)
-        else: rest.append(x)
-      if (sum_fully_divided:=create_node(SumNode(fully_divided))) != 0: return sum_fully_divided + create_node(SumNode(rest)) // b
-      return Node.__floordiv__(self, b, False)
-    if b == 1: return self
-    if not factoring_allowed: return Node.__floordiv__(self, b, factoring_allowed)
-    _gcd = b
-    divisor = 1
-    for x in self.flat_components:
-      if x.__class__ in (NumNode, MulNode):
-        if x.b % b == 0: fully_divided.append(x // b)
-        else:
-          if x.__class__ is NumNode and (div := x.b // b):
-            fully_divided.append(NumNode(div))
-            x = NumNode(x.b - b * div)
-          rest.append(x)
-          if isinstance(x.b, int):
-            _gcd = gcd(_gcd, x.b)
-            if x.__class__ == MulNode and divisor == 1 and b % x.b == 0: divisor = x.b
-          else:
-            _gcd = 1
-      else:
-        rest.append(x)
-        _gcd = 1
-    if _gcd > 1: return Node.sum(fully_divided) + Node.sum(rest).__floordiv__(_gcd) // (b//_gcd)
-    if divisor > 1: return Node.sum(fully_divided) + Node.sum(rest).__floordiv__(divisor) // (b//divisor)
-    return Node.sum(fully_divided) + Node.__floordiv__(Node.sum(rest), b)
-  @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def __mod__(self, b: Union[Node, int]):
-    if self == b: return NumNode(0)
-    if isinstance(b, Node) and (b - self).min > 0: return self # b - self simplifies the node
-    new_sum = Node.sum([node%b if node.__class__ in (NumNode, MulNode) else node for node in self.nodes])
-    return Node.__mod__(new_sum, b)
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
-    return Node.sum([node.substitute(var_vals) for node in self.nodes])
-  # recursively expand sumnode components
-  # TODO: can remove this if there's no SumNode inside SumNode
-  @property
-  def flat_components(self): return [y for x in self.nodes for y in (x.flat_components if isinstance(x, SumNode) else [x])]
-class AndNode(RedNode):
-  def get_bounds(self) -> Tuple[int, sint]: return min([x.min for x in self.nodes]), max([x.max for x in self.nodes])
-  def substitute(self, var_vals: Mapping[Variable, Union[NumNode, Variable]]) -> Node:
-    subed = []
-    for node in self.nodes:
-      if not (sub:=node.substitute(var_vals)): return NumNode(0)
-      subed.append(sub)
-    return Node.ands(subed)
-def sym_render(a: Union[Node, int], ops=None, ctx=None) -> str: return str(a) if isinstance(a, int) else a.render(ops, ctx)
-def sym_infer(a: Union[Node, int], var_vals: Optional[Dict[Variable, int]]) -> int:
-  if isinstance(a, (int, float)): return a
-  ret = a.substitute({k:NumNode(v) for k, v in var_vals.items()}) if var_vals is not None else a
-  assert isinstance(ret, NumNode), f"sym_infer didn't produce NumNode from {a} with {var_vals}"
-  return ret.b
-# symbolic int, these are allowed in a Tensor shape
-sint = Union[int, Variable, MulNode, SumNode]
-def render_mulnode(node:MulNode, ops, ctx):
-  # TODO: add ProdNode and remove this case
-  if isinstance(node.a,Variable) and isinstance(node.b,Variable) and node.a.expr and node.b.expr and node.b.expr < node.a.expr:
-    return f"({sym_render(node.b,ops,ctx)}*{node.a.render(ops,ctx)})"
-  return f"({node.a.render(ops,ctx)}*{sym_render(node.b,ops,ctx)})"
-render_python: Dict[Type, Callable[..., str]] = {
-  Variable: lambda self,ops,ctx: f"{self.expr}[{self.min}-{self.max}{'='+str(self.val) if self._val is not None else ''}]" if ctx == "DEBUG" \
-    else (f"Variable('{self.expr}', {self.min}, {self.max})"+(f".bind({self.val})" if self._val is not None else '') if ctx == "REPR" \
-    else f"{self.expr}"),
-  NumNode: lambda self,ops,ctx: f"NumNode({self.b})" if ctx == "REPR" else f"{self.b}",
-  MulNode: render_mulnode,
-  DivNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}//{self.b})",
-  ModNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}%{self.b})",
-  LtNode: lambda self,ops,ctx: f"({self.a.render(ops,ctx)}<{sym_render(self.b,ops,ctx)})",
-  SumNode: lambda self,ops,ctx: f"({'+'.join(sorted([x.render(ops,ctx) for x in self.nodes]))})",
-  AndNode: lambda self,ops,ctx: f"({' and '.join(sorted([x.render(ops,ctx) for x in self.nodes]))})",
-}

tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

tinygrad 0.9.1py3-none-any.whl → 0.10.0py3-none-any.whl