tinygrad-0.9.1-py3-none-any.whl → tinygrad-0.9.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/kernel.py +248 -115
- tinygrad/codegen/lowerer.py +215 -0
- tinygrad/codegen/transcendental.py +310 -0
- tinygrad/codegen/uopgraph.py +622 -0
- tinygrad/codegen/uops.py +235 -393
- tinygrad/device.py +428 -69
- tinygrad/dtype.py +18 -4
- tinygrad/engine/graph.py +19 -32
- tinygrad/engine/jit.py +148 -70
- tinygrad/engine/realize.py +127 -51
- tinygrad/engine/schedule.py +259 -216
- tinygrad/engine/search.py +29 -22
- tinygrad/function.py +9 -0
- tinygrad/helpers.py +87 -49
- tinygrad/lazy.py +34 -35
- tinygrad/multi.py +41 -36
- tinygrad/nn/__init__.py +39 -22
- tinygrad/nn/state.py +3 -3
- tinygrad/ops.py +63 -62
- tinygrad/renderer/__init__.py +43 -21
- tinygrad/renderer/assembly.py +104 -106
- tinygrad/renderer/cstyle.py +87 -60
- tinygrad/renderer/llvmir.py +21 -30
- tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/graph/clang.py +2 -2
- tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +18 -15
- tinygrad/runtime/ops_amd.py +197 -305
- tinygrad/runtime/ops_clang.py +2 -2
- tinygrad/runtime/ops_cuda.py +36 -94
- tinygrad/runtime/ops_disk.py +3 -7
- tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad/runtime/ops_hip.py +70 -0
- tinygrad/runtime/ops_metal.py +38 -27
- tinygrad/runtime/ops_nv.py +283 -363
- tinygrad/runtime/ops_python.py +26 -30
- tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/shape/shapetracker.py +5 -14
- tinygrad/shape/symbolic.py +4 -8
- tinygrad/shape/view.py +34 -22
- tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
- tinygrad-0.9.2.dist-info/RECORD +70 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/runtime/{driver → support}/__init__.py +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/multi.py
CHANGED
````diff
@@ -1,9 +1,9 @@
 from __future__ import annotations
-from typing import Optional, Union, Any, Tuple, List
+from typing import Optional, Union, Any, Tuple, List, Dict
 import functools, itertools, operator
-from tinygrad.helpers import all_same, all_int, dedup,
+from tinygrad.helpers import all_same, all_int, dedup, prod, DEBUG, RING, getenv
 from tinygrad.dtype import DType, ConstType
-from tinygrad.ops import BinaryOps,
+from tinygrad.ops import BinaryOps, MetaOps, UnaryOps, TernaryOps, ReduceOps
 from tinygrad.lazy import LazyBuffer
 from tinygrad.shape.shapetracker import sint
 
@@ -15,7 +15,7 @@ def all_reduce(op: ReduceOps, lbs: List[LazyBuffer]) -> List[LazyBuffer]:
   n_lbs, dim = len(lbs), prod(lbs[0].shape)
   # Ring allreduce doesn't provide a benefit with only 2 nodes or where number of elements is less than 256k (empirically)
   # so just fallback to naive allreduce to save on kernel dispatch, chunking and reassembling chunks.
-  use_ring = (RING >= 2 or (n_lbs > 2 and dim > 256_000 and RING >= 1))
+  use_ring = (RING >= 2 or (n_lbs > 2 and dim > getenv("RING_ALLREDUCE_THRESHOLD", 256_000) and RING >= 1))
   if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{dim} | {lbs[0].dtype}")
   if not use_ring:
     return [functools.reduce(lambda x,y: x.e(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs]
@@ -42,10 +42,9 @@ def all_reduce(op: ReduceOps, lbs: List[LazyBuffer]) -> List[LazyBuffer]:
   pads = [((s,dim-e),) for s,e in chunks]
   return [functools.reduce(lambda x,y: x.e(BinaryOps.ADD, y), [c.pad(pads[i]) for i,c in enumerate(lb_c)]).reshape(lbs[0].shape) for lb_c in chunked]
 
-def to_sharded(lbs:List[LazyBuffer], axis:int) -> List[LazyBuffer]:
-  if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}")
-
-  return [lb.shrink(tuple((0,s) if a != axis else (min(s,sz*i),min(s,sz*(i+1))) for a,s in enumerate(lb.shape))) for i,lb in enumerate(lbs)]
+def to_sharded(lbs:List[LazyBuffer], axis:int, bounds: Tuple[Tuple[int, int], ...]) -> List[LazyBuffer]:
+  if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}, bounds={bounds}")
+  return [lb.shrink(tuple((0,s) if a != axis else bound for a,s in enumerate(lb.shape))) for i, (bound, lb) in enumerate(zip(bounds, lbs))]
 
 class MultiLazyBuffer:
   def __init__(self, lbs:List[LazyBuffer], axis:Optional[int], real:Optional[List[bool]]=None):
@@ -54,11 +53,10 @@ class MultiLazyBuffer:
     self.lbs, self.axis, self.dtype, self.device, self.real = lbs, axis, lbs[0].dtype, tuple(x.device for x in lbs), real or [True]*len(lbs)
     if axis is not None:
       splits = list(itertools.accumulate([lb.shape[axis] for lb in lbs], initial=0))
-      self.bounds =
+      self.bounds = tuple(zip(splits, splits[1:]))
 
   @property
-  def shape(self):
-    return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
+  def shape(self): return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
 
   @property
   def size(self): return sum(x.size for x in self.real_lbs)
@@ -66,13 +64,13 @@ class MultiLazyBuffer:
   @property
   def real_lbs(self): return [lb for lb,r in zip(self.lbs, self.real) if r]
 
-  def __repr__(self):
-    return f"<MLB {self.axis=} {self.real=} {chr(10)}{chr(10).join([f'{x.device} {x.st}' for x in self.lbs])}>"
+  def __repr__(self): return f"<MLB {self.axis=} {self.real=} {chr(10)}{chr(10).join([f'{x.device} {x.st}' for x in self.lbs])}>"
 
   @staticmethod
-  def from_sharded(lb:LazyBuffer, devices:Tuple[str, ...], axis:Optional[int]
-
-
+  def from_sharded(lb:LazyBuffer, devices:Tuple[str, ...], axis:Optional[int], bounds:Optional[Tuple[Tuple[int, int], ...]]):
+    assert (axis is None) == (bounds is None), "must specify bounds iff axis is specified"
+    lbs = [lb] * len(devices)
+    sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
     return MultiLazyBuffer([lb if lb.is_unrealized_unmasked_const() else lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
 
   def copy_to_device(self, device:str) -> LazyBuffer:
@@ -80,7 +78,8 @@ class MultiLazyBuffer:
       # if we already have a copy on the device, return that
       for lb in self.real_lbs:
         if lb.device == device: return lb
-      return self.
+      return self.real_lbs[0].copy_to_device(device)
+    # copy lbs to device, pad to final shape, and sum
     llbs:List[LazyBuffer] = []
     for lb,real,(start,end) in zip(self.lbs, self.real, self.bounds):
       if not real: continue
@@ -90,32 +89,32 @@ class MultiLazyBuffer:
 
   # passthroughs
   def is_realized(self) -> bool: return all(lb.base.realized is not None for lb, r in zip(self.lbs, self.real) if r is True)
-  def cast(self, dtype:DType, bitcast:bool=False
+  def cast(self, dtype:DType, bitcast:bool=False, allow_buffer_view=True):
+    return MultiLazyBuffer([x.cast(dtype, bitcast, allow_buffer_view) for x in self.lbs], self.axis, self.real)
   def const(self, val:ConstType) -> MultiLazyBuffer: return MultiLazyBuffer([x.const(val) for x in self.lbs], self.axis, self.real)
   def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
   def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
 
   # elementwise is simple
-  def e(self, op:Union[
+  def e(self, op:Union[MetaOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:MultiLazyBuffer, arg:Optional[Any]=None) -> MultiLazyBuffer:
     msrcs = (self,)+in_srcs
     assert all(isinstance(x, MultiLazyBuffer) for x in msrcs), f"all buffers must be MultiLazyBuffer {msrcs}"
     assert all_same([x.device for x in msrcs]), f"all buffers must have the same device {[x.device for x in msrcs]}"
 
     # NOTE: they all have to share an axis, we always choose [-1]
-    axis = axes[-1] if len(axes := dedup([x.axis for x in msrcs if x.axis is not None])) else None
-    srcs = []
+    axis, bounds = axes[-1] if len(axes := dedup([(x.axis, x.bounds) for x in msrcs if x.axis is not None])) else (None, None)
+    srcs:List[List[LazyBuffer]] = []
     not_all_real = any(not all(mlb.real) for mlb in msrcs)
     new_real = [all(transposed) for transposed in zip(*[mlb.real for mlb in msrcs])] if not_all_real else self.real
     assert any(new_real), "output contains no real lb"
     for mlb in msrcs:
-      if mlb.axis == axis or not_all_real: srcs.append(mlb.lbs)
-      elif mlb.axis is None and axis is not None: srcs.append(to_sharded(mlb.lbs, axis))
-      else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis))
-
-
-
-
-    return tuple(lb.shape[self.axis] if a == self.axis else s for a,s in enumerate(shape))
+      if (mlb.axis == axis and (mlb.axis is None or mlb.bounds == bounds)) or not_all_real: srcs.append(mlb.lbs)
+      elif mlb.axis is None and axis is not None: srcs.append(to_sharded(mlb.lbs, axis, bounds))
+      else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis, bounds))
+    new_real_lbs:Dict[int,LazyBuffer] = {i:lsrcs[0].e(op, *lsrcs[1:], arg=arg) for i,(lsrcs,r) in enumerate(zip(zip(*srcs), new_real)) if r}
+    # NOTE: const dtype should match real
+    real_dtype = next(iter(new_real_lbs.values())).dtype
+    return MultiLazyBuffer([new_real_lbs.get(i, lsrcs[0].const(0).cast(real_dtype)) for i,lsrcs in enumerate(zip(*srcs))], axis, new_real)
 
   def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> MultiLazyBuffer:
     if self.axis is not None and self.axis in axis:
@@ -128,19 +127,21 @@ class MultiLazyBuffer:
 
   # *** movement ops ***
 
+  def _shape_to_single_shard(self, shape:Tuple[sint, ...], lb:LazyBuffer) -> Tuple[sint, ...]:
+    return tuple(lb.shape[self.axis] if a == self.axis else s for a,s in enumerate(shape))
+
   def reshape(self, arg:Tuple[sint, ...]):
     if self.axis is None: return MultiLazyBuffer([x.reshape(arg) for x in self.lbs], None, self.real)
+    assert prod(self.shape) == prod(arg), "reshape must maintain prod(shape)"
     arg_acc:List[sint] = list(itertools.accumulate(arg, operator.mul, initial=1))
     # new_axis is the last one that preserves prod(prior to new_axis) and must not move items between shards
     # todo: what to do about shrinking to self.shape[self.axis]==1 len(self.real_lbs)==1?
     new_axis = len(arg_acc) - arg_acc[::-1].index(prod(self.shape[:self.axis])) - 1
-
-
-
-
-
-      s // len(self.real_lbs) for a,s in enumerate(arg))) for x in self.lbs],
-      new_axis, self.real)
+    assert all(prod(lb.shape[self.axis:]) % prod(arg[new_axis + 1:]) == 0 for lb in self.lbs),\
+      f"reshape cannot move items between shards {self.shape} {arg} {self.bounds}"
+    return MultiLazyBuffer([x.reshape(
+      tuple(s if a != new_axis else prod(x.shape[self.axis:]) // prod(arg[new_axis + 1:]) for a, s in enumerate(arg))
+    ) for x in self.lbs], new_axis, self.real)
 
   def pad(self, arg:Tuple[Tuple[sint, sint], ...]):
     assert self.axis is None or arg[self.axis] == (0,0) or not all(self.real), f"padding not supported for {arg=}"
@@ -152,13 +153,16 @@ class MultiLazyBuffer:
         sum(lb.shape[self.axis] for i,lb in enumerate(self.lbs) if i > self.real.index(True))), "can only pad to whole axis"
       return MultiLazyBuffer([x if r else x.const(0) for x,r in zip(self.lbs, self.real)], self.axis)
     return MultiLazyBuffer([x.pad(arg) for x in self.lbs], self.axis, self.real)
+
   def expand(self, arg:Tuple[sint, ...]):
     # NOTE: this assert isn't needed, sharded axis can have dim 1
     assert self.axis is None or arg[self.axis] == self.shape[self.axis], f"expand not supported on sharded axis {arg=}"
    return MultiLazyBuffer([x.expand(self._shape_to_single_shard(arg, x)) for x in self.lbs], self.axis, self.real)
+
   def permute(self, arg:Tuple[int, ...]):
     # all permutes supported!
     return MultiLazyBuffer([x.permute(arg) for x in self.lbs], arg.index(self.axis) if self.axis is not None else None, self.real)
+
   def shrink(self, arg:Tuple[Tuple[sint, sint], ...]):
     assert self.axis is None or arg[self.axis] == (0, self.shape[self.axis]) or arg[self.axis] in self.bounds, f"shrinking not supported for {arg=}"
     if self.axis is not None and arg[self.axis] in self.bounds and arg[self.axis] != (0, self.shape[self.axis]):
@@ -168,6 +172,7 @@ class MultiLazyBuffer:
       return MultiLazyBuffer([lb if i==idx else lb.const(0) for i,lb in enumerate(self.lbs)], self.axis, [i==idx for i in range(len(self.lbs))])
     return MultiLazyBuffer([x.shrink(tuple((0, x.shape[self.axis]) if a == self.axis else s for a,s in enumerate(arg))) for x in self.lbs],
                            self.axis, self.real)
+
   def stride(self, arg:Tuple[int, ...]):
     assert self.axis is None or arg[self.axis] == 1, "flipping not supported on sharded axis"
     return MultiLazyBuffer([x.stride(arg) for x in self.lbs], self.axis, self.real)
````
tinygrad/nn/__init__.py
CHANGED
````diff
@@ -1,12 +1,12 @@
 import math
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import prod
 from tinygrad.nn import optim, state, datasets # noqa: F401
 
-class
+class BatchNorm:
   """
-  Applies Batch Normalization over a
+  Applies Batch Normalization over a 2D or 3D input.
 
   - Described: https://paperswithcode.com/method/batch-normalization
   - Paper: https://arxiv.org/abs/1502.03167v3
@@ -20,7 +20,7 @@ class BatchNorm2d:
   ```
 
   ```python exec="true" source="above" session="tensor" result="python"
-  norm = nn.
+  norm = nn.BatchNorm(3)
   t = Tensor.rand(2, 3, 4, 4)
   print(t.mean().item(), t.std().item())
   ```
@@ -39,13 +39,14 @@ class BatchNorm2d:
     self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
 
   def __call__(self, x:Tensor):
+    shape_mask = [1, -1, *([1]*(x.ndim-2))]
     if Tensor.training:
       # This requires two full memory accesses to x
       # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh
       # There's "online" algorithms that fix this, like https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm
-      batch_mean = x.mean(axis=(
-      y = (x - batch_mean.detach().reshape(shape=
-      batch_var = (y*y).mean(axis=
+      batch_mean = x.mean(axis=(reduce_axes:=tuple(x for x in range(x.ndim) if x != 1)))
+      y = (x - batch_mean.detach().reshape(shape=shape_mask))  # d(var)/d(mean) = 0
+      batch_var = (y*y).mean(axis=reduce_axes)
       batch_invstd = batch_var.add(self.eps).pow(-0.5)
 
       # NOTE: wow, this is done all throughout training in most PyTorch models
@@ -56,11 +57,10 @@ class BatchNorm2d:
     else:
       batch_mean = self.running_mean
       # NOTE: this can be precomputed for static inference. we expand it here so it fuses
-      batch_invstd = self.running_var.reshape(
-
+      batch_invstd = self.running_var.reshape(shape=shape_mask).expand(x.shape).add(self.eps).rsqrt()
     return x.batchnorm(self.weight, self.bias, batch_mean, batch_invstd)
+BatchNorm2d = BatchNorm3d = BatchNorm
 
-# TODO: these Conv lines are terrible
 def Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
   """
   Applies a 1D convolution over an input signal composed of several input planes.
@@ -98,16 +98,13 @@ class Conv2d:
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
     self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
     self.stride, self.padding, self.dilation, self.groups = stride, padding, dilation, groups
-
-
-    self.bias = Tensor.uniform(out_channels, low=-
+    scale = 1 / math.sqrt(in_channels * prod(self.kernel_size))
+    self.weight = Tensor.uniform(out_channels, in_channels//groups, *self.kernel_size, low=-scale, high=scale)
+    self.bias = Tensor.uniform(out_channels, low=-scale, high=scale) if bias else None
 
   def __call__(self, x:Tensor):
     return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride, dilation=self.dilation, groups=self.groups)
 
-  def initialize_weight(self, out_channels, in_channels, groups):
-    return Tensor.kaiming_uniform(out_channels, in_channels//groups, *self.kernel_size, a=math.sqrt(5))
-
 def ConvTranspose1d(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
   """
   Applies a 1D transposed convolution operator over an input signal composed of several input planes.
@@ -144,15 +141,14 @@ class ConvTranspose2d(Conv2d):
   """
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
     super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
+    scale = 1 / math.sqrt(in_channels * prod(self.kernel_size))
+    self.weight = Tensor.uniform(in_channels, out_channels//groups, *self.kernel_size, low=-scale, high=scale)
     self.output_padding = output_padding
 
   def __call__(self, x:Tensor):
     return x.conv_transpose2d(self.weight, self.bias, padding=self.padding, output_padding=self.output_padding, stride=self.stride,
                               dilation=self.dilation, groups=self.groups)
 
-  def initialize_weight(self, out_channels, in_channels, groups):
-    return Tensor.kaiming_uniform(in_channels, out_channels//groups, *self.kernel_size, a=math.sqrt(5))
-
 class Linear:
   """
   Applies a linear transformation to the incoming data.
@@ -170,9 +166,8 @@ class Linear:
   ```
   """
   def __init__(self, in_features, out_features, bias=True):
-    # TODO: is this init good? torch inits to uniform(-1/sqrt(in_features), 1/sqrt(in_features))
-    self.weight = Tensor.kaiming_uniform(out_features, in_features, a=math.sqrt(5))
     bound = 1 / math.sqrt(in_features)
+    self.weight = Tensor.uniform(out_features, in_features, low=-bound, high=bound)
     self.bias = Tensor.uniform(out_features, low=-bound, high=bound) if bias else None
 
   def __call__(self, x:Tensor):
@@ -282,6 +277,28 @@ class LayerNorm2d(LayerNorm):
   """
   def __call__(self, x): return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
 
+class RMSNorm:
+  """
+  Applies Root Mean Square Normalization to input.
+
+  - Described: https://paperswithcode.com/method/rmsnorm
+  - Paper: https://arxiv.org/abs/1910.07467
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.RMSNorm(4)
+  t = Tensor.arange(12, dtype=dtypes.float).reshape(3, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  print(norm(t).numpy())
+  ```
+  """
+  def __init__(self, dim, eps=1e-6): self.eps, self.weight = eps, Tensor.ones(dim)
+
+  def _norm(self, x:Tensor): return x * (x.square().mean(-1, keepdim=True) + self.eps).rsqrt()
+
+  def __call__(self, x:Tensor) -> Tensor: return self._norm(x.float()).cast(x.dtype) * self.weight
+
 class Embedding:
   """
   A simple lookup table that stores embeddings of a fixed dictionary and size.
@@ -301,4 +318,4 @@ class Embedding:
     arange_shp, weight_shp, big_shp = (1, 1, self.vocab_sz, 1), (1, 1, self.vocab_sz, self.embed_sz), idx.shape+(self.vocab_sz, self.embed_sz,)
     if not hasattr(self, 'arange'): self.arange = Tensor.arange(self.vocab_sz, requires_grad=False, device=self.weight.device).reshape(arange_shp)
     arange, idx, vals = self.arange.expand(big_shp), idx.reshape(idx.shape+(1, 1,)).expand(big_shp), self.weight.reshape(weight_shp).expand(big_shp)
-    return (arange == idx).mul(vals).sum(2)
+    return (arange == idx).mul(vals).sum(2, acc_dtype=vals.dtype)
````
tinygrad/nn/state.py
CHANGED
````diff
@@ -159,8 +159,7 @@ def torch_load(fn:str) -> Dict[str, Tensor]:
       if DEBUG >= 3: print(f"WARNING: this torch load is slow. CLANG to permute {intermediate_shape} with {permute_indexes}")
       assert storage[1] != dtypes.bfloat16, "can't CLANG permute BF16"
       # TODO: find a nice way to support all shapetracker on disktensors
-
-      ret = ret.clang().reshape(intermediate_shape).permute(permute_indexes).realize()
+      ret = ret.clang().reshape(intermediate_shape).permute(permute_indexes)
 
     return ret.reshape(size)
 
@@ -168,7 +167,8 @@ def torch_load(fn:str) -> Dict[str, Tensor]:
     def __setstate__(self, state): self.tensor = state[0]
 
   deserialized_objects: Dict[str, Any] = {}
-  intercept = {"HalfStorage": dtypes.float16, "FloatStorage": dtypes.float32, "BFloat16Storage": dtypes.bfloat16,
+  intercept = {"HalfStorage": dtypes.float16, "FloatStorage": dtypes.float32, "BFloat16Storage": dtypes.bfloat16,
+               "IntStorage": dtypes.int32, "BoolStorage": dtypes.bool,
                "LongStorage": dtypes.int64, "_rebuild_tensor_v2": _rebuild_tensor_v2, "FloatTensor": None, "Parameter": Parameter}
   whitelist = {"torch", "collections", "numpy", "_codecs"} # NOTE: this is not for security, only speed
   class Dummy: pass
````
tinygrad/ops.py
CHANGED
````diff
@@ -3,7 +3,7 @@ from typing import Union, Tuple, Any, List, Dict, Callable
 import functools, hashlib, math, operator, ctypes, struct
 from enum import Enum, auto
 from dataclasses import dataclass
-from tinygrad.helpers import prod, dedup
+from tinygrad.helpers import prod, dedup, pretty_print
 from tinygrad.dtype import dtypes, DType, ConstType
 from tinygrad.shape.symbolic import Variable, sint
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -18,17 +18,17 @@ class UnaryOps(Enum):
 class BinaryOps(Enum):
   """A + A -> A (elementwise)"""
   ADD = auto(); MUL = auto(); IDIV = auto(); MAX = auto(); MOD = auto(); CMPLT = auto(); CMPNE = auto(); XOR = auto() # noqa: E702
-  SHR = auto();
+  SHL = auto(); SHR = auto(); OR = auto(); AND = auto(); THREEFRY = auto() # noqa: E702
 class TernaryOps(Enum):
   """A + A + A -> A (elementwise)"""
   WHERE = auto(); MULACC = auto() # noqa: E702
 class ReduceOps(Enum):
   """A -> B (reduce)"""
-  SUM = auto(); MAX = auto() # noqa: E702
+  SUM = auto(); MAX = auto(); WMMA = auto() # noqa: E702
 class BufferOps(Enum): LOAD = auto(); CONST = auto(); STORE = auto() # noqa: E702
-class
-
-Op = Union[UnaryOps, BinaryOps, ReduceOps,
+class MetaOps(Enum):
+  EMPTY = auto(); CONST = auto(); COPY = auto(); CONTIGUOUS = auto(); CUSTOM = auto(); ASSIGN = auto(); VIEW = auto(); KERNEL = auto(); EXT = auto() # noqa: E702
+Op = Union[UnaryOps, BinaryOps, ReduceOps, MetaOps, TernaryOps, BufferOps]
 
 # do not preserve f(0) = 0
 UNSAFE_PAD_OPS = {UnaryOps.RECIP, UnaryOps.LOG2, UnaryOps.EXP2, BinaryOps.IDIV}
@@ -45,6 +45,12 @@ class ConstBuffer:
   dtype: DType
   st: ShapeTracker
 
+@dataclass(frozen=True)
+class KernelInfo:
+  local_dims: int = 0  # number of local dimensions (this is remapping RANGE to SPECIAL)
+  upcasted: int = 0  # count that are upcasted (this is remapping RANGE to EXPAND)
+  dont_use_locals: bool = False  # don't use local indexing
+
 @dataclass(frozen=True, eq=False)
 class LazyOp:
   op: Op
@@ -57,13 +63,17 @@ class LazyOp:
     ret = context[key] = all(a.cached_compare(b, context) for a,b in zip(self.src, x.src))
     return ret
   def __eq__(self, x): return self.cached_compare(x, context={})
-  def __repr__(self): return f
+  def __repr__(self:LazyOp): return pretty_print(self, lambda x: f'LazyOp({x.op}, arg={x.arg}, src=(%s))')
   @functools.cached_property
   def dtype(self) -> DType:
     if self.op in BufferOps: return self.arg.dtype
+    if self.op is ReduceOps.WMMA: return self.arg[3]  # WMMA can change the type
     if self.op in [UnaryOps.CAST, UnaryOps.BITCAST]: return self.arg
     return dtypes.bool if self.op in {BinaryOps.CMPLT, BinaryOps.CMPNE} else self.src[-1].dtype
-
+  @functools.cached_property
+  def full_shape(self) -> Tuple[sint, ...]:
+    if len(self.src) == 0 and self.op in BufferOps: return self.arg.st.shape
+    return tuple(max(x) for x in zip(*[x.full_shape for x in self.src]))
   @functools.cached_property
   def key(self) -> bytes:
     return hashlib.sha256(functools.reduce(lambda x,y: x+y, [s.key for s in self.src], str((self.op, self.arg)).encode())).digest()
@@ -77,35 +87,16 @@ class LazyOp:
     const_vars = [x.arg.val for x in self.lazyops if x.op is BufferOps.CONST and isinstance(x.arg.val, Variable)]
     return sorted(set.union(*extract_vars, set(const_vars)), key=lambda v: v.expr)
 
-#
-
-
-
-
-
-
-  @
-  def
-
-    self.flops, ret = 0, self.flops
-    return ret
-
-InterpretedFlopCounter: Dict[Op, Callable] = {
-  BufferOps.LOAD: lambda arg: FlopCounter(arg.st.shape, 0, {arg.idx: arg.dtype.itemsize * arg.st.real_size()}),
-  BufferOps.CONST: lambda arg: FlopCounter(arg.st.shape, 0, {}),
-  BufferOps.STORE: lambda self,arg: FlopCounter(arg.st.shape, self.consume_flops(), {**self.mem, arg.idx: arg.dtype.itemsize * arg.st.real_size()}),
-  UnaryOps.CAST: lambda self,arg: FlopCounter(self.shape, self.consume_flops(), self.mem), # cast uses no flops
-  UnaryOps.BITCAST: lambda self,arg: FlopCounter(self.shape, self.consume_flops(), self.mem), # bitcast uses no flops
-  **{op:lambda self: FlopCounter(self.shape, self.consume_flops() + prod(self.shape), self.mem) for op in UnaryOps if op not in {UnaryOps.CAST, UnaryOps.BITCAST}}, # noqa: E501
-  **{op:lambda self,y: FlopCounter(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape), {**self.mem, **y.mem}) for op in BinaryOps}, # noqa: E501
-  **{op:lambda self,axis: FlopCounter(tuple(1 if i in axis else s for i,s in enumerate(self.shape)), self.consume_flops() + prod(self.shape), self.mem) for op in ReduceOps}, # noqa: E501
-  TernaryOps.WHERE: lambda self,y,z: FlopCounter(self.shape, self.consume_flops() + y.consume_flops() + z.consume_flops() + prod(self.shape), {**self.mem, **y.mem, **z.mem})} # noqa: E501
-
-@functools.lru_cache(None)
-def get_lazyop_info(ast:LazyOp) -> FlopCounter:
-  @functools.lru_cache(None) # NOTE: this cache needs to be recreated for new ASTs
-  def run_ast(ast): return InterpretedFlopCounter[ast.op](*([run_ast(x) for x in ast.src]+([ast.arg] if ast.arg is not None else [])))
-  return run_ast(ast)
+  # TODO: support non-lazyop
+  def __add__(self, x:LazyOp): return LazyOp(BinaryOps.ADD, (self, x))
+  def __sub__(self, x:LazyOp): return LazyOp(BinaryOps.ADD, (self, -x))
+  def __mul__(self, x:LazyOp): return LazyOp(BinaryOps.MUL, (self, x))
+  def ne(self, x:LazyOp): return LazyOp(BinaryOps.CMPNE, (self, x))
+  def eq(self, x:LazyOp): return -self.ne(x)
+  def __neg__(self): return LazyOp(UnaryOps.NEG, (self,))
+  @staticmethod
+  def const(val, dtype:DType, shape:Tuple[sint, ...]):
+    return LazyOp(BufferOps.CONST, (), ConstBuffer(val, dtype, ShapeTracker.from_shape(()).reshape((1,)*len(shape)).expand(shape)))
 
 # **************** ops in python ****************
 
@@ -115,18 +106,15 @@ def hook_overflow(dv, fxn):
     except OverflowError: return dv
   return wfxn
 
-python_alu = {
-  UnaryOps.LOG2: lambda x: math.log2(x) if x > 0 else -math.inf if x == 0 else math.nan,
-  UnaryOps.
-  UnaryOps.
-
-  UnaryOps.RECIP: lambda x: 1/x if x != 0 else math.copysign(math.inf, x),
-  UnaryOps.NEG: lambda x: (not x) if isinstance(x, bool) else -x,
-  BinaryOps.SHR: operator.rshift, BinaryOps.SHL: operator.lshift,
-  BinaryOps.MUL: operator.mul, BinaryOps.ADD: operator.add,
+python_alu: Dict[Op, Callable] = {
+  UnaryOps.LOG2: lambda x: math.log2(x) if x > 0 else -math.inf if x == 0 else math.nan, UnaryOps.EXP2: hook_overflow(math.inf, lambda x: 2**x),
+  UnaryOps.SQRT: lambda x: math.sqrt(x) if x >= 0 else math.nan, UnaryOps.RECIP: lambda x: 1/x if x != 0 else math.copysign(math.inf, x),
+  UnaryOps.SIN: lambda x: math.sin(x) if not math.isinf(x) else math.nan, UnaryOps.NEG: lambda x: (not x) if isinstance(x, bool) else -x,
+  BinaryOps.SHR: operator.rshift, BinaryOps.SHL: operator.lshift, BinaryOps.MUL: operator.mul, BinaryOps.ADD: operator.add,
   BinaryOps.XOR: operator.xor, BinaryOps.MAX: max, BinaryOps.CMPNE: operator.ne, BinaryOps.CMPLT: operator.lt,
-  BinaryOps.
-
+  BinaryOps.OR: operator.or_, BinaryOps.AND: operator.and_,
+  BinaryOps.MOD: lambda x,y: abs(int(x))%abs(int(y))*(1,-1)[x<0], BinaryOps.IDIV: lambda x,y: abs(x)//abs(y)*(1,-1)[x*y<0] if y != 0 else x*math.inf,
+  TernaryOps.MULACC: lambda x,y,z: (x*y)+z, TernaryOps.WHERE: lambda x,y,z: y if x else z}
 
 def truncate_fp16(x):
   try:
@@ -140,30 +128,43 @@ truncate: Dict[DType, Callable] = {dtypes.bool: bool,
   dtypes.float16: truncate_fp16, dtypes.float32: lambda x: ctypes.c_float(x).value, dtypes.float64: lambda x: ctypes.c_double(x).value,
   dtypes.uint8: lambda x: ctypes.c_uint8(x).value, dtypes.uint16: lambda x: ctypes.c_uint16(x).value,
   dtypes.uint32: lambda x: ctypes.c_uint32(x).value, dtypes.uint64: lambda x: ctypes.c_uint64(x).value,
-  dtypes.int8: lambda x: ctypes.c_int8(x).value, dtypes.int16: lambda x: ctypes.c_int16(x).value,
-
+  dtypes.int8: lambda x: ctypes.c_int8(x).value, dtypes.int16: lambda x: ctypes.c_int16(x).value, dtypes.int32: lambda x: ctypes.c_int32(x).value \
+      if isinstance(x,int) else x, dtypes.int64: lambda x: ctypes.c_int64(x).value}
 
 def exec_alu(op:Op, dtype:DType, operands): return truncate.get(dtype, lambda x: x)(python_alu[op](*operands))
 
+def reduce_st(st:ShapeTracker, axis:Tuple[int, ...]) -> Tuple[sint, ...]: return tuple(1 if i in axis else s for i,s in enumerate(st.shape))
+
 # the living definition of LazyOps
-def verify_lazyop(
+def verify_lazyop(ast:LazyOp) -> Dict[LazyOp, ShapeTracker]:
+  assert ast.op is MetaOps.KERNEL, "must be SINK"
   sts: Dict[LazyOp, ShapeTracker] = {}
-  def
+  def assert_valid(op:LazyOp, st:ShapeTracker):
     if op in sts: return
-
+    # restore globals from the two stage reduce
+    if op.op is BufferOps.LOAD and op.arg.idx < 0:
+      assert_valid(local_reduce:=op.src[0].src[0], op.arg.st)
+      return sts.setdefault(op, sts[local_reduce])
+    for x in op.src: assert_valid(x, st)
     # only reduceop is allowed to change shape, limited to turning n to 1
     if op.op in ReduceOps:
-
-      assert
-      st = ShapeTracker.from_shape(
+      axis = op.arg[-1] if op.op is ReduceOps.WMMA else op.arg
+      assert isinstance(axis, tuple) and all(isinstance(i, int) for i in axis), f"reduceop must have axis {op.arg}"
+      st = ShapeTracker.from_shape(reduce_st(sts[op.src[0]], axis))
     else:
       # movementops are pushed to the edges with LOAD
-
-
-      for x in op.src:
+      # elementwise inherits shape
+      st = op.arg.st if op.op in BufferOps else sts[op.src[0]]
+    for x in op.src:
+      if sts[x].shape != st.shape:
+        if prod(sts[x].shape) == prod(st.shape): raise AssertionError(f"found implicit reshape {x.op} {op.op} {sts[x].shape} != {st.shape}")
+        raise AssertionError(f"found implicit expand {x.op} {sts[x].shape} != {op.op} {st.shape} {prod(sts[x].shape)} != {prod(st.shape)}")
     sts[op] = st
-  for i, out in enumerate(ast):
+  for i, out in enumerate(ast.src):
    assert out.arg.idx == i, f"unexpected output buffer idx {out.arg.idx} != {i}"
    assert out.op is BufferOps.STORE, f"kernels must have stores as the output, got {out.op}"
-    assert out.arg.st.size == ast[-1].arg.st.size, f"outputs must have the same size, got {out.arg.st.size}"
-
+    assert out.arg.st.size == ast.src[-1].arg.st.size, f"outputs must have the same size, got {out.arg.st.size}"
+    assert_valid(out, out.arg.st)
+  shape_dims = [sorted(dedup(dims)) for dims in zip(*[x.shape for x in sts.values()])]
+  assert all(len(x) == 1 or (len(x) == 2 and x[0] == 1) for x in shape_dims), f"shapes must have either 1 or n in each dimension, {shape_dims}"
+  return sts
````
tinygrad/renderer/__init__.py
CHANGED
````diff
@@ -1,8 +1,9 @@
-from typing import Optional, List, Tuple, Dict
+from typing import Optional, List, Tuple, Dict, Callable, Any
 import functools
-from dataclasses import dataclass
-from tinygrad.helpers import
-from tinygrad.codegen.uops import
+from dataclasses import dataclass, field
+from tinygrad.helpers import to_function_name, dedup
+from tinygrad.codegen.uops import UOps, UOp, flops_mem
+from tinygrad.ops import Op
 from tinygrad.shape.symbolic import sym_infer, sint, Variable
 from tinygrad.dtype import DType
 
@@ -12,30 +13,53 @@ class TensorCore: # D = A * B + C, A is (M x K), B is (K x N), C and D are (M x
   dtype_in: DType # dtype for A and B
   dtype_out: DType # dtype for C and D
   threads: List[Tuple[int,int]] # list of (TC dim,amt) that construct the warp thread structure
-  thread_local_aliases: List[List[List[int]]] # a list of [threads_1, ..., threads_n, upcast_1(unrolled), upcast_2(upcast)] defining the alias (-1 is upcast, 1-n is warp threads) for each TC dim # noqa: E501
-  thread_local_sizes: List[List[int]] # in each thread, the number of elements stored in registers for each TC dim
   def __str__(self): return "_".join(["WMMA"] + list(map(str, self.dims)) + [self.dtype_in.name, self.dtype_out.name])
-  def num_upcasts(self): return len(self.thread_local_aliases[0]) - len(self.threads)
 
-@dataclass
+@dataclass
 class Program:
   name:str
   src:str
   dname:str
+  uops:Optional[List[UOp]]=None
+  mem_estimate:sint=0  # TODO: get this from the load/store uops once min/max are good
+
+  # filled in from uops (if we have uops)
   global_size:Optional[List[int]]=None
   local_size:Optional[List[int]]=None
-
-
-
+  vars:List[Variable]=field(default_factory=list)
+  globals:List[int]=field(default_factory=list)
+  outs:List[int]=field(default_factory=list)
+  _ran_post_init:bool=False  # NOTE: this is needed if you call replace on the Program
 
-
-
+  def __post_init__(self):
+    if not self._ran_post_init and self.uops is not None:
+      # single pass through the uops
+      for u in self.uops:
+        if u.op is UOps.DEFINE_VAR: self.vars.append(u.arg)
+        if u.op is UOps.DEFINE_GLOBAL: self.globals.append(u.arg)
+        if u.op is UOps.STORE: self.outs.extend([x.arg for x in u.src[0].sparents if x.op is UOps.DEFINE_GLOBAL])
+        if u.op is UOps.SPECIAL:
+          # NOTE: you have to set local_size and global_size to the base [1,1,1] outside this
+          if u.arg[0][0] == 'i': self.local_size = None
+          if u.arg[0][0] == 'l':
+            assert self.local_size is not None
+            self.local_size[int(u.arg[0][-1])] = u.arg[1]
+          else:
+            assert self.global_size is not None
+            self.global_size[int(u.arg[0][-1])] = u.arg[1]
+      self.vars = sorted(self.vars, key=lambda v: v.expr)
+      self.outs = sorted(dedup(self.outs))
+      self._ran_post_init = True
 
+  @property
+  def op_estimate(self) -> sint: return self._ops_lds[0]
+  @property
+  def lds_estimate(self) -> sint: return self._ops_lds[1]
   @functools.cached_property
-  def
+  def _ops_lds(self) -> Tuple[sint, sint]: return (0,0) if self.uops is None else flops_mem(self.uops, ignore_indexing=True)
 
-  @
-  def outcount(self) -> int: return
+  @property
+  def outcount(self) -> int: return len(self.outs)
 
   @functools.cached_property
   def function_name(self) -> str: return to_function_name(self.name)
@@ -57,9 +81,7 @@ class Renderer:
   local_max: Optional[Tuple[int, ...]] = (0x8FFFFFFF,) * (3) # TODO: UOps.SPECIAL int32 indexes right now
   shared_max: int = 32768
   tensor_cores: List[TensorCore] = []
-
-
-  @functools.cached_property
-  def tc(self): return getenv("TC", 1)
+  extra_matcher: Any = None
+  code_for_op: Dict[Op, Callable] = {}
 
-  def render(self, name:str, uops:
+  def render(self, name:str, uops:List[UOp]) -> str: raise NotImplementedError("needs a renderer")
````