tinygrad 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -0
- tinygrad/codegen/kernel.py +572 -83
- tinygrad/codegen/linearizer.py +415 -395
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +183 -0
- tinygrad/dtype.py +113 -0
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +76 -55
- tinygrad/helpers.py +196 -89
- tinygrad/lazy.py +210 -371
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +202 -22
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +112 -32
- tinygrad/nn/state.py +136 -39
- tinygrad/ops.py +119 -202
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +353 -166
- tinygrad/renderer/llvmir.py +150 -138
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +81 -0
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +75 -0
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +24 -77
- tinygrad/runtime/ops_cuda.py +175 -89
- tinygrad/runtime/ops_disk.py +56 -33
- tinygrad/runtime/ops_gpu.py +92 -95
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +39 -60
- tinygrad/runtime/ops_metal.py +92 -74
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +86 -254
- tinygrad/shape/symbolic.py +166 -141
- tinygrad/shape/view.py +296 -0
- tinygrad/tensor.py +2619 -448
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- tinygrad-0.9.0.dist-info/METADATA +227 -0
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/assembly.py +0 -190
- tinygrad/codegen/optimizer.py +0 -379
- tinygrad/codegen/search.py +0 -72
- tinygrad/graph.py +0 -83
- tinygrad/jit.py +0 -57
- tinygrad/nn/image.py +0 -100
- tinygrad/renderer/assembly_arm64.py +0 -169
- tinygrad/renderer/assembly_ptx.py +0 -98
- tinygrad/renderer/wgsl.py +0 -53
- tinygrad/runtime/lib.py +0 -113
- tinygrad/runtime/ops_cpu.py +0 -51
- tinygrad/runtime/ops_hip.py +0 -82
- tinygrad/runtime/ops_shm.py +0 -29
- tinygrad/runtime/ops_torch.py +0 -30
- tinygrad/runtime/ops_webgpu.py +0 -45
- tinygrad-0.7.0.dist-info/METADATA +0 -212
- tinygrad-0.7.0.dist-info/RECORD +0 -40
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
tinygrad/multi.py
ADDED
@@ -0,0 +1,169 @@
+from __future__ import annotations
+from typing import Optional, Union, Any, Tuple, List
+import functools, itertools, operator
+from tinygrad.helpers import all_same, all_int, dedup, round_up, prod, DEBUG, RING
+from tinygrad.dtype import DType, ConstType
+from tinygrad.ops import BinaryOps, LoadOps, UnaryOps, TernaryOps, ReduceOps
+from tinygrad.lazy import LazyBuffer
+from tinygrad.shape.shapetracker import sint
+
+def all_reduce(op: ReduceOps, lbs: List[LazyBuffer]) -> List[LazyBuffer]:
+  assert all_int(lbs[0].shape), f"does not support symbolic shape {lbs[0].shape}"
+  assert all_same([lb.shape[0] for lb in lbs]), "allreduce with uneven shards is undefined"
+  bop = {ReduceOps.SUM:BinaryOps.ADD, ReduceOps.MAX:BinaryOps.MAX}[op]
+
+  n_lbs, dim = len(lbs), prod(lbs[0].shape)
+  # Ring allreduce doesn't provide a benefit with only 2 nodes or where number of elements is less than 256k (empirically)
+  # so just fallback to naive allreduce to save on kernel dispatch, chunking and reassembling chunks.
+  use_ring = (RING >= 2 or (n_lbs > 2 and dim > 256_000 and RING >= 1))
+  if DEBUG >= 2: print(f"{'RING ALLREDUCE' if use_ring else 'NAIVE ALLREDUCE'} {n_lbs}x{dim} | {lbs[0].dtype}")
+  if not use_ring:
+    return [functools.reduce(lambda x,y: x.e(bop, y), [x.copy_to_device(lb.device) for x in lbs]) for lb in lbs]
+  factor = max(f for f in [32, 16, 8, 4, 2, 1] if dim % f == 0)
+  base, left = (dim // factor) // n_lbs, (dim // factor) % n_lbs
+  c_lens = [(base + 1) * factor if i < left else base * factor for i in range(n_lbs)]
+  acc = 0
+  chunks = [(acc, (acc := acc + i)) for i in c_lens if i > 0]
+  chunked = [[lb.reshape((dim,)).shrink(((s,e),)) for s,e in chunks] for lb in lbs]
+
+  # Scatter-reduce step
+  for step in range(n_lbs - 1):
+    for i in range(len(chunks)):
+      s, r = (i+step)%n_lbs, (i+step+1)%n_lbs
+      chunked[r][i] = chunked[r][i].e(bop, chunked[s][i].copy_to_device(chunked[r][i].device, force=True))
+
+  # Allgather step
+  for step in range(n_lbs - 1):
+    for i in range(len(chunks)):
+      s, r = (i+step-1)%n_lbs, (i+step)%n_lbs
+      chunked[r][i] = chunked[s][i].copy_to_device(chunked[r][i].device, force=True)
+
+  # Assemble chunks back
+  pads = [((s,dim-e),) for s,e in chunks]
+  return [functools.reduce(lambda x,y: x.e(BinaryOps.ADD, y), [c.pad(pads[i]) for i,c in enumerate(lb_c)]).reshape(lbs[0].shape) for lb_c in chunked]
+
+def to_sharded(lbs:List[LazyBuffer], axis:int) -> List[LazyBuffer]:
+  if DEBUG >= 3 and lbs[0].shape[axis] % len(lbs) != 0: print(f"multi axis uneven: {lbs[0].shape=} {axis=} {len(lbs)=}")
+  sz = round_up(lbs[0].shape[axis], len(lbs)) // len(lbs)
+  return [lb.shrink(tuple((0,s) if a != axis else (sz*i,min(s,sz*(i+1))) for a,s in enumerate(lb.shape))) for i,lb in enumerate(lbs)]
+
+class MultiLazyBuffer:
+  def __init__(self, lbs:List[LazyBuffer], axis:Optional[int], real:Optional[List[bool]]=None):
+    assert all(isinstance(x, LazyBuffer) for x in lbs) and len(lbs), "all lbs must be LazyBuffers, and we need at least one of them"
+    assert all_same([x.dtype for x in lbs]), f"all multilazybuffer needs same dtype, getting {[x.dtype for x in lbs]}"
+    self.lbs, self.axis, self.dtype, self.device, self.real = lbs, axis, lbs[0].dtype, tuple(x.device for x in lbs), real or [True]*len(lbs)
+    if axis is not None:
+      splits = list(itertools.accumulate([lb.shape[axis] for lb in lbs], initial=0))
+      self.bounds = [(st,ed) for st,ed in zip(splits, splits[1:])]
+
+  @property
+  def shape(self):
+    return tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))
+
+  @property
+  def size(self): return sum(x.size for x in self.real_lbs)
+
+  @property
+  def real_lbs(self): return [lb for lb,r in zip(self.lbs, self.real) if r]
+
+  def __repr__(self):
+    return f"<MLB {self.axis=} {self.real=} {chr(10)}{chr(10).join([f'{x.device} {x.st}' for x in self.lbs])}>"
+
+  @staticmethod
+  def from_sharded(lb:LazyBuffer, devices:Tuple[str, ...], axis:Optional[int]=None):
+    lbs = [lb.contiguous() if lb.base != lb and not lb.is_unrealized_unmasked_const() else lb] * len(devices)
+    sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis) if axis is not None else lbs, devices)]
+    return MultiLazyBuffer([lb if lb.is_unrealized_unmasked_const() else lb.contiguous() for lb in sharded_lbs], axis)
+
+  def copy_to_device(self, device:str) -> LazyBuffer:
+    if self.axis is None: return self.lbs[self.real.index(True)].copy_to_device(device)
+    sz = self.lbs[0].shape[self.axis]
+    llbs = []
+    for i,lb in enumerate([lb.copy_to_device(device) for lb in self.real_lbs]):
+      pad_arg = tuple((0,0) if a != self.axis else (sz*i, max(0, self.shape[self.axis]-sz*(i+1))) for a in range(len(lb.shape)))
+      llbs.append(lb.pad(pad_arg))
+    return functools.reduce(lambda x,y: x.e(BinaryOps.ADD, y), llbs)
+
+  # passthroughs
+  def is_realized(self) -> bool: return all(lb.base.realized is not None for lb, r in zip(self.lbs, self.real) if r is True)
+  def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real)
+  def const(self, val:ConstType) -> MultiLazyBuffer: return MultiLazyBuffer([x.const(val) for x in self.lbs], self.axis, self.real)
+  def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
+  def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
+
+  # elementwise is simple
+  def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:MultiLazyBuffer, arg:Optional[Any]=None) -> MultiLazyBuffer:
+    msrcs = (self,)+in_srcs
+    assert all(isinstance(x, MultiLazyBuffer) for x in msrcs), f"all buffers must be MultiLazyBuffer {msrcs}"
+    assert all_same([x.device for x in msrcs]), f"all buffers must have the same device {[x.device for x in msrcs]}"
+
+    # NOTE: they all have to share an axis, we always choose [-1]
+    axis = axes[-1] if len(axes := dedup([x.axis for x in msrcs if x.axis is not None])) else None
+    srcs = []
+    not_all_real = any(not all(mlb.real) for mlb in msrcs)
+    new_real = [all(transposed) for transposed in zip(*[mlb.real for mlb in msrcs])] if not_all_real else self.real
+    assert any(new_real), "output contains no real lb"
+    for mlb in msrcs:
+      if mlb.axis == axis or not_all_real: srcs.append(mlb.lbs)
+      elif mlb.axis is None and axis is not None: srcs.append(to_sharded(mlb.lbs, axis))
+      else: srcs.append(to_sharded([mlb.copy_to_device(lb.device) for lb in mlb.lbs], axis))
+    # NOTE: lsrcs[-1].const(0) is correct for where
+    return MultiLazyBuffer([lsrcs[0].e(op, *lsrcs[1:], arg=arg) if r else lsrcs[-1].const(0) for lsrcs,r in zip(zip(*srcs),new_real)], axis, new_real)
+
+  def _shape_to_single_shard(self, shape:Tuple[sint, ...], lb:LazyBuffer) -> Tuple[sint, ...]:
+    return tuple(lb.shape[self.axis] if a == self.axis else s for a,s in enumerate(shape))
+
+  def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> MultiLazyBuffer:
+    if self.axis is not None and self.axis in axis:
+      # all-reduce on sharded axes
+      reduced_parts = [(x if r else x.const(0)).r(op, axis) for x,r in zip(self.lbs, self.real)]
+      if all(self.real): return MultiLazyBuffer(all_reduce(op, reduced_parts), None)
+      return MultiLazyBuffer(reduced_parts, None, self.real)
+    # reduce on non sharded axes, piecewise is fine. if axis is None this is also correct
+    return MultiLazyBuffer([x.r(op, axis) for x in self.lbs], self.axis, self.real)
+
+  # *** movement ops ***
+
+  def reshape(self, arg:Tuple[sint, ...]):
+    if self.axis is None: return MultiLazyBuffer([x.reshape(arg) for x in self.lbs], None, self.real)
+    arg_acc:List[sint] = list(itertools.accumulate(arg, operator.mul, initial=1))
+    # new_axis is the last one that preserves prod(prior to new_axis) and must not move items between shards
+    # todo: what to do about shrinking to self.shape[self.axis]==1 len(self.real_lbs)==1?
+    new_axis = len(arg_acc) - arg_acc[::-1].index(prod(self.shape[:self.axis])) - 1
+    if arg[new_axis] != self.shape[self.axis]:
+      assert self.shape[self.axis] % len(self.real_lbs) == 0, f"cannot reshape on-axis for uneven shard {self.axis} {self.shape} {len(self.real_lbs)}"
+      assert arg[new_axis] % len(self.real_lbs) == 0, f"new on-axis shape must divide evenly between devices {new_axis} {arg} {len(self.real_lbs)}"
+    return MultiLazyBuffer([x.reshape(tuple(s if a != new_axis else
+                                            x.shape[self.axis] if s == self.shape[self.axis] else
+                                            s // len(self.real_lbs) for a,s in enumerate(arg))) for x in self.lbs],
+                           new_axis, self.real)
+
+  def pad(self, arg:Tuple[Tuple[sint, sint], ...]):
+    assert self.axis is None or arg[self.axis] == (0,0) or not all(self.real), f"padding not supported for {arg=}"
+    # pad on shard axis -> fill others with zeros and set real to all True
+    if self.axis is not None and arg[self.axis] != (0,0):
+      # pad back to whole axis, remove real mask
+      assert all(arg[i] == (0, 0) or i == self.axis for i in range(len(self.shape))), "cannot pad sharded and non-sharded axis at the same time"
+      assert arg[self.axis] == (sum(lb.shape[self.axis] for i,lb in enumerate(self.lbs) if i < self.real.index(True)), \
+                                sum(lb.shape[self.axis] for i,lb in enumerate(self.lbs) if i > self.real.index(True))), "can only pad to whole axis"
+      return MultiLazyBuffer([x if r else x.const(0) for x,r in zip(self.lbs, self.real)], self.axis)
+    return MultiLazyBuffer([x.pad(arg) for x in self.lbs], self.axis, self.real)
+  def expand(self, arg:Tuple[sint, ...]):
+    # NOTE: this assert isn't needed, sharded axis can have dim 1
+    assert self.axis is None or arg[self.axis] == self.shape[self.axis], f"expand not supported on sharded axis {arg=}"
+    return MultiLazyBuffer([x.expand(self._shape_to_single_shard(arg, x)) for x in self.lbs], self.axis, self.real)
+  def permute(self, arg:Tuple[int, ...]):
+    # all permutes supported!
+    return MultiLazyBuffer([x.permute(arg) for x in self.lbs], arg.index(self.axis) if self.axis is not None else None, self.real)
+  def shrink(self, arg:Tuple[Tuple[sint, sint], ...]):
+    assert self.axis is None or arg[self.axis] == (0, self.shape[self.axis]) or arg[self.axis] in self.bounds, f"shrinking not supported for {arg=}"
+    if self.axis is not None and arg[self.axis] in self.bounds and arg[self.axis] != (0, self.shape[self.axis]):
+      assert all(arg[i] == (0, s) or i == self.axis for i,s in enumerate(self.shape)), "cannot shrink sharded and non-sharded axis at the same time"
+      idx = self.bounds.index(arg[self.axis])
+      # zero out other lbs to not create lb reference
+      return MultiLazyBuffer([lb if i==idx else lb.const(0) for i,lb in enumerate(self.lbs)], self.axis, [i==idx for i in range(len(self.lbs))])
+    return MultiLazyBuffer([x.shrink(tuple((0, x.shape[self.axis]) if a == self.axis else s for a,s in enumerate(arg))) for x in self.lbs],
+                           self.axis, self.real)
+  def stride(self, arg:Tuple[int, ...]):
+    assert self.axis is None or arg[self.axis] == 1, "flipping not supported on sharded axis"
+    return MultiLazyBuffer([x.stride(arg) for x in self.lbs], self.axis, self.real)
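Usage note (not part of the diff): the user-facing entry point to MultiLazyBuffer.from_sharded is Tensor.shard, and a reduce over the sharded axis goes through the all_reduce function above. A minimal sketch, assuming two CUDA devices are available:

  from tinygrad import Tensor

  t = Tensor.rand(4, 256)
  st = t.shard(("CUDA:0", "CUDA:1"), axis=0)   # one (2, 256) shard per device, nothing moves until realization
  # summing over the sharded axis hits the all-reduce path in multi.py
  print(st.sum(axis=0).numpy().shape)          # (256,)

With only two shards, or fewer than roughly 256k elements, the code above picks the naive all-reduce; setting RING=2 in the environment forces the ring variant.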
tinygrad/nn/__init__.py
CHANGED
@@ -1,10 +1,35 @@
 import math
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, cast
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import prod
+from tinygrad.nn import optim, state # noqa: F401
 
 class BatchNorm2d:
-
+  """
+  Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension).
+
+  - Described: https://paperswithcode.com/method/batch-normalization
+  - Paper: https://arxiv.org/abs/1502.03167v3
+
+  See: `Tensor.batchnorm`
+
+  ```python exec="true" session="tensor"
+  from tinygrad import Tensor, dtypes, nn
+  import numpy as np
+  np.set_printoptions(precision=4)
+  ```
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.BatchNorm2d(3)
+  t = Tensor.rand(2, 3, 4, 4)
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
+  def __init__(self, sz:int, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1):
     self.eps, self.track_running_stats, self.momentum = eps, track_running_stats, momentum
 
     if affine: self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz)
@@ -19,14 +44,14 @@ class BatchNorm2d:
       # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh
       # There's "online" algorithms that fix this, like https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm
       batch_mean = x.mean(axis=(0,2,3))
-      y = (x - batch_mean.reshape(shape=[1, -1, 1, 1]))
+      y = (x - batch_mean.detach().reshape(shape=[1, -1, 1, 1])) # d(var)/d(mean) = 0
       batch_var = (y*y).mean(axis=(0,2,3))
       batch_invstd = batch_var.add(self.eps).pow(-0.5)
 
       # NOTE: wow, this is done all throughout training in most PyTorch models
       if self.track_running_stats:
-        self.running_mean.assign((1
-        self.running_var.assign((1
+        self.running_mean.assign((1-self.momentum) * self.running_mean + self.momentum * batch_mean.detach())
+        self.running_var.assign((1-self.momentum) * self.running_var + self.momentum * prod(y.shape)/(prod(y.shape)-y.shape[1]) * batch_var.detach())
         self.num_batches_tracked += 1
     else:
       batch_mean = self.running_mean
@@ -37,43 +62,139 @@ class BatchNorm2d:
 
 # TODO: these Conv lines are terrible
 def Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
+  """
+  Applies a 1D convolution over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.Conv1d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   return Conv2d(in_channels, out_channels, (kernel_size,), stride, padding, dilation, groups, bias)
 
 class Conv2d:
+  """
+  Applies a 2D convolution over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.Conv2d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
     self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
     self.stride, self.padding, self.dilation, self.groups = stride, padding, dilation, groups
-    self.weight =
-    bound = 1 / math.sqrt(prod(self.weight.shape[1:]))
+    self.weight = self.initialize_weight(out_channels, in_channels, groups)
+    bound = 1 / math.sqrt(cast(int, prod(self.weight.shape[1:]))) # weight shape is always ints but mypy cannot tell
     self.bias = Tensor.uniform(out_channels, low=-bound, high=bound) if bias else None
 
-  def __call__(self, x):
+  def __call__(self, x:Tensor):
     return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride, dilation=self.dilation, groups=self.groups)
 
+  def initialize_weight(self, out_channels, in_channels, groups):
+    return Tensor.kaiming_uniform(out_channels, in_channels//groups, *self.kernel_size, a=math.sqrt(5))
+
 def ConvTranspose1d(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
+  """
+  Applies a 1D transposed convolution operator over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.ConvTranspose1d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   return ConvTranspose2d(in_channels, out_channels, (kernel_size,), stride, padding, output_padding, dilation, groups, bias)
 
-class ConvTranspose2d:
+class ConvTranspose2d(Conv2d):
+  """
+  Applies a 2D transposed convolution operator over an input image.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.ConvTranspose2d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
-
-    self.
-    self.weight = Tensor.kaiming_uniform(in_channels, out_channels//groups, *self.kernel_size, a=math.sqrt(5))
-    bound = 1 / math.sqrt(prod(self.weight.shape[1:]))
-    self.bias = Tensor.uniform(out_channels, low=-bound, high=bound) if bias else None
+    super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
+    self.output_padding = output_padding
 
-  def __call__(self, x):
-    return x.conv_transpose2d(self.weight, self.bias, padding=self.padding, output_padding=self.output_padding, stride=self.stride,
+  def __call__(self, x:Tensor):
+    return x.conv_transpose2d(self.weight, self.bias, padding=self.padding, output_padding=self.output_padding, stride=self.stride,
+                              dilation=self.dilation, groups=self.groups)
+
+  def initialize_weight(self, out_channels, in_channels, groups):
+    return Tensor.kaiming_uniform(in_channels, out_channels//groups, *self.kernel_size, a=math.sqrt(5))
 
 class Linear:
+  """
+  Applies a linear transformation to the incoming data.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Linear
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  lin = nn.Linear(3, 4)
+  t = Tensor.rand(2, 3)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = lin(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_features, out_features, bias=True):
+    # TODO: is this init good? torch inits to uniform(-1/sqrt(in_features), 1/sqrt(in_features))
     self.weight = Tensor.kaiming_uniform(out_features, in_features, a=math.sqrt(5))
-    bound = 1 / math.sqrt(
+    bound = 1 / math.sqrt(in_features)
    self.bias = Tensor.uniform(out_features, low=-bound, high=bound) if bias else None
 
-  def __call__(self, x):
+  def __call__(self, x:Tensor):
     return x.linear(self.weight.transpose(), self.bias)
 
 class GroupNorm:
+  """
+  Applies Group Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/group-normalization
+  - Paper: https://arxiv.org/abs/1803.08494v3
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.GroupNorm(2, 12)
+  t = Tensor.rand(2, 12, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, num_groups:int, num_channels:int, eps:float=1e-5, affine:bool=True):
     self.num_groups, self.num_channels, self.eps = num_groups, num_channels, eps
     self.weight: Optional[Tensor] = Tensor.ones(num_channels) if affine else None
@@ -89,6 +210,22 @@ class GroupNorm:
     return x * self.weight.reshape(1, -1, *[1] * (len(x.shape)-2)) + self.bias.reshape(1, -1, *[1] * (len(x.shape)-2))
 
 class InstanceNorm:
+  """
+  Applies Instance Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/instance-normalization
+  - Paper: https://arxiv.org/abs/1607.08022v3
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.InstanceNorm(3)
+  t = Tensor.rand(2, 3, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, num_features:int, eps:float=1e-5, affine:bool=True):
     self.num_features, self.eps = num_features, eps
     self.weight: Optional[Tensor] = Tensor.ones(num_features) if affine else None
@@ -100,6 +237,22 @@ class InstanceNorm:
     return x * self.weight.reshape(1, -1, *[1] * (len(x.shape)-2)) + self.bias.reshape(1, -1, *[1] * (len(x.shape)-2))
 
 class LayerNorm:
+  """
+  Applies Layer Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/layer-normalization
+  - Paper: https://arxiv.org/abs/1607.06450v1
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.LayerNorm(3)
+  t = Tensor.rand(2, 5, 3) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, normalized_shape:Union[int, Tuple[int, ...]], eps:float=1e-5, elementwise_affine:bool=True):
     self.normalized_shape = (normalized_shape,) if isinstance(normalized_shape, int) else tuple(normalized_shape)
     self.axis, self.eps, self.elementwise_affine = tuple(-1-i for i in range(len(self.normalized_shape))), eps, elementwise_affine
@@ -112,13 +265,40 @@ class LayerNorm:
     return x * self.weight + self.bias
 
 class LayerNorm2d(LayerNorm):
+  """
+  Applies Layer Normalization over a mini-batch of 2D inputs.
+
+  See: `LayerNorm`
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.LayerNorm2d(3)
+  t = Tensor.rand(2, 3, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __call__(self, x): return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
 
 class Embedding:
+  """
+  A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Embedding
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  emb = nn.Embedding(10, 3)
+  print(emb(Tensor([1, 2, 3, 1])).numpy())
+  ```
+  """
   def __init__(self, vocab_size:int, embed_size:int):
-    self.
-    self.weight = Tensor.glorot_uniform(vocab_size, embed_size)
+    self.vocab_sz, self.embed_sz, self.weight = vocab_size, embed_size, Tensor.glorot_uniform(vocab_size, embed_size)
 
   def __call__(self, idx:Tensor) -> Tensor:
-    if
-
+    if idx.numel() == 0: return Tensor.empty(idx.shape+(self.embed_sz,), device=self.weight.device)
+    arange_shp, weight_shp, big_shp = (1, 1, self.vocab_sz, 1), (1, 1, self.vocab_sz, self.embed_sz), idx.shape+(self.vocab_sz, self.embed_sz,)
+    if not hasattr(self, 'arange'): self.arange = Tensor.arange(self.vocab_sz, requires_grad=False, device=self.weight.device).reshape(arange_shp)
+    arange, idx, vals = self.arange.expand(big_shp), idx.reshape(idx.shape+(1, 1,)).expand(big_shp), self.weight.reshape(weight_shp).expand(big_shp)
+    return (arange == idx).mul(vals).sum(2)
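Aside (not from the diff): because ConvTranspose2d now subclasses Conv2d and only overrides initialize_weight, the same hook can be reused for custom weight initialization in user code. A small sketch; the ZeroConv2d name is made up for illustration:

  from tinygrad import Tensor, nn

  class ZeroConv2d(nn.Conv2d):
    # Conv2d.__init__ is reused as-is; only the weight construction changes
    def initialize_weight(self, out_channels, in_channels, groups):
      return Tensor.zeros(out_channels, in_channels//groups, *self.kernel_size)

  conv = ZeroConv2d(3, 8, 3)
  print(conv(Tensor.rand(1, 3, 16, 16)).shape)  # (1, 8, 14, 14)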
tinygrad/nn/datasets.py
ADDED
@@ -0,0 +1,7 @@
+import gzip
+from tinygrad import Tensor, fetch
+
+def _fetch_mnist(file, offset): return Tensor(gzip.open(fetch("https://storage.googleapis.com/cvdf-datasets/mnist/"+file)).read()[offset:])
+def mnist():
+  return _fetch_mnist("train-images-idx3-ubyte.gz", 0x10).reshape(-1, 1, 28, 28), _fetch_mnist("train-labels-idx1-ubyte.gz", 8), \
+         _fetch_mnist("t10k-images-idx3-ubyte.gz", 0x10).reshape(-1, 1, 28, 28), _fetch_mnist("t10k-labels-idx1-ubyte.gz", 8)
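For reference (not part of the diff): the new helper downloads the four MNIST files on first call and returns image and label tensors directly, so typical usage is:

  from tinygrad.nn.datasets import mnist

  X_train, Y_train, X_test, Y_test = mnist()
  print(X_train.shape, Y_train.shape)  # (60000, 1, 28, 28) (60000,)
  print(X_test.shape, Y_test.shape)    # (10000, 1, 28, 28) (10000,)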
tinygrad/nn/optim.py
CHANGED
@@ -1,64 +1,144 @@
 # sorted in order of increasing complexity
 from typing import List
-from tinygrad.helpers import dedup
+from tinygrad.helpers import dedup, flatten, getenv
 from tinygrad.tensor import Tensor
+from tinygrad.dtype import dtypes, least_upper_dtype
 
 class Optimizer:
+  """
+  Base class for all optimizers.
+  """
   def __init__(self, params: List[Tensor], lr: float):
     # if it's None, but being put into an optimizer, set it to True
     for x in params:
       if x.requires_grad is None: x.requires_grad = True
 
     self.params: List[Tensor] = dedup([x for x in params if x.requires_grad])
+    assert len(self.params) != 0, "optimizer must have at least one param"
+    self.device = self.params[0].device
     self.buffers: List[Tensor] = dedup([x for x in params if not x.requires_grad]) # buffers are still realized
-
+    # store lr in at least float32 precision
+    self.lr = Tensor(lr if getenv("CONST_LR") else [lr], requires_grad=False, device=self.device,
+                     dtype=least_upper_dtype(dtypes.default_float, dtypes.float32))
 
   def zero_grad(self):
+    """
+    Zeroes the gradients of all the parameters.
+    """
     for param in self.params: param.grad = None
 
-  def
-
-
-
-
+  def step(self):
+    """
+    Performs a single optimization step.
+    """
+    Tensor.realize(*self.schedule_step())
+  def schedule_step(self) -> List[Tensor]:
+    """
+    Returns the tensors that need to be realized to perform a single optimization step.
+    """
+    assert Tensor.training, (
+            f"""Tensor.training={Tensor.training}, Tensor.training must be enabled to use the optimizer.
+                - help: Consider setting Tensor.training=True before calling Optimizer.step().""")
+    return self._step()+self.params+self.buffers
+  def _step(self) -> List[Tensor]: raise NotImplementedError
 
-class
-
+class OptimizerGroup(Optimizer):
+  """
+  Combines multiple optimizers into one.
+  """
+  def __init__(self, *optimizers: Optimizer): # pylint: disable=super-init-not-called
+    self.optimizers = optimizers
+    self.params, self.buffers = flatten([o.params for o in self.optimizers]), flatten([o.buffers for o in self.optimizers])
+  def __getitem__(self, i): return self.optimizers[i]
+  def zero_grad(self): [o.zero_grad() for o in self.optimizers]
+  def _step(self) -> List[Tensor]: return [x for o in self.optimizers for x in o._step()]
+
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+def SGD(params: List[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
+  """
+  Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
+
+  `classic` is a boolean flag that determines whether to use the popular momentum update rule or the classic momentum update rule.
+
+  - Described: https://paperswithcode.com/method/sgd
+  """
+  return LARS(params, lr, momentum, weight_decay, nesterov, classic, tcoef=0.0)
+
+class LARS(Optimizer):
+  """
+  Layer-wise Adaptive Rate Scaling (LARS) optimizer with optional momentum and weight decay.
+
+  - Described: https://paperswithcode.com/method/lars
+  - Paper: https://arxiv.org/abs/1708.03888v3
+  """
+  def __init__(self, params:List[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=False, classic=True, tcoef=0.001):
     super().__init__(params, lr)
-    self.momentum, self.wd, self.nesterov = momentum, weight_decay, nesterov
-    self.b = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
+    self.momentum, self.wd, self.nesterov, self.classic, self.tcoef = momentum, weight_decay, nesterov, classic, tcoef
+    self.b = [Tensor.zeros(*t.shape, dtype=t.dtype, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
 
-
-  def step(self) -> None:
+  def _step(self) -> List[Tensor]:
     for i, t in enumerate(self.params):
       assert t.grad is not None
-
+      # contiguous is needed since the grads can allegedly form a "diamond"
+      # TODO: fix this in lazy.py
+      g = t.grad.contiguous()
+      if self.tcoef != 0:
+        r1 = t.detach().square().sum().sqrt()
+        r2 = g.square().sum().sqrt()
+        r = (r1 > 0).where((r2 > 0).where(self.tcoef * r1 / (r2 + self.wd * r1), 1.0), 1.0)
+      else: r = 1.0
+      g = g + self.wd * t.detach()
+      # classic momentum does post learning rate update
+      if self.classic: g = g * r * self.lr
       if self.momentum:
-        self.b[i].assign(self.momentum * self.b[i] + g)
+        self.b[i].assign(self.momentum * self.b[i] + g) # NOTE: self.b[i] is zero on the first run, no if required
         g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
-
-
+      # popular momentum does pre learning rate update
+      if not self.classic: g = g * r * self.lr
+      t.assign((t.detach() - g).cast(t.dtype))
+    return self.b
 
 # LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
-def AdamW(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8,
-
+def AdamW(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
+  """
+  AdamW optimizer with optional weight decay.
+
+  - Described: https://paperswithcode.com/method/adamw
+  - Paper: https://arxiv.org/abs/1711.05101v3
+  """
+  return LAMB(params, lr, b1, b2, eps, weight_decay, adam=True)
+def Adam(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
+  """
+  Adam optimizer.
+
+  - Described: https://paperswithcode.com/method/adam
+  - Paper: https://arxiv.org/abs/1412.6980
+  """
+  return LAMB(params, lr, b1, b2, eps, 0.0, adam=True)
 
 class LAMB(Optimizer):
-
+  """
+  LAMB optimizer with optional weight decay.
+
+  - Described: https://paperswithcode.com/method/lamb
+  - Paper: https://arxiv.org/abs/1904.00962
+  """
+  def __init__(self, params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, adam=False):
     super().__init__(params, lr)
-    self.b1, self.b2, self.eps, self.wd, self.adam
-    self.
-    self.
+    self.b1, self.b2, self.eps, self.wd, self.adam = b1, b2, eps, weight_decay, adam
+    self.b1_t, self.b2_t = (Tensor([1], dtype=dtypes.float32, device=self.device, requires_grad=False).realize() for _ in [b1, b2])
+    self.m = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params]
+    self.v = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params]
 
-  def
-    self.
+  def _step(self) -> List[Tensor]:
+    self.b1_t *= self.b1
+    self.b2_t *= self.b2
     for i, t in enumerate(self.params):
       assert t.grad is not None
-
-      self.
-
-
-      v_hat = self.v[i] / (1.0 - self.b2**self.t)
+      self.m[i].assign(self.b1 * self.m[i] + (1.0 - self.b1) * t.grad)
+      self.v[i].assign(self.b2 * self.v[i] + (1.0 - self.b2) * (t.grad * t.grad))
+      m_hat = self.m[i] / (1.0 - self.b1_t)
+      v_hat = self.v[i] / (1.0 - self.b2_t)
       up = (m_hat / (v_hat.sqrt() + self.eps)) + self.wd * t.detach()
       if not self.adam:
         r1 = t.detach().square().sum().sqrt()
@@ -66,5 +146,5 @@ class LAMB(Optimizer):
         r = Tensor.where(r1 > 0, Tensor.where(r2 > 0, r1 / r2, 1.0), 1.0)
       else:
         r = 1.0
-      t.assign(t.detach() - self.lr * r * up)
-      self.
+      t.assign((t.detach() - self.lr * r * up).cast(t.dtype))
+    return [self.b1_t, self.b2_t] + self.m + self.v
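Usage sketch (not part of the diff) for the new step/schedule_step contract: Tensor.training must be enabled, and step() realizes everything schedule_step() returns. The parameter and data below are dummies, and Tensor.train() is the context manager that flips Tensor.training:

  from tinygrad import Tensor
  from tinygrad.nn.optim import Adam

  w = Tensor.rand(4, 2, requires_grad=True)
  opt = Adam([w], lr=1e-3)
  with Tensor.train():                      # sets Tensor.training = True
    x, y = Tensor.rand(8, 4), Tensor.rand(8, 2)
    loss = (x.matmul(w) - y).square().mean()
    opt.zero_grad()
    loss.backward()
    opt.step()                              # same as Tensor.realize(*opt.schedule_step())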