tinygrad 0.8.0-py3-none-any.whl → 0.9.1-py3-none-any.whl
This diff compares the published contents of two versions of the package as they appear in their public registry. It is provided for informational purposes only.
- tinygrad/__init__.py +6 -6
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +253 -225
- tinygrad/codegen/linearizer.py +398 -436
- tinygrad/codegen/uops.py +451 -0
- tinygrad/device.py +268 -274
- tinygrad/dtype.py +56 -40
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +198 -0
- tinygrad/engine/realize.py +192 -0
- tinygrad/engine/schedule.py +370 -0
- tinygrad/engine/search.py +199 -0
- tinygrad/{mlops.py → function.py} +40 -32
- tinygrad/helpers.py +144 -46
- tinygrad/lazy.py +143 -242
- tinygrad/multi.py +173 -0
- tinygrad/nn/__init__.py +180 -9
- tinygrad/nn/datasets.py +8 -0
- tinygrad/nn/optim.py +106 -28
- tinygrad/nn/state.py +87 -19
- tinygrad/ops.py +104 -45
- tinygrad/renderer/__init__.py +65 -0
- tinygrad/renderer/assembly.py +269 -0
- tinygrad/renderer/cstyle.py +308 -210
- tinygrad/renderer/llvmir.py +119 -124
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +13403 -0
- tinygrad/runtime/autogen/comgr.py +891 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5893 -0
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33597 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +56 -0
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +39 -0
- tinygrad/runtime/graph/cuda.py +59 -54
- tinygrad/runtime/graph/hcq.py +187 -0
- tinygrad/runtime/graph/metal.py +37 -41
- tinygrad/runtime/ops_amd.py +550 -0
- tinygrad/runtime/ops_clang.py +16 -14
- tinygrad/runtime/ops_cuda.py +129 -37
- tinygrad/runtime/ops_disk.py +111 -43
- tinygrad/runtime/ops_gpu.py +52 -50
- tinygrad/runtime/ops_llvm.py +36 -56
- tinygrad/runtime/ops_metal.py +41 -24
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +625 -0
- tinygrad/runtime/ops_python.py +208 -0
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +46 -107
- tinygrad/shape/symbolic.py +99 -98
- tinygrad/shape/view.py +162 -45
- tinygrad/tensor.py +2492 -483
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/features/image.py +0 -93
- tinygrad/features/multi.py +0 -103
- tinygrad/features/search.py +0 -160
- tinygrad/graph.py +0 -106
- tinygrad/jit.py +0 -152
- tinygrad/realize.py +0 -50
- tinygrad/runtime/graph/hip.py +0 -24
- tinygrad/runtime/ops_cpu.py +0 -45
- tinygrad/runtime/ops_hip.py +0 -97
- tinygrad/runtime/ops_torch.py +0 -49
- tinygrad-0.8.0.dist-info/RECORD +0 -41
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
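The list above reflects the 0.9.1 reorganization: scheduling, the JIT, kernel search, and graphing moved from the top level and `tinygrad/features/` into the new `tinygrad/engine/` package, `mlops.py` was renamed to `function.py`, multi-device support moved to `tinygrad/multi.py`, and the numpy/torch/HIP runtimes were replaced by `ops_python.py`, `ops_amd.py`, and `ops_nv.py`. As a rough, non-authoritative sketch of what that means for downstream imports (module paths are taken from the list above; whether each symbol kept its exact name is not visible in this diff):

```python
# Hypothetical migration notes based only on the file moves listed above.
# The top-level re-exports (Tensor, TinyJit, dtypes) are expected to keep working:
from tinygrad import Tensor, TinyJit, dtypes

# Internal module paths, 0.8.0 -> 0.9.1 (per the file list; names inside may differ):
#   tinygrad/jit.py             -> tinygrad/engine/jit.py
#   tinygrad/realize.py         -> tinygrad/engine/realize.py
#   tinygrad/graph.py           -> tinygrad/engine/graph.py
#   tinygrad/features/search.py -> tinygrad/engine/search.py
#   tinygrad/features/multi.py  -> tinygrad/multi.py
#   tinygrad/mlops.py           -> tinygrad/function.py
```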
tinygrad/nn/__init__.py
CHANGED
````diff
@@ -2,9 +2,33 @@ import math
 from typing import Optional, Union, Tuple, cast
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import prod
-from tinygrad.nn import optim, state # noqa: F401
+from tinygrad.nn import optim, state, datasets # noqa: F401
 
 class BatchNorm2d:
+  """
+  Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension).
+
+  - Described: https://paperswithcode.com/method/batch-normalization
+  - Paper: https://arxiv.org/abs/1502.03167v3
+
+  See: `Tensor.batchnorm`
+
+  ```python exec="true" session="tensor"
+  from tinygrad import Tensor, dtypes, nn
+  import numpy as np
+  np.set_printoptions(precision=4)
+  ```
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.BatchNorm2d(3)
+  t = Tensor.rand(2, 3, 4, 4)
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, sz:int, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1):
     self.eps, self.track_running_stats, self.momentum = eps, track_running_stats, momentum
 
@@ -20,7 +44,7 @@ class BatchNorm2d:
     # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh
     # There's "online" algorithms that fix this, like https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm
     batch_mean = x.mean(axis=(0,2,3))
-    y = (x - batch_mean.reshape(shape=[1, -1, 1, 1]))
+    y = (x - batch_mean.detach().reshape(shape=[1, -1, 1, 1]))  # d(var)/d(mean) = 0
     batch_var = (y*y).mean(axis=(0,2,3))
     batch_invstd = batch_var.add(self.eps).pow(-0.5)
 
@@ -38,9 +62,39 @@ class BatchNorm2d:
 
 # TODO: these Conv lines are terrible
 def Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
+  """
+  Applies a 1D convolution over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.Conv1d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   return Conv2d(in_channels, out_channels, (kernel_size,), stride, padding, dilation, groups, bias)
 
 class Conv2d:
+  """
+  Applies a 2D convolution over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.Conv2d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
     self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
     self.stride, self.padding, self.dilation, self.groups = stride, padding, dilation, groups
@@ -55,9 +109,39 @@ class Conv2d:
     return Tensor.kaiming_uniform(out_channels, in_channels//groups, *self.kernel_size, a=math.sqrt(5))
 
 def ConvTranspose1d(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
+  """
+  Applies a 1D transposed convolution operator over an input signal composed of several input planes.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.ConvTranspose1d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   return ConvTranspose2d(in_channels, out_channels, (kernel_size,), stride, padding, output_padding, dilation, groups, bias)
 
 class ConvTranspose2d(Conv2d):
+  """
+  Applies a 2D transposed convolution operator over an input image.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  conv = nn.ConvTranspose2d(1, 1, 3)
+  t = Tensor.rand(1, 1, 4, 4)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = conv(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True):
     super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
     self.output_padding = output_padding
@@ -70,6 +154,21 @@ class ConvTranspose2d(Conv2d):
     return Tensor.kaiming_uniform(in_channels, out_channels//groups, *self.kernel_size, a=math.sqrt(5))
 
 class Linear:
+  """
+  Applies a linear transformation to the incoming data.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Linear
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  lin = nn.Linear(3, 4)
+  t = Tensor.rand(2, 3)
+  print(t.numpy())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = lin(t)
+  print(t.numpy())
+  ```
+  """
   def __init__(self, in_features, out_features, bias=True):
     # TODO: is this init good? torch inits to uniform(-1/sqrt(in_features), 1/sqrt(in_features))
     self.weight = Tensor.kaiming_uniform(out_features, in_features, a=math.sqrt(5))
@@ -80,6 +179,22 @@ class Linear:
     return x.linear(self.weight.transpose(), self.bias)
 
 class GroupNorm:
+  """
+  Applies Group Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/group-normalization
+  - Paper: https://arxiv.org/abs/1803.08494v3
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.GroupNorm(2, 12)
+  t = Tensor.rand(2, 12, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, num_groups:int, num_channels:int, eps:float=1e-5, affine:bool=True):
     self.num_groups, self.num_channels, self.eps = num_groups, num_channels, eps
     self.weight: Optional[Tensor] = Tensor.ones(num_channels) if affine else None
@@ -95,6 +210,22 @@ class GroupNorm:
     return x * self.weight.reshape(1, -1, *[1] * (len(x.shape)-2)) + self.bias.reshape(1, -1, *[1] * (len(x.shape)-2))
 
 class InstanceNorm:
+  """
+  Applies Instance Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/instance-normalization
+  - Paper: https://arxiv.org/abs/1607.08022v3
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.InstanceNorm(3)
+  t = Tensor.rand(2, 3, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, num_features:int, eps:float=1e-5, affine:bool=True):
     self.num_features, self.eps = num_features, eps
     self.weight: Optional[Tensor] = Tensor.ones(num_features) if affine else None
@@ -106,6 +237,22 @@ class InstanceNorm:
     return x * self.weight.reshape(1, -1, *[1] * (len(x.shape)-2)) + self.bias.reshape(1, -1, *[1] * (len(x.shape)-2))
 
 class LayerNorm:
+  """
+  Applies Layer Normalization over a mini-batch of inputs.
+
+  - Described: https://paperswithcode.com/method/layer-normalization
+  - Paper: https://arxiv.org/abs/1607.06450v1
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.LayerNorm(3)
+  t = Tensor.rand(2, 5, 3) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __init__(self, normalized_shape:Union[int, Tuple[int, ...]], eps:float=1e-5, elementwise_affine:bool=True):
     self.normalized_shape = (normalized_shape,) if isinstance(normalized_shape, int) else tuple(normalized_shape)
     self.axis, self.eps, self.elementwise_affine = tuple(-1-i for i in range(len(self.normalized_shape))), eps, elementwise_affine
@@ -118,16 +265,40 @@ class LayerNorm:
     return x * self.weight + self.bias
 
 class LayerNorm2d(LayerNorm):
+  """
+  Applies Layer Normalization over a mini-batch of 2D inputs.
+
+  See: `LayerNorm`
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  norm = nn.LayerNorm2d(3)
+  t = Tensor.rand(2, 3, 4, 4) * 2 + 1
+  print(t.mean().item(), t.std().item())
+  ```
+  ```python exec="true" source="above" session="tensor" result="python"
+  t = norm(t)
+  print(t.mean().item(), t.std().item())
+  ```
+  """
   def __call__(self, x): return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
 
 class Embedding:
+  """
+  A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+  See: https://pytorch.org/docs/stable/generated/torch.nn.Embedding
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  emb = nn.Embedding(10, 3)
+  print(emb(Tensor([1, 2, 3, 1])).numpy())
+  ```
+  """
   def __init__(self, vocab_size:int, embed_size:int):
-    self.
-    self.weight = Tensor.glorot_uniform(vocab_size, embed_size)
+    self.vocab_sz, self.embed_sz, self.weight = vocab_size, embed_size, Tensor.glorot_uniform(vocab_size, embed_size)
 
   def __call__(self, idx:Tensor) -> Tensor:
-    if
-
-
-
-    return (
+    if idx.numel() == 0: return Tensor.empty(idx.shape+(self.embed_sz,), device=self.weight.device)
+    arange_shp, weight_shp, big_shp = (1, 1, self.vocab_sz, 1), (1, 1, self.vocab_sz, self.embed_sz), idx.shape+(self.vocab_sz, self.embed_sz,)
+    if not hasattr(self, 'arange'): self.arange = Tensor.arange(self.vocab_sz, requires_grad=False, device=self.weight.device).reshape(arange_shp)
+    arange, idx, vals = self.arange.expand(big_shp), idx.reshape(idx.shape+(1, 1,)).expand(big_shp), self.weight.reshape(weight_shp).expand(big_shp)
+    return (arange == idx).mul(vals).sum(2)
````
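The rewritten `Embedding.__call__` above replaces the old lookup with a broadcasted one-hot gather: an `arange` over the vocabulary is compared against the indices, and the resulting mask selects rows of the weight matrix. A standalone sketch of that pattern, with shapes following the `arange_shp`/`weight_shp`/`big_shp` names in the diff (assumes a working tinygrad install; the example values are made up):

```python
from tinygrad import Tensor

vocab_sz, embed_sz = 10, 3
weight = Tensor.glorot_uniform(vocab_sz, embed_sz)        # (vocab_sz, embed_sz), like Embedding.weight
idx = Tensor([[1, 2, 3, 1]])                              # (batch=1, seq=4) integer indices

big_shp = idx.shape + (vocab_sz, embed_sz)
arange = Tensor.arange(vocab_sz).reshape(1, 1, vocab_sz, 1).expand(big_shp)
onehot = arange == idx.reshape(idx.shape + (1, 1)).expand(big_shp)   # one-hot mask over the vocab axis
out = onehot.mul(weight.reshape(1, 1, vocab_sz, embed_sz).expand(big_shp)).sum(2)
print(out.shape)  # (1, 4, 3): one embedding row per index
```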
tinygrad/nn/datasets.py
ADDED
````diff
@@ -0,0 +1,8 @@
+import gzip
+from tinygrad.tensor import Tensor
+from tinygrad.helpers import fetch
+
+def _fetch_mnist(file, offset): return Tensor(gzip.open(fetch("https://storage.googleapis.com/cvdf-datasets/mnist/"+file)).read()[offset:])
+def mnist():
+  return _fetch_mnist("train-images-idx3-ubyte.gz", 0x10).reshape(-1, 1, 28, 28), _fetch_mnist("train-labels-idx1-ubyte.gz", 8), \
+         _fetch_mnist("t10k-images-idx3-ubyte.gz", 0x10).reshape(-1, 1, 28, 28), _fetch_mnist("t10k-labels-idx1-ubyte.gz", 8)
````
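The new `nn.datasets.mnist` helper returns the four MNIST splits as tensors built from the raw IDX bytes. A minimal usage sketch (the first call needs network access, since `fetch` downloads the files):

```python
from tinygrad.nn.datasets import mnist

X_train, Y_train, X_test, Y_test = mnist()
print(X_train.shape, Y_train.shape)  # (60000, 1, 28, 28) (60000,)
print(X_test.shape, Y_test.shape)    # (10000, 1, 28, 28) (10000,)
```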
tinygrad/nn/optim.py
CHANGED
````diff
@@ -1,9 +1,13 @@
 # sorted in order of increasing complexity
 from typing import List
-from tinygrad.helpers import dedup, getenv
+from tinygrad.helpers import dedup, flatten, getenv
 from tinygrad.tensor import Tensor
+from tinygrad.dtype import dtypes, least_upper_dtype
 
 class Optimizer:
+  """
+  Base class for all optimizers.
+  """
   def __init__(self, params: List[Tensor], lr: float):
     # if it's None, but being put into an optimizer, set it to True
     for x in params:
@@ -13,54 +17,128 @@ class Optimizer:
     assert len(self.params) != 0, "optimizer must have at least one param"
     self.device = self.params[0].device
     self.buffers: List[Tensor] = dedup([x for x in params if not x.requires_grad]) # buffers are still realized
-
+    # store lr in at least float32 precision
+    self.lr = Tensor(lr if getenv("CONST_LR") else [lr], requires_grad=False, device=self.device,
+                     dtype=least_upper_dtype(dtypes.default_float, dtypes.float32))
 
   def zero_grad(self):
+    """
+    Zeroes the gradients of all the parameters.
+    """
     for param in self.params: param.grad = None
 
-  def
-
-
+  def step(self):
+    """
+    Performs a single optimization step.
+    """
+    Tensor.realize(*self.schedule_step())
+  def schedule_step(self) -> List[Tensor]:
+    """
+    Returns the tensors that need to be realized to perform a single optimization step.
+    """
+    assert Tensor.training, (
+            f"""Tensor.training={Tensor.training}, Tensor.training must be enabled to use the optimizer.
+            - help: Consider setting Tensor.training=True before calling Optimizer.step().""")
+    return self._step()+self.params+self.buffers
+  def _step(self) -> List[Tensor]: raise NotImplementedError
 
-class
-
+class OptimizerGroup(Optimizer):
+  """
+  Combines multiple optimizers into one.
+  """
+  def __init__(self, *optimizers: Optimizer): # pylint: disable=super-init-not-called
+    self.optimizers = optimizers
+    self.params, self.buffers = flatten([o.params for o in self.optimizers]), flatten([o.buffers for o in self.optimizers])
+  def __getitem__(self, i): return self.optimizers[i]
+  def zero_grad(self): [o.zero_grad() for o in self.optimizers]
+  def _step(self) -> List[Tensor]: return [x for o in self.optimizers for x in o._step()]
+
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+def SGD(params: List[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
+  """
+  Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
+
+  `classic` is a boolean flag that determines whether to use the popular momentum update rule or the classic momentum update rule.
+
+  - Described: https://paperswithcode.com/method/sgd
+  """
+  return LARS(params, lr, momentum, weight_decay, nesterov, classic, tcoef=0.0)
+
+class LARS(Optimizer):
+  """
+  Layer-wise Adaptive Rate Scaling (LARS) optimizer with optional momentum and weight decay.
+
+  - Described: https://paperswithcode.com/method/lars
+  - Paper: https://arxiv.org/abs/1708.03888v3
+  """
+  def __init__(self, params:List[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=False, classic=True, tcoef=0.001):
     super().__init__(params, lr)
-    self.momentum, self.wd, self.nesterov = momentum, weight_decay, nesterov
-    self.b = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
+    self.momentum, self.wd, self.nesterov, self.classic, self.tcoef = momentum, weight_decay, nesterov, classic, tcoef
+    self.b = [Tensor.zeros(*t.shape, dtype=t.dtype, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
 
-
-  def step(self) -> None:
+  def _step(self) -> List[Tensor]:
     for i, t in enumerate(self.params):
       assert t.grad is not None
-      #
+      # contiguous is needed since the grads can allegedly form a "diamond"
       # TODO: fix this in lazy.py
-      t.grad.
-
+      g = t.grad.contiguous()
+      if self.tcoef != 0:
+        r1 = t.detach().square().sum().sqrt()
+        r2 = g.square().sum().sqrt()
+        r = (r1 > 0).where((r2 > 0).where(self.tcoef * r1 / (r2 + self.wd * r1), 1.0), 1.0)
+      else: r = 1.0
+      g = g + self.wd * t.detach()
+      # classic momentum does post learning rate update
+      if self.classic: g = g * r * self.lr
       if self.momentum:
         self.b[i].assign(self.momentum * self.b[i] + g) # NOTE: self.b[i] is zero on the first run, no if required
         g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
-
-
+      # popular momentum does pre learning rate update
+      if not self.classic: g = g * r * self.lr
+      t.assign((t.detach() - g).cast(t.dtype))
+    return self.b
 
 # LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
-def AdamW(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8,
-
+def AdamW(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
+  """
+  AdamW optimizer with optional weight decay.
+
+  - Described: https://paperswithcode.com/method/adamw
+  - Paper: https://arxiv.org/abs/1711.05101v3
+  """
+  return LAMB(params, lr, b1, b2, eps, weight_decay, adam=True)
+def Adam(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
+  """
+  Adam optimizer.
+
+  - Described: https://paperswithcode.com/method/adam
+  - Paper: https://arxiv.org/abs/1412.6980
+  """
+  return LAMB(params, lr, b1, b2, eps, 0.0, adam=True)
 
 class LAMB(Optimizer):
-
+  """
+  LAMB optimizer with optional weight decay.
+
+  - Described: https://paperswithcode.com/method/lamb
+  - Paper: https://arxiv.org/abs/1904.00962
+  """
+  def __init__(self, params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, adam=False):
     super().__init__(params, lr)
-    self.b1, self.b2, self.eps, self.wd, self.adam
-    self.
-    self.
+    self.b1, self.b2, self.eps, self.wd, self.adam = b1, b2, eps, weight_decay, adam
+    self.b1_t, self.b2_t = (Tensor([1], dtype=dtypes.float32, device=self.device, requires_grad=False).realize() for _ in [b1, b2])
+    self.m = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params]
+    self.v = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params]
 
-  def
-    self.
+  def _step(self) -> List[Tensor]:
+    self.b1_t *= self.b1
+    self.b2_t *= self.b2
     for i, t in enumerate(self.params):
       assert t.grad is not None
       self.m[i].assign(self.b1 * self.m[i] + (1.0 - self.b1) * t.grad)
       self.v[i].assign(self.b2 * self.v[i] + (1.0 - self.b2) * (t.grad * t.grad))
-      m_hat = self.m[i] / (1.0 - self.
-      v_hat = self.v[i] / (1.0 - self.
+      m_hat = self.m[i] / (1.0 - self.b1_t)
+      v_hat = self.v[i] / (1.0 - self.b2_t)
       up = (m_hat / (v_hat.sqrt() + self.eps)) + self.wd * t.detach()
       if not self.adam:
         r1 = t.detach().square().sum().sqrt()
@@ -68,5 +146,5 @@ class LAMB(Optimizer):
         r = Tensor.where(r1 > 0, Tensor.where(r2 > 0, r1 / r2, 1.0), 1.0)
       else:
         r = 1.0
-      t.assign(t.detach() - self.lr * r * up)
-      self.
+      t.assign((t.detach() - self.lr * r * up).cast(t.dtype))
+    return [self.b1_t, self.b2_t] + self.m + self.v
````
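The `Optimizer` base class now splits `step()` into `schedule_step()`, which returns the tensors to realize and asserts `Tensor.training`, followed by a single `Tensor.realize(...)` call, and `OptimizerGroup` lets several optimizers share one `step()`/`zero_grad()`. A small usage sketch under those assumptions (the parameter shapes and the loss are made up for illustration):

```python
from tinygrad import Tensor
from tinygrad.nn.optim import AdamW, SGD, OptimizerGroup

w1 = Tensor.randn(4, 4, requires_grad=True)
w2 = Tensor.randn(4, requires_grad=True)
# one optimizer per parameter group, combined so step()/zero_grad() hit both
opt = OptimizerGroup(AdamW([w1], lr=1e-3), SGD([w2], lr=1e-2, momentum=0.9))

with Tensor.train():                      # satisfies the Tensor.training assert in schedule_step()
  loss = ((w1.sum(axis=1) + w2) ** 2).sum()
  opt.zero_grad()
  loss.backward()
  opt.step()                              # realizes everything returned by schedule_step()
```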
tinygrad/nn/state.py
CHANGED
````diff
@@ -1,32 +1,49 @@
 import os, json, pathlib, zipfile, pickle, tarfile, struct
-from tqdm import tqdm
 from typing import Dict, Union, List, Optional, Any, Tuple
 from tinygrad.tensor import Tensor
-from tinygrad.ops import GlobalCounters
 from tinygrad.dtype import dtypes
-from tinygrad.helpers import prod, argsort, DEBUG, Timing, CI, unwrap
+from tinygrad.helpers import prod, argsort, DEBUG, Timing, CI, unwrap, GlobalCounters, tqdm
 from tinygrad.shape.view import strides_for_shape
+from tinygrad.multi import MultiLazyBuffer
 
-safe_dtypes = {"
-               "
+safe_dtypes = {"BOOL":dtypes.bool, "I8":dtypes.int8, "U8":dtypes.uint8, "I16":dtypes.int16, "U16":dtypes.uint16, "I32":dtypes.int, "U32":dtypes.uint,
+               "I64":dtypes.int64, "U64":dtypes.uint64, "F16":dtypes.float16, "BF16":dtypes.bfloat16, "F32":dtypes.float32, "F64":dtypes.float64}
 inverse_safe_dtypes = {v:k for k,v in safe_dtypes.items()}
 
 def safe_load_metadata(fn:Union[Tensor,str]) -> Tuple[Tensor, int, Any]:
+  """
+  Loads a .safetensor file from disk, returning the data, metadata length, and metadata.
+  """
   t = fn if isinstance(fn, Tensor) else Tensor.empty(os.stat(fn).st_size, dtype=dtypes.uint8, device=f"disk:{fn}")
-  json_len = t[0:
-  return
+  json_len = t[0:8].bitcast(dtypes.int64).item()
+  return t, json_len, json.loads(t[8:8+json_len].numpy().tobytes())
 
 def safe_load(fn:Union[Tensor,str]) -> Dict[str, Tensor]:
+  """
+  Loads a .safetensor file from disk, returning the state_dict.
+
+  ```python
+  state_dict = nn.state.safe_load("test.safetensor")
+  ```
+  """
   t, json_len, metadata = safe_load_metadata(fn)
   ret = {}
   for k,v in metadata.items():
     if k == "__metadata__": continue
     dtype = safe_dtypes[v['dtype']]
-    sz = (v['data_offsets'][1]-v['data_offsets'][0])
-    ret[k] = t[8+json_len+v['data_offsets'][0]:8+json_len+v['data_offsets'][0]+sz].
+    sz = (v['data_offsets'][1]-v['data_offsets'][0])
+    ret[k] = t[8+json_len+v['data_offsets'][0]:8+json_len+v['data_offsets'][0]+sz].bitcast(dtype).reshape(v['shape'])
   return ret
 
 def safe_save(tensors:Dict[str, Tensor], fn:str, metadata:Optional[Dict[str, Any]]=None):
+  """
+  Saves a state_dict to disk in a .safetensor file with optional metadata.
+
+  ```python
+  t = Tensor([1, 2, 3])
+  nn.state.safe_save({'t':t}, "test.safetensor")
+  ```
+  """
   headers, offset = {}, 0
   if metadata: headers['__metadata__'] = metadata
   for k,v in tensors.items():
@@ -36,14 +53,27 @@ def safe_save(tensors:Dict[str, Tensor], fn:str, metadata:Optional[Dict[str, Any
   j += "\x20"*((8-len(j)%8)%8)
   pathlib.Path(fn).unlink(missing_ok=True)
   t = Tensor.empty(8+len(j)+offset, dtype=dtypes.uint8, device=f"disk:{fn}")
-  t[0:
-  t[8:8+len(j)].assign(
+  t[0:8].bitcast(dtypes.int64).assign([len(j)])
+  t[8:8+len(j)].assign(list(j.encode('utf-8')))
   for k,v in safe_load(t).items(): v.assign(tensors[k])
 
 # state dict
 
 from collections import OrderedDict
 def get_state_dict(obj, prefix:str='', tensor_type=Tensor) -> Dict[str, Tensor]:
+  """
+  Returns a state_dict of the object, with optional prefix.
+
+  ```python exec="true" source="above" session="tensor" result="python"
+  class Net:
+    def __init__(self):
+      self.l1 = nn.Linear(4, 5)
+      self.l2 = nn.Linear(5, 6)
+
+  net = Net()
+  print(nn.state.get_state_dict(net).keys())
+  ```
+  """
   if isinstance(obj, tensor_type): return {prefix.strip('.'):obj}
   if hasattr(obj, '_asdict'): return get_state_dict(obj._asdict(), prefix, tensor_type) # namedtuple
   if isinstance(obj, OrderedDict): return get_state_dict(dict(obj), prefix, tensor_type)
@@ -54,24 +84,61 @@ def get_state_dict(obj, prefix:str='', tensor_type=Tensor) -> Dict[str, Tensor]:
   elif isinstance(obj, dict):
     for k,v in obj.items(): state_dict.update(get_state_dict(v, f"{prefix}{str(k)}.", tensor_type))
   return state_dict
-def get_parameters(obj) -> List[Tensor]:
+def get_parameters(obj) -> List[Tensor]:
+  """
+  ```python exec="true" source="above" session="tensor" result="python"
+  class Net:
+    def __init__(self):
+      self.l1 = nn.Linear(4, 5)
+      self.l2 = nn.Linear(5, 6)
 
-
+  net = Net()
+  print(len(nn.state.get_parameters(net)))
+  ```
+  """
+  return list(get_state_dict(obj).values())
+
+def load_state_dict(model, state_dict:Dict[str, Tensor], strict=True, verbose=True, consume=False) -> None:
+  """
+  Loads a state_dict into a model.
+
+  ```python
+  class Net:
+    def __init__(self):
+      self.l1 = nn.Linear(4, 5)
+      self.l2 = nn.Linear(5, 6)
+
+  net = Net()
+  state_dict = nn.state.get_state_dict(net)
+  nn.state.load_state_dict(net, state_dict)
+  ```
+  """
   start_mem_used = GlobalCounters.mem_used
   with Timing("loaded weights in ", lambda et_ns: f", {(GlobalCounters.mem_used-start_mem_used)/1e9:.2f} GB loaded at {(GlobalCounters.mem_used-start_mem_used)/et_ns:.2f} GB/s"): # noqa: E501
     model_state_dict = get_state_dict(model)
     if DEBUG >= 1 and len(state_dict) > len(model_state_dict):
       print("WARNING: unused weights in state_dict", sorted(list(state_dict.keys() - model_state_dict.keys())))
     for k,v in (t := tqdm(model_state_dict.items(), disable=CI or not verbose)):
-      t.
+      t.desc = f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, {k:50s}: "
      if k not in state_dict and not strict:
        if DEBUG >= 1: print(f"WARNING: not loading {k}")
        continue
-
+      if isinstance((mlb:=v.lazydata), MultiLazyBuffer):
+        if isinstance(state_dict[k].lazydata, MultiLazyBuffer): v.replace(state_dict[k]).realize()
+        else: v.replace(state_dict[k].shard(mlb.device, mlb.axis)).realize()
+      else: v.replace(state_dict[k].to(v.device)).realize()
+      if consume: del state_dict[k]
 
 # torch support!
 
 def torch_load(fn:str) -> Dict[str, Tensor]:
+  """
+  Loads a torch .pth file from disk.
+
+  ```python
+  state_dict = nn.state.torch_load("test.pth")
+  ```
+  """
   t = Tensor.empty(os.stat(fn).st_size, dtype=dtypes.uint8, device=f"disk:{fn}")
 
   offsets: Dict[Union[str, int], int] = {}
@@ -81,7 +148,7 @@ def torch_load(fn:str) -> Dict[str, Tensor]:
     lens[storage[2]] = storage[4] * storage[1].itemsize
     if storage[2] not in offsets: return None
     byte_offset = offsets[storage[2]]+storage_offset*storage[1].itemsize
-    ret = t[byte_offset:byte_offset+prod(size)].
+    ret = t[byte_offset:byte_offset+prod(size)*storage[1].itemsize].bitcast(storage[1])
 
     # 7 lines to deal with permuted tensors. NOTE: this currently requires reading off the disk
     shape_strides = [(s, st) for s,st in zip(size, stride) if s != 1]
@@ -89,10 +156,11 @@ def torch_load(fn:str) -> Dict[str, Tensor]:
     if tuple(permute_indexes) != tuple(range(len(permute_indexes))):
       intermediate_shape = tuple([shape_strides[x][0] for x in argsort(permute_indexes)])
       assert tuple([shape_strides[i][1] for i in argsort(permute_indexes)]) == strides_for_shape(intermediate_shape), "nonpermutable strides"
-      if DEBUG >= 3: print(f"WARNING: this torch load is slow.
-      assert storage[1] != dtypes.bfloat16, "can't
+      if DEBUG >= 3: print(f"WARNING: this torch load is slow. CLANG to permute {intermediate_shape} with {permute_indexes}")
+      assert storage[1] != dtypes.bfloat16, "can't CLANG permute BF16"
       # TODO: find a nice way to support all shapetracker on disktensors
-
+      # TODO: BUG: a ".realize()" is needed here for 'GPU=1 python3 test/models/test_efficientnet.py TestEfficientNet.test_car'
+      ret = ret.clang().reshape(intermediate_shape).permute(permute_indexes).realize()
 
     return ret.reshape(size)
 
````
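Putting the new `nn.state` helpers together, a minimal round-trip sketch based on the docstring examples in the diff above (the file path is illustrative only):

```python
from tinygrad import Tensor, nn

t = Tensor([1, 2, 3])
nn.state.safe_save({"t": t}, "/tmp/test.safetensor")   # writes the 8-byte header length, JSON metadata, then raw data
loaded = nn.state.safe_load("/tmp/test.safetensor")    # dict of disk-backed Tensors, keyed by name
print(loaded["t"].numpy())                             # [1 2 3]
```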