tiny-recursive-model 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tiny_recursive_model/__init__.py +7 -0
- tiny_recursive_model/trainer.py +157 -0
- tiny_recursive_model/trm.py +74 -85
- {tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/METADATA +48 -1
- tiny_recursive_model-0.0.3.dist-info/RECORD +8 -0
- tiny_recursive_model-0.0.1.dist-info/RECORD +0 -7
- {tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/WHEEL +0 -0
- {tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/licenses/LICENSE +0 -0
tiny_recursive_model/trainer.py
ADDED
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+import torch
+from torch.nn import Module
+from torch.optim import AdamW
+from torch.utils.data import Dataset, DataLoader
+
+from einops import pack, unpack
+
+from accelerate import Accelerator
+
+# ema - apparently greatly helped with results
+
+from ema_pytorch import EMA
+
+from tiny_recursive_model.trm import TinyRecursiveModel
+
+# helpers
+
+def range_from_one(n):
+    return range(1, n + 1)
+
+def is_empty(t):
+    return t.numel() == 0
+
+# trainer
+
+def newtonschulz5(
+    t,
+    steps = 5,
+    eps = 1e-7,
+    coefs = (3.4445, -4.7750, 2.0315)
+):
+    if t.ndim <= 3:
+        return t
+
+    shape = t.shape
+    should_transpose = shape[-2] > shape[-1]
+
+    if should_transpose:
+        t = t.transpose(-1, -2)
+
+    t, packed_shape = pack([t], '* i j')
+    t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)
+
+    a, b, c = coefs
+
+    for _ in range(steps):
+        A = t @ t.transpose(-1, -2)
+        B = b * A + c * A @ A
+        t = a * t + B @ t
+
+    t, = unpack(t, packed_shape, '* i j')
+
+    if should_transpose:
+        t = t.transpose(-1, -2)
+
+    return t
+
+class Trainer(Module):
+    def __init__(
+        self,
+        model: TinyRecursiveModel | Module,
+        dataset: Dataset,
+        optim_klass = AdamW,
+        learning_rate = 1e-4,
+        weight_decay = 1.,
+        batch_size = 16,
+        epochs = 2,
+        halt_prob_thres = 0.5,
+        max_recurrent_steps = 12,
+        ema_decay_rate = 0.999,
+        switch_ema_every = 10000, # switch ema https://arxiv.org/abs/2402.09240
+        accelerate_kwargs: dict = dict(),
+        cpu = False
+    ):
+        super().__init__()
+
+        self.accelerator = Accelerator(**accelerate_kwargs, cpu = cpu)
+
+        self.batch_size = batch_size
+        self.epochs = epochs
+
+        self.dataset = dataset
+        self.dataloader = dataloader = DataLoader(self.dataset, batch_size = self.batch_size, shuffle = True)
+
+        self.optim = optim_klass(
+            model.parameters(),
+            lr = learning_rate,
+            weight_decay = weight_decay
+        )
+
+        self.model = model
+
+        # ema model
+
+        self.ema_model = None
+
+        if self.accelerator.is_main_process:
+            self.ema_model = EMA(
+                model,
+                beta = ema_decay_rate,
+                update_model_with_ema_every = switch_ema_every,
+                forward_method_names = ('predict',)
+            )
+
+        # recurrent and act related variables
+
+        self.halt_prob_thres = halt_prob_thres
+
+        self.max_recurrent_steps = max_recurrent_steps
+
+        # prepare maybe distributed
+
+        self.model, self.optim, self.dataloader = self.accelerator.prepare(self.model, self.optim, self.dataloader)
+
+    def forward(self):
+
+        for epoch in range_from_one(self.epochs):
+
+            for dataset_input, dataset_output in self.dataloader:
+
+                outputs, latents = self.model.get_initial()
+
+                for recurrent_step in range_from_one(self.max_recurrent_steps):
+
+                    loss, (main_loss, halt_loss), outputs, latents, pred, halt = self.model(dataset_input, outputs, latents, labels = dataset_output)
+
+                    self.accelerator.print(f'[{epoch} ({recurrent_step} / {self.max_recurrent_steps})] loss: {main_loss.mean().item():.3f} | halt loss: {halt_loss.mean().item():.3f}')
+
+                    self.accelerator.backward(loss)
+
+                    self.optim.step()
+                    self.optim.zero_grad()
+
+                    if self.accelerator.is_main_process:
+                        self.ema_model.update()
+
+                    # handle halting
+
+                    halt_mask = halt >= self.halt_prob_thres
+
+                    if not halt_mask.any():
+                        continue
+
+                    outputs = outputs[~halt_mask]
+                    latents = latents[~halt_mask]
+                    dataset_input = dataset_input[~halt_mask]
+                    dataset_output = dataset_output[~halt_mask]
+
+                    if is_empty(outputs):
+                        break
+
+        self.accelerator.print('complete')
+
+        if self.accelerator.is_main_process:
+            self.ema_model.copy_params_from_ema_to_model()
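The new trainer module defines `newtonschulz5` but does not call it anywhere in the diff above. The sketch below is not part of the package (the import path is assumed from the file layout shown); it illustrates what the quintic Newton–Schulz iteration does to the trailing two dimensions of a tensor: singular values are pushed toward 1, i.e. the matrices are approximately orthogonalized. Note the guard at the top of the function, which returns tensors of 3 or fewer dimensions unchanged.

```python
import torch
# assumed import path, based on the trainer.py layout shown in this diff
from tiny_recursive_model.trainer import newtonschulz5

x = torch.randn(2, 3, 8, 16)     # more than 3 dims, so the `t.ndim <= 3` early return is not taken
y = newtonschulz5(x, steps = 5)

# each trailing (8, 16) matrix is approximately orthogonalized:
# singular values start spread out and end up clustered roughly around 1
print(torch.linalg.svdvals(x[0, 0]))
print(torch.linalg.svdvals(y[0, 0]))
```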
tiny_recursive_model/trm.py
CHANGED
@@ -2,13 +2,11 @@ from __future__ import annotations
 from contextlib import nullcontext
 
 import torch
-from torch import nn
+from torch import nn, cat, arange, tensor
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList
-from torch.optim import AdamW
-from torch.utils.data import Dataset, DataLoader
 
-from einops import rearrange, repeat
+from einops import rearrange, repeat, reduce, pack, unpack
 from einops.layers.torch import Reduce, Rearrange
 
 # network related
@@ -16,10 +14,6 @@ from einops.layers.torch import Reduce, Rearrange
 from x_transformers import Encoder
 from tiny_recursive_model.mlp_mixer_1d import MLPMixer1D
 
-# ema - apparently greatly helped with results
-
-from ema_pytorch import EMA
-
 # helpers
 
 def exists(v):
@@ -28,9 +22,6 @@ def exists(v):
 def default(v, d):
     return v if exists(v) else d
 
-def range_from_one(n):
-    return range(1, n + 1)
-
 def is_empty(t):
     return t.numel() == 0
 
@@ -72,6 +63,10 @@ class TinyRecursiveModel(Module):
 
         self.halt_loss_weight = halt_loss_weight
 
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
     def refine_latent_then_output_once(
         self,
         inputs, # (b n d)
@@ -115,118 +110,112 @@ class TinyRecursiveModel(Module):
 
         return outputs, latents
 
-
+    @torch.no_grad()
+    def predict(
         self,
         seq,
-
-
-        labels = None
+        halt_prob_thres = 0.5,
+        num_deep_refinement_steps = 12
     ):
+        batch = seq.shape[0]
+
        inputs = self.input_embed(seq)
 
-        outputs, latents = self.
+        outputs, latents = self.get_initial()
 
-
+        # active batch indices, the step it exited at, and the final output predictions
 
-
+        active_batch_indices = arange(batch, device = self.device, dtype = torch.float32)
 
-
+        preds = []
+        exited_step_indices = []
+        exited_batch_indices = []
 
-
+        for i in range(num_deep_refinement_steps):
+            step = i + 1
+            is_last = step == num_deep_refinement_steps
 
-
-        return return_package
+            outputs, latents = self.deep_refinement(inputs, outputs, latents)
 
-
+            halt_prob = self.to_halt_pred(outputs)
 
-
+            should_halt = (halt_prob >= halt_prob_thres) | is_last
 
-
+            if not should_halt.any():
+                continue
 
-
+            # append to exited predictions
 
-
+            pred = self.to_pred(outputs[should_halt])
+            preds.append(pred)
 
-
-        losses = (loss, halt_loss)
+            # append the step at which early halted
 
-
+            exited_step_indices.extend([step] * should_halt.sum().item())
 
-#
+            # append indices for sorting back
 
-
-    def __init__(
-        self,
-        model: TinyRecursiveModel | Module,
-        dataset: Dataset,
-        optim_klass = AdamW,
-        learning_rate = 1e-4,
-        weight_decay = 1.,
-        batch_size = 16,
-        epochs = 2,
-        halt_prob_thres = 0.5,
-        max_recurrent_steps = 12,
-        ema_decay_rate = 0.999,
-        ema_update_model_with_ema_every = 10000
-    ):
-        super().__init__()
+            exited_batch_indices.append(active_batch_indices[should_halt])
 
-
-
+            if is_last:
+                continue
 
-
-        self.dataloader = dataloader = DataLoader(self.dataset, batch_size = self.batch_size, shuffle = True)
+            # ready for next round
 
-
-
-
-
-        )
+            inputs = inputs[~should_halt]
+            outputs = outputs[~should_halt]
+            latents = latents[~should_halt]
+            active_batch_indices = active_batch_indices[~should_halt]
 
-
+            if is_empty(outputs):
+                break
 
-
-
-            beta = ema_decay_rate,
-            update_model_with_ema_every = ema_update_model_with_ema_every
-        )
+        preds = cat(preds).argmax(dim = -1)
+        exited_step_indices = tensor(exited_step_indices)
 
-
+        exited_batch_indices = cat(exited_batch_indices)
+        sort_indices = exited_batch_indices.argsort(dim = -1)
 
-
+        return preds[sort_indices], exited_step_indices[sort_indices]
 
-    def forward(
+    def forward(
+        self,
+        seq,
+        outputs,
+        latents,
+        labels = None
+    ):
+        inputs = self.input_embed(seq)
 
-
+        outputs, latents = self.deep_refinement(inputs, outputs, latents)
 
-
+        pred = self.to_pred(outputs)
 
-
+        halt_prob = self.to_halt_pred(outputs)
 
-
+        outputs, latents = outputs.detach(), latents.detach()
 
-
+        return_package = (outputs, latents, pred, halt_prob)
 
-
+        if not exists(labels):
+            return return_package
 
-
+        # calculate loss if labels passed in
 
-
-
+        loss = F.cross_entropy(rearrange(pred, 'b n l -> b l n'), labels, reduction = 'none')
+        loss = reduce(loss, 'b ... -> b', 'mean')
 
-
+        is_all_correct = (pred.argmax(dim = -1) == labels).all(dim = -1)
 
-
+        halt_loss = F.binary_cross_entropy(halt_prob, is_all_correct.float(), reduction = 'none')
 
-
+        # total loss and loss breakdown
 
-
-
+        total_loss = (
+            loss +
+            halt_loss * self.halt_loss_weight
+        )
 
-
-        latents = latents[~halt_mask]
-        dataset_input = dataset_input[~halt_mask]
-        dataset_output = dataset_output[~halt_mask]
+        losses = (loss, halt_loss)
 
-
-            break
+        return (total_loss.mean(), losses, *return_package)
{tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tiny-recursive-model
-Version: 0.0.1
+Version: 0.0.3
 Summary: Tiny Recursive Model
 Project-URL: Homepage, https://pypi.org/project/tiny-recursive-model/
 Project-URL: Repository, https://github.com/lucidrains/tiny-recursive-model
@@ -55,6 +55,53 @@ Official repository is [here](https://github.com/SamsungSAILMontreal/TinyRecursi
 
 <img width="300" alt="trm-fig3" src="https://github.com/user-attachments/assets/bfe3dd2a-e859-492a-84d5-faf37339f534" />
 
+## Install
+
+```bash
+$ pip install tiny-recursive-model
+```
+
+## Usage
+
+```python
+import torch
+from tiny_recursive_model import TinyRecursiveModel, MLPMixer1D, Trainer
+
+trm = TinyRecursiveModel(
+    dim = 16,
+    num_tokens = 256,
+    network = MLPMixer1D(
+        dim = 16,
+        depth = 2,
+        seq_len = 256
+    ),
+)
+
+from torch.utils.data import Dataset
+class MockDataset(Dataset):
+    def __len__(self):
+        return 16
+
+    def __getitem__(self, idx):
+        inp = torch.randint(0, 256, (256,))
+        out = torch.randint(0, 256, (256,))
+        return inp, out
+
+trainer = Trainer(
+    trm,
+    MockDataset(),
+    epochs = 1,
+    batch_size = 16,
+    cpu = True
+)
+
+trainer()
+
+pred_answer, exit_indices = trm.predict(torch.randint(0, 256, (1, 256)), halt_prob_thres = 0.1)
+
+torch.save(trm.state_dict(), 'saved-trm.pt')
+```
+
 ## Citations
 
 ```bibtex
tiny_recursive_model-0.0.3.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+tiny_recursive_model/__init__.py,sha256=zuMcrofGu7DnvJM2Mb-O3tqBJF5q8L-8X8OTmq7_o5w,189
+tiny_recursive_model/mlp_mixer_1d.py,sha256=6ivDK9dgHdVl1axg2ayifJ7H5QI3hXptHnb6lfNrno0,1398
+tiny_recursive_model/trainer.py,sha256=6dQPmRaQZWI6527OvlOdgHKCFsufkZnjSHClRdHjs20,4218
+tiny_recursive_model/trm.py,sha256=Ep18uwvhWjHxGeyv42ruXLVc2F6TlZg2_CmeVVfYz7c,6001
+tiny_recursive_model-0.0.3.dist-info/METADATA,sha256=0enBPVOxRoReOf0hms_ZoAI4HHdMWUrrW4Ps0MTuQ9g,3943
+tiny_recursive_model-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+tiny_recursive_model-0.0.3.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+tiny_recursive_model-0.0.3.dist-info/RECORD,,
tiny_recursive_model-0.0.1.dist-info/RECORD
REMOVED
@@ -1,7 +0,0 @@
-tiny_recursive_model/__init__.py,sha256=UufV6--ilPn4quRWyhvaFRMKRfHvfLsAmF9RU-L31rM,77
-tiny_recursive_model/mlp_mixer_1d.py,sha256=6ivDK9dgHdVl1axg2ayifJ7H5QI3hXptHnb6lfNrno0,1398
-tiny_recursive_model/trm.py,sha256=YwzTod4CeeXlbAiM-TBB7rEEHWsxnPxavaGiVCTPMEM,6350
-tiny_recursive_model-0.0.1.dist-info/METADATA,sha256=G-cM7okuLAiOxhofXoRh2Ih-bwYifcA3AAhmYmKo-v4,3107
-tiny_recursive_model-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-tiny_recursive_model-0.0.1.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-tiny_recursive_model-0.0.1.dist-info/RECORD,,
{tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/WHEEL
RENAMED
File without changes
{tiny_recursive_model-0.0.1.dist-info → tiny_recursive_model-0.0.3.dist-info}/licenses/LICENSE
RENAMED
File without changes