titans-pytorch 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/PKG-INFO +2 -2
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/README.md +1 -1
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/pyproject.toml +1 -1
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/titans_pytorch/titans.py +46 -21
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/.gitignore +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/LICENSE +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/fig1.png +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/fig2.png +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.0.2 → titans_pytorch-0.0.4}/titans_pytorch/associative_scan.py +0 -0
--- titans_pytorch-0.0.2/PKG-INFO
+++ titans_pytorch-0.0.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.2
+Version: 0.0.4
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -49,7 +49,7 @@ Description-Content-Type: text/markdown

 ## Titans - Pytorch (wip)

-Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module.
+Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.

 ## Install

--- titans_pytorch-0.0.2/README.md
+++ titans_pytorch-0.0.4/README.md
@@ -4,7 +4,7 @@

 ## Titans - Pytorch (wip)

-Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module.
+Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.

 ## Install

--- titans_pytorch-0.0.2/titans_pytorch/titans.py
+++ titans_pytorch-0.0.4/titans_pytorch/titans.py
@@ -16,7 +16,7 @@ from titans_pytorch.associative_scan import (

 import einx
 from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
+from einops.layers.torch import Rearrange, Reduce

 """
 ein notation:
@@ -84,6 +84,7 @@ class NeuralMemory(Module):
     def __init__(
         self,
         dim,
+        chunk_size = 1,
         model: Module | None = None,
         store_memory_loss_fn: Callable = default_loss_fn
     ):
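A quick construction sketch of the new argument. Hedged: the top-level import and the `dim` value are assumptions for illustration, not taken from this diff; only `chunk_size` is the new parameter in this release, and the default memory model is whatever `NeuralMemory` builds when `model` is omitted.

```python
# minimal sketch, assuming NeuralMemory is exported at the package root;
# dim = 384 is an illustrative value
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 2  # adaptive step, momentum and weight decay are shared within each chunk of 2 tokens
)

assert mem.chunk_size == 2
```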
@@ -98,11 +99,15 @@ class NeuralMemory(Module):

         self.memory_model = model

+        # the chunk size within the paper where adaptive step, momentum, weight decay are shared
+
+        self.chunk_size = chunk_size
+
         # prepare function for per sample gradients from model above, using torch.func

         def forward_and_loss(params, inputs, target):
             pred = functional_call(self.memory_model, params, inputs)
-            loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k)
+            loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
             return loss

         self.per_sample_grad_and_value_fn = vmap(grad_and_value(forward_and_loss), in_dims = (None, 0, 0))
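The `vmap(grad_and_value(...), in_dims = (None, 0, 0))` line is the standard `torch.func` per-sample-gradient recipe: one parameter dict is broadcast over a batch of (input, target) pairs, yielding a separate gradient and loss per sample. A self-contained sketch of the same pattern, with a toy linear memory model and made-up shapes rather than the package's defaults:

```python
# illustrative sketch of per-sample gradients via torch.func,
# mirroring the pattern in NeuralMemory.__init__ (toy model, made-up shapes)
import torch
from torch import nn
from torch.func import functional_call, grad_and_value, vmap

memory_model = nn.Linear(16, 16)

def forward_and_loss(params, inputs, target):
    pred = functional_call(memory_model, params, inputs)
    return ((pred - target) ** 2).mean()  # simple mse, as in eq (12)

per_sample_grad_and_value_fn = vmap(grad_and_value(forward_and_loss), in_dims = (None, 0, 0))

params = dict(memory_model.named_parameters())
keys = torch.randn(8, 4, 16)    # 8 "samples", each a chunk of 4 tokens
values = torch.randn(8, 4, 16)

grads, losses = per_sample_grad_and_value_fn(params, keys, values)
print(grads['weight'].shape)  # torch.Size([8, 16, 16]) - one gradient per sample
print(losses.shape)           # torch.Size([8])
```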
@@ -119,9 +124,23 @@ class NeuralMemory(Module):
         # learned adaptive learning rate and momentum
         # todo - explore mlp layerwise learned lr / momentum

-        self.to_momentum =
-
-
+        self.to_momentum = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1)
+        )
+
+        self.to_adaptive_step = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1),
+            Rearrange('... 1 -> ...')
+        )
+
+        # weight decay factor
+
+        self.to_decay_factor = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1)
+        )

     def init_weights_and_momentum(self):
         params = TensorDict(dict(self.memory_model.named_parameters()))
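The `Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size)` layer in front of each projection is what makes the adaptive step, momentum and decay per-chunk rather than per-token: every `c` consecutive tokens are mean-pooled before the linear head. A standalone shape check with illustrative sizes (plain `nn.Linear(..., bias = False)` stands in for the package's `LinearNoBias`):

```python
# shape sketch of the per-chunk pooling used by to_momentum / to_adaptive_step / to_decay_factor
import torch
from torch import nn
from einops.layers.torch import Rearrange, Reduce

dim, chunk_size = 16, 4

to_adaptive_step = nn.Sequential(
    Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),  # mean-pool each chunk of 4 tokens
    nn.Linear(dim, 1, bias = False),
    Rearrange('... 1 -> ...')
)

seq = torch.randn(2, 32, dim)        # (batch, seq len, dim)
print(to_adaptive_step(seq).shape)   # torch.Size([2, 8]) - one adaptive step per chunk
```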
@@ -137,6 +156,16 @@ class NeuralMemory(Module):
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]]
     ):

+        # curtail sequence by multiple of the chunk size
+        # only a complete chunk of the sequence provides the memory for the next chunk
+
+        seq_len = seq.shape[-2]
+        round_down_seq_len = round_down_multiple(seq_len, self.chunk_size)
+
+        seq = seq[:, :round_down_seq_len]
+
+        # curr weights + past weights, in the case that the initial weights are learned
+
         curr_weights = TensorDict(dict(self.memory_model.named_parameters()))

         past_state = tuple(TensorDict(d) for d in past_state)
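`round_down_multiple` is referenced by the added code but not shown in this diff; a stand-in with the obvious semantics illustrates the curtailing, which simply drops any trailing partial chunk before the memory update:

```python
# hedged sketch of the curtailing step; round_down_multiple here is a stand-in
# with the obvious definition, not copied from the package
import torch

def round_down_multiple(seq_len, mult):
    return seq_len // mult * mult

chunk_size = 4
seq = torch.randn(1, 10, 16)     # 10 tokens, chunk size 4

round_down_seq_len = round_down_multiple(seq.shape[-2], chunk_size)
seq = seq[:, :round_down_seq_len]

print(seq.shape)                 # torch.Size([1, 8, 16]) - trailing partial chunk dropped
```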
@@ -148,16 +177,19 @@ class NeuralMemory(Module):

         batch = seq.shape[0]

-        adaptive_lr = self.to_adaptive_step(seq).tanh() * 0.5 +
+        adaptive_lr = self.to_adaptive_step(seq).tanh() * 0.5 + 0.5

         adaptive_momentum = self.to_momentum(seq).sigmoid()
         decay_factor = self.to_decay_factor(seq).sigmoid()

         # keys and values

-        seq = rearrange(seq, 'b n d -> (b n) d')
         keys, values = self.to_keys_values(seq).chunk(2, dim = -1)

+        # take care of chunking
+
+        keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = self.chunk_size) for t in (keys, values))
+
         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

         grads, aux_store_loss = self.per_sample_grad_and_value_fn(dict(curr_weights), keys, values)
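Note the change of granularity for the per-sample gradients: 0.0.2 flattened the sequence so every token was its own sample, while 0.0.4 makes each chunk of `c` tokens one sample of shape `(c, d)`. A shape-only sketch with illustrative sizes:

```python
# shape sketch of the new chunked layout fed to per_sample_grad_and_value_fn
import torch
from einops import rearrange

b, seq_len, d, c = 2, 12, 16, 4   # illustrative sizes

keys = torch.randn(b, seq_len, d)
values = torch.randn(b, seq_len, d)

# 0.0.2: every token was a separate sample      -> (b * seq_len, d)
# 0.0.4: every chunk of c tokens is one sample  -> (b * seq_len / c, c, d)
keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = c) for t in (keys, values))

print(keys.shape)                 # torch.Size([6, 4, 16])
```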
@@ -172,31 +204,24 @@ class NeuralMemory(Module):

         surprises = grads.apply(lambda t: einx.multiply('b n ..., b n -> b n ...', t, -adaptive_lr))

-        #
+        # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

         next_momentum = TensorDict()
+        updates = TensorDict()

         for param_name, surprise in surprises.items():
             surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')

-
-
-            momentum = inverse_pack(momentum)
-
-            next_momentum[param_name] = momentum
+            # derive momentum with associative scan - eq (10)

-
-
-            updates = TensorDict()
+            _, momentum = associative_scan(binary_operator, (adaptive_momentum, surprise)) # momentum is S / surprise in the paper

-
-            momentum, inverse_pack = pack_one_with_inverse(momentum, 'b n *')
+            # use associative scan again for learned forgetting (weight decay) - eq (13)

             _, update = associative_scan(binary_operator, (1. - decay_factor, momentum)) # momentum is S / surprise in the paper

-
-
-            updates[param_name] = update
+            updates[param_name] = inverse_pack(update)
+            next_momentum[param_name] = inverse_pack(momentum)

         # compute the next weight per batch
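Both `associative_scan(binary_operator, ...)` calls realize a first-order linear recurrence: the first builds momentum out of the already learning-rate-scaled, negated gradients (the code's "surprise", eq (10)), and the second applies the learned forgetting / weight decay on top (eq (13)). A hedged sequential reference for what each scan computes; the package does the same thing with a parallel scan, and `linear_recurrence` below is not its implementation:

```python
# hedged sketch: the recurrence h_t = gate_t * h_{t-1} + input_t that both scans realize
import torch

def linear_recurrence(gates, inputs):
    # plain sequential reference, h_0 = 0
    h = torch.zeros_like(inputs[:, 0])
    out = []
    for t in range(inputs.shape[1]):
        h = gates[:, t] * h + inputs[:, t]
        out.append(h)
    return torch.stack(out, dim = 1)

b, n = 2, 8                              # batch, number of chunks (illustrative)
surprise = torch.randn(b, n)             # -adaptive_lr * grad, flattened per parameter
adaptive_momentum = torch.rand(b, n)     # sigmoid output in (0, 1)
decay_factor = torch.rand(b, n)          # sigmoid output in (0, 1)

momentum = linear_recurrence(adaptive_momentum, surprise)    # eq (10): S_t = eta_t * S_{t-1} + u_t
update = linear_recurrence(1. - decay_factor, momentum)      # eq (13): learned weight decay / forgetting
```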