titans-pytorch 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- titans_pytorch/titans.py +72 -25
- {titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/METADATA +6 -3
- titans_pytorch-0.0.5.dist-info/RECORD +7 -0
- titans_pytorch-0.0.3.dist-info/RECORD +0 -7
- {titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/licenses/LICENSE +0 -0
titans_pytorch/titans.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import math
 from functools import partial

 import torch
@@ -11,12 +12,13 @@ from tensordict import TensorDict

 from titans_pytorch.associative_scan import (
     associative_scan,
-    binary_operator
+    binary_operator,
+    pad_at_dim
 )

 import einx
 from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
+from einops.layers.torch import Rearrange, Reduce

 """
 ein notation:
@@ -41,6 +43,9 @@ def default(v, d):
 def round_down_multiple(seq, mult):
     return seq // mult * mult

+def round_up_multiple(seq, mult):
+    return math.ceil(seq / mult) * mult
+
 def pack_one_with_inverse(t, pattern):
     packed, packed_shape = pack([t], pattern)

@@ -84,6 +89,7 @@ class NeuralMemory(Module):
     def __init__(
         self,
         dim,
+        chunk_size = 1,
         model: Module | None = None,
         store_memory_loss_fn: Callable = default_loss_fn
     ):
@@ -98,11 +104,15 @@ class NeuralMemory(Module):

         self.memory_model = model

+        # the chunk size within the paper where adaptive step, momentum, weight decay are shared
+
+        self.chunk_size = chunk_size
+
         # prepare function for per sample gradients from model above, using torch.func

         def forward_and_loss(params, inputs, target):
             pred = functional_call(self.memory_model, params, inputs)
-            loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k)
+            loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
             return loss

         self.per_sample_grad_and_value_fn = vmap(grad_and_value(forward_and_loss), in_dims = (None, 0, 0))
@@ -119,9 +129,23 @@ class NeuralMemory(Module):
         # learned adaptive learning rate and momentum
         # todo - explore mlp layerwise learned lr / momentum

-        self.to_momentum =
-
-
+        self.to_momentum = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1)
+        )
+
+        self.to_adaptive_step = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1),
+            Rearrange('... 1 -> ...')
+        )
+
+        # weight decay factor
+
+        self.to_decay_factor = nn.Sequential(
+            Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+            LinearNoBias(dim, 1)
+        )

     def init_weights_and_momentum(self):
         params = TensorDict(dict(self.memory_model.named_parameters()))
@@ -137,6 +161,16 @@ class NeuralMemory(Module):
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]]
     ):

+        # curtail sequence by multiple of the chunk size
+        # only a complete chunk of the sequence provides the memory for the next chunk
+
+        seq_len = seq.shape[-2]
+        round_down_seq_len = round_down_multiple(seq_len, self.chunk_size)
+
+        seq = seq[:, :round_down_seq_len]
+
+        # curr weights + past weights, in the case that the initial weights are learned
+
         curr_weights = TensorDict(dict(self.memory_model.named_parameters()))

         past_state = tuple(TensorDict(d) for d in past_state)
@@ -148,16 +182,19 @@ class NeuralMemory(Module):

         batch = seq.shape[0]

-        adaptive_lr = self.to_adaptive_step(seq).tanh() * 0.5 + 0.5
+        adaptive_lr = self.to_adaptive_step(seq).tanh() * 0.5 + 0.5

         adaptive_momentum = self.to_momentum(seq).sigmoid()
         decay_factor = self.to_decay_factor(seq).sigmoid()

         # keys and values

-        seq = rearrange(seq, 'b n d -> (b n) d')
         keys, values = self.to_keys_values(seq).chunk(2, dim = -1)

+        # take care of chunking
+
+        keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = self.chunk_size) for t in (keys, values))
+
         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

         grads, aux_store_loss = self.per_sample_grad_and_value_fn(dict(curr_weights), keys, values)
@@ -172,31 +209,24 @@ class NeuralMemory(Module):

         surprises = grads.apply(lambda t: einx.multiply('b n ..., b n -> b n ...', t, -adaptive_lr))

-        #
+        # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

         next_momentum = TensorDict()
+        updates = TensorDict()

         for param_name, surprise in surprises.items():
             surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')

-
-
-            momentum = inverse_pack(momentum)
-
-            next_momentum[param_name] = momentum
+            # derive momentum with associative scan - eq (10)

-
-
-            updates = TensorDict()
+            _, momentum = associative_scan(binary_operator, (adaptive_momentum, surprise)) # momentum is S / surprise in the paper

-
-            momentum, inverse_pack = pack_one_with_inverse(momentum, 'b n *')
+            # use associative scan again for learned forgetting (weight decay) - eq (13)

             _, update = associative_scan(binary_operator, (1. - decay_factor, momentum)) # momentum is S / surprise in the paper

-
-
-            updates[param_name] = update
+            updates[param_name] = inverse_pack(update)
+            next_momentum[param_name] = inverse_pack(momentum)

         # compute the next weight per batch

@@ -211,7 +241,19 @@ class NeuralMemory(Module):
         seq,
         past_weights: dict[str, Tensor] | None = None,
     ):
-
+        chunk_size = self.chunk_size
+        batch, seq_len = seq.shape[:2]
+
+        assert seq_len >= chunk_size
+
+        seq = seq[:, (chunk_size - 1):]
+        curtailed_seq_len = seq.shape[-2]
+
+        next_seq_len = round_up_multiple(curtailed_seq_len, chunk_size)
+
+        padding = next_seq_len - curtailed_seq_len
+
+        seq = pad_at_dim(seq, (0, padding), dim = 1)

         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
@@ -231,7 +273,7 @@ class NeuralMemory(Module):
         # fetch values from memory model

         curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
-        queries = rearrange(queries, 'b n d -> (b n) d')
+        queries = rearrange(queries, 'b (n c) d -> (b n) c d', c = chunk_size)

         # forward functional call

@@ -239,7 +281,12 @@ class NeuralMemory(Module):

         # reconstitute batch dimension

-        values = rearrange(values, '(b n) d -> b n d', b = batch)
+        values = rearrange(values, '(b n) c d -> b (n c) d', b = batch)
+
+        # restore
+
+        values = pad_at_dim(values, (chunk_size - 1, 0), dim = 1, value = 0.) # todo, used a learned null memory embedding instead of 0s for retrieving from empty neural memory
+        values = values[:, :-padding]

         return values

```
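The substantive change in 0.0.5 is chunked processing: the adaptive step, momentum, and weight-decay signals are now shared within chunks of `chunk_size` tokens (mean-pooled per chunk by the new `Reduce` layers), the store path keeps only complete chunks, and the retrieve path shifts by `chunk_size - 1` and pads the sequence up to a chunk multiple so each chunk reads memory written by earlier chunks. The snippet below is a minimal sketch of that shape bookkeeping with toy sizes; it mirrors the helpers and einops patterns in the diff but is illustrative, not code taken from the package.

```python
import math

import torch
import torch.nn.functional as F
from einops import rearrange, reduce

def round_down_multiple(n, mult):
    return n // mult * mult

def round_up_multiple(n, mult):
    return math.ceil(n / mult) * mult

batch, seq_len, dim, chunk_size = 2, 65, 32, 4
seq = torch.randn(batch, seq_len, dim)

# store path: only complete chunks are written into the neural memory
stored_len = round_down_multiple(seq_len, chunk_size)            # 64
stored = seq[:, :stored_len]

# what Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size) computes:
# one adaptive step / momentum / decay value per chunk instead of per token
pooled = reduce(stored, 'b (n c) d -> b n d', 'mean', c = chunk_size)
print(pooled.shape)   # torch.Size([2, 16, 32])

# keys / values are regrouped so every chunk becomes its own "sample"
# for the per-sample gradient call
chunked = rearrange(stored, 'b (n c) d -> (b n) c d', c = chunk_size)
print(chunked.shape)  # torch.Size([32, 4, 32])

# retrieve path: shift by chunk_size - 1 so a chunk queries memory formed by
# preceding tokens, then right-pad the sequence up to a chunk multiple
queries = seq[:, (chunk_size - 1):]
padding = round_up_multiple(queries.shape[1], chunk_size) - queries.shape[1]
queries = F.pad(queries, (0, 0, 0, padding))  # pad along the sequence dimension
print(queries.shape)  # torch.Size([2, 64, 32])
```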
{titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.3
+Version: 0.0.5
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -49,7 +49,7 @@ Description-Content-Type: text/markdown

 ## Titans - Pytorch (wip)

-Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module.
+Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.

 ## Install

@@ -65,7 +65,10 @@ from titans_pytorch import NeuralMemory

 x = torch.randn(2, 64, 32)

-mem = NeuralMemory(
+mem = NeuralMemory(
+    dim = 32,
+    chunk_size = 2
+)

 out = mem(x)

```
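For reference, a slightly expanded version of the README usage above, using the shapes from the example in the diff. The shape comment reflects what the retrieval path in `titans.py` produces for these particular sizes (pad, chunk, then trim back to the input length); it is an observation from the diff, not a documented guarantee of the API.

```python
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 32,
    chunk_size = 2   # new in 0.0.5: tokens within a chunk share step / momentum / decay
)

x = torch.randn(2, 64, 32)
out = mem(x)

print(out.shape)  # torch.Size([2, 64, 32]) for these sizes
```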
titans_pytorch-0.0.5.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,7 @@
+titans_pytorch/__init__.py,sha256=QKuJPCOJCdgtaPeKoHEkYkiQe65_LV9_8-cIMbBPU30,55
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/titans.py,sha256=3Mewuysj0g7iAlfjdqMlJhn9-pKJuOerB1frQmQYXuc,9428
+titans_pytorch-0.0.5.dist-info/METADATA,sha256=f1DgCKZz9nqNfZOrqbOpyn-yEx2v5M5zgGIW0Zeu84I,3032
+titans_pytorch-0.0.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.0.5.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.0.5.dist-info/RECORD,,
```
titans_pytorch-0.0.3.dist-info/RECORD
REMOVED

```diff
@@ -1,7 +0,0 @@
-titans_pytorch/__init__.py,sha256=QKuJPCOJCdgtaPeKoHEkYkiQe65_LV9_8-cIMbBPU30,55
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/titans.py,sha256=0Mh9LJv5hLVbB2MvRJX5QanAeTtU9LAuj6YOQUwsyUQ,7813
-titans_pytorch-0.0.3.dist-info/METADATA,sha256=AXfDl_MTIu24VRagi_rgiH8rHXFBU5euwSD6DMwLgsg,2968
-titans_pytorch-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.0.3.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.0.3.dist-info/RECORD,,
```
{titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/WHEEL
File without changes

{titans_pytorch-0.0.3.dist-info → titans_pytorch-0.0.5.dist-info}/licenses/LICENSE
File without changes