titans-pytorch 0.0.18__tar.gz → 0.0.20__tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.18
+Version: 0.0.20
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.0.18"
+version = "0.0.20"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -1,7 +1,11 @@
 import torch
 import pytest
 
-def test_titans():
+@pytest.mark.parametrize('seq_len', (32, 1024, 77))
+def test_titans(
+    seq_len
+):
+
     from titans_pytorch import NeuralMemory
 
     mem = NeuralMemory(
@@ -9,7 +13,7 @@ def test_titans():
         chunk_size = 64,
     )
 
-    seq = torch.randn(2, 1024, 384)
+    seq = torch.randn(2, seq_len, 384)
     retrieved = mem(seq)
 
     assert seq.shape == retrieved.shape
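
The parametrization above is the substance of the test change: seq_len = 1024 preserves the original case, 32 falls below the chunk_size of 64 configured in the test, and 77 is not a multiple of it. A minimal sketch of the short-sequence case, assuming the constructor arguments shown in the test (dim = 384, chunk_size = 64):

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,        # assumed from the 384-dim sequences in the test
    chunk_size = 64
)

# a sequence shorter than chunk_size: with 0.0.20 the module returns the
# learned empty-memory embedding (see the NeuralMemory changes below) rather than zeros
short_seq = torch.randn(2, 32, 384)
retrieved = mem(short_seq)

assert retrieved.shape == short_seq.shape
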
@@ -17,7 +17,7 @@ from titans_pytorch.associative_scan import (
 )
 
 import einx
-from einops import rearrange, pack, unpack
+from einops import rearrange, repeat, pack, unpack
 from einops.layers.torch import Rearrange, Reduce
 
 """
@@ -55,6 +55,21 @@ def pack_one_with_inverse(t, pattern):
 
     return packed, inverse
 
+# softclamping gradients
+
+def softclamp_max(t, max_value):
+    half_max_value = max_value / 2
+    return ((t / half_max_value).tanh() * half_max_value) + half_max_value
+
+def softclamp_grad_norm(t, max_value):
+    t, inverse = pack_one_with_inverse(t, 'bn *')
+
+    norm = t.norm(dim = -1, keepdim = True)
+    clamped_norm = softclamp_max(norm, max_value)
+
+    t = t * (clamped_norm / norm)
+    return inverse(t)
+
 # classes
 
 class MemoryMLP(Module):
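
For orientation, a self-contained sketch of the same soft-clamping idea outside the package (function and variable names here are illustrative, not part of titans-pytorch): the norm of a flattened tensor is squashed through tanh so it saturates below max_value, and the tensor is rescaled to that clamped norm.

import torch

def softclamp_norm_sketch(t, max_value):
    # treat everything past the first dimension as one vector per row
    flat = t.reshape(t.shape[0], -1)
    norm = flat.norm(dim = -1, keepdim = True)

    # same formula as softclamp_max above: tanh saturates at 1,
    # so the clamped norm never exceeds max_value
    half = max_value / 2
    clamped_norm = (norm / half).tanh() * half + half

    flat = flat * (clamped_norm / norm)
    return flat.reshape_as(t)

grads = torch.randn(4, 64, 64) * 100.
clamped = softclamp_norm_sketch(grads, max_value = 10.)
print(clamped.reshape(4, -1).norm(dim = -1))  # every row norm sits just under 10
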
@@ -96,6 +111,7 @@ class NeuralMemory(Module):
         store_memory_loss_fn: Callable = default_loss_fn,
         pre_rmsnorm = True,
         post_rmsnorm = True,
+        max_grad_norm: float | None = None,
         use_accelerated_scan = False,
         default_mlp_kwargs: dict = dict(
             depth = 4
@@ -152,6 +168,11 @@ class NeuralMemory(Module):
         self.to_keys_values = LinearNoBias(dim, dim_inner * 2)
         self.store_memory_loss_fn = store_memory_loss_fn
 
+        # empty memory embed
+
+        self.empty_memory_embed = nn.Parameter(torch.zeros(dim))
+        nn.init.normal_(self.empty_memory_embed, std = 0.02)
+
         # learned adaptive learning rate and momentum
         # todo - explore mlp layerwise learned lr / momentum
 
@@ -167,6 +188,10 @@ class NeuralMemory(Module):
             Rearrange('b n h -> (b h) n')
         )
 
+        # allow for softclamp the gradient norms for storing memories
+
+        self.max_grad_norm = max_grad_norm
+
         # weight decay factor
 
         self.to_decay_factor = nn.Sequential(
@@ -187,6 +212,9 @@ class NeuralMemory(Module):
 
         return init_weights, init_momentum
 
+    def init_empty_memory_embed(self, batch, seq_len):
+        return repeat(self.empty_memory_embed, 'd -> b n d', b = batch, n = seq_len)
+
     def store_memories(
         self,
         seq,
@@ -239,6 +267,11 @@ class NeuralMemory(Module):
 
         grads = TensorDict(grads)
 
+        # maybe softclamp grad norm
+
+        if exists(self.max_grad_norm):
+            grads = grads.apply(lambda t: softclamp_grad_norm(t, self.max_grad_norm))
+
         # restore batch and sequence dimension
 
         grads = grads.apply(lambda t: rearrange(t, '(b n) ... -> b n ...', b = batch))
@@ -372,11 +405,12 @@ class NeuralMemory(Module):
 
         values = self.post_rmsnorm(values)
 
-        # restore
+        # restore, pad with empty memory embed
 
-        values = pad_at_dim(values, (chunk_size - 1, 0), dim = 1, value = 0.) # todo, used a learned null memory embedding instead of 0s for retrieving from empty neural memory
-        values = values[:, :-padding]
+        empty_memory_embeds = self.init_empty_memory_embed(values.shape[0], chunk_size - 1)
+        values = torch.cat((empty_memory_embeds, values), dim = -2)
 
+        values = values[:, :-padding]
         return values
 
     def forward(
@@ -389,7 +423,7 @@ class NeuralMemory(Module):
         batch, seq_len = seq.shape[:2]
 
         if seq_len < self.chunk_size:
-            return torch.zeros_like(seq)
+            return self.init_empty_memory_embed(batch, seq_len)
 
         if exists(past_state):
             past_state = tuple(TensorDict(d) for d in past_state)
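
Taken together, a usage sketch of what 0.0.20 adds; the dim and chunk_size values mirror the test above and the max_grad_norm value is an arbitrary choice for illustration:

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    max_grad_norm = 10.   # new: softclamp the gradient norms used for storing memories
)

seq = torch.randn(2, 1024, 384)
retrieved = mem(seq)               # same shape as seq, as the test asserts

short = torch.randn(2, 16, 384)    # shorter than chunk_size
empty = mem(short)                 # now the learned empty-memory embedding instead of zeros
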