PyPI - titans-pytorch - Versions diffs - 0.1.21__tar.gz → 0.1.23__tar.gz - Mend

titans-pytorch 0.1.21 (tar.gz) → 0.1.23 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{titans_pytorch-0.1.21 → titans_pytorch-0.1.23}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.21
+Version: 0.1.23
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -78,8 +78,7 @@ from titans_pytorch import NeuralMemory
 mem = NeuralMemory(
     dim = 384,
-    chunk_size = 64,
-    pre_rmsnorm = True
+    chunk_size = 64 # set to smaller chunk size for better perf on smaller sequence lengths (but more memory usage)
 ).cuda()
 seq = torch.randn(2, 1024, 384).cuda()

{titans_pytorch-0.1.21 → titans_pytorch-0.1.23}/README.md RENAMED Viewed

@@ -24,8 +24,7 @@ from titans_pytorch import NeuralMemory
 mem = NeuralMemory(
     dim = 384,
-    chunk_size = 64,
-    pre_rmsnorm = True
+    chunk_size = 64 # set to smaller chunk size for better perf on smaller sequence lengths (but more memory usage)
 ).cuda()
 seq = torch.randn(2, 1024, 384).cuda()

{titans_pytorch-0.1.21 → titans_pytorch-0.1.23}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.1.21"
+version = "0.1.23"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

{titans_pytorch-0.1.21 → titans_pytorch-0.1.23}/tests/test_titans.py RENAMED Viewed

@@ -8,6 +8,9 @@ from titans_pytorch.mac_transformer import flex_attention, SegmentedAttention, M
 def exists(v):
     return v is not None
+def diff(x, y):
+    return (x - y).abs().amax()
 @pytest.mark.parametrize('seq_len', (32, 1024, 77))
 @pytest.mark.parametrize('silu', (False, True))
 @pytest.mark.parametrize('learned_mem_model_weights', (False, True))
@@ -133,6 +136,39 @@ def test_mac_sampling(sliding):
     assert torch.allclose(sampled, sampled_with_cache)
+@pytest.mark.parametrize('seq_len', (2, 64))
+def test_neural_mem_inference(
+    seq_len
+):
+    mem = NeuralMemory(
+        dim = 384,
+        chunk_size = 64,
+    )
+    seq = torch.randn(2, seq_len, 384)
+    parallel_retrieved = mem(seq)
+    assert seq.shape == parallel_retrieved.shape
+    mem_model_state = None
+    cache_store_seq = None
+    sequential_retrieved = []
+    for ind, token in enumerate(seq.unbind(dim = 1)):
+        one_retrieved, cache_store_seq, mem_model_state = mem.forward_inference(
+            token,
+            seq_index = ind,
+            cache_store_seq = cache_store_seq,
+            mem_model_state = mem_model_state
+        )
+        sequential_retrieved.append(one_retrieved)
+    sequential_retrieved = torch.cat(sequential_retrieved, dim = -2)
+    assert torch.allclose(parallel_retrieved, sequential_retrieved, atol = 1e-5)
 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('sliding', (True, False))
 def test_flex(
@@ -157,3 +193,23 @@ def test_flex(
     out_non_flex, _ = attn(seq, disable_flex_attn = True)
     assert torch.allclose(out_flex, out_non_flex, atol = 1e-5)
+def test_assoc_scan():
+    from titans_pytorch.titans import AssocScan
+    import torch.nn.functional as F
+    scan = AssocScan()
+    gates = torch.randn(2, 1024, 512).sigmoid()
+    inputs = torch.randn(2, 1024, 512)
+    output = scan(gates, inputs)
+    gates1, gates2 = gates[:, :512], gates[:, 512:]
+    inputs1, inputs2 = inputs[:, :512], inputs[:, 512:]
+    first_half = scan(gates1, inputs1)
+    second_half = scan(gates2, inputs2, prev = inputs2[:, -1])
+    assert torch.allclose(output[:, -1], second_half[:, -1], atol = 1e-5)

{titans_pytorch-0.1.21 → titans_pytorch-0.1.23}/titans_pytorch/titans.py RENAMED Viewed

@@ -41,6 +41,19 @@ def exists(v):
 def default(v, d):
     return v if exists(v) else d
+def xnor(x, y):
+    return not (x ^ y)
+def safe_cat(inputs, dim = -2):
+    inputs = tuple(filter(exists, inputs))
+    if len(inputs) == 0:
+        return None
+    elif len(inputs) == 1:
+        return inputs[0]
+    return cat(inputs, dim = dim)
 def identity(t):
     return t
@@ -311,7 +324,11 @@ class AssocScan(Module):
         super().__init__()
         self.use_accelerated = use_accelerated
-    def forward(self, gates, inputs):
+    def forward(self, gates, inputs, prev = None):
+        if exists(prev):
+            inputs, _ = pack([prev, inputs], 'b * d')
+            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
         if not self.use_accelerated:
             _, outputs = associative_scan(binary_operator, (gates, inputs))
@@ -366,6 +383,7 @@ class NeuralMemory(Module):
         pre_rmsnorm = True,
         post_rmsnorm = True,
         qk_rmsnorm = False,
+        accept_value_residual = False,
         learned_mem_model_weights = True,
         max_grad_norm: float | None = None,
         use_accelerated_scan = False,
@@ -399,7 +417,7 @@ class NeuralMemory(Module):
         self.heads = heads
-        self.split_heads = Rearrange('b n (h d) -> (b h) n d', h = heads)
+        self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)
         self.merge_heads = Rearrange('b h n d -> b n (h d)')
         self.combine_heads = LinearNoBias(dim_inner, dim) if heads > 1 else nn.Identity()
@@ -448,6 +466,14 @@ class NeuralMemory(Module):
         self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
         self.store_memory_loss_fn = store_memory_loss_fn
+        # value residual learning
+        self.learned_value_residual = Sequential(
+            LinearNoBias(dim, heads),
+            Rearrange('b n h -> b h n 1'),
+            nn.Sigmoid()
+        ) if accept_value_residual else None
         # empty memory embed
         self.empty_memory_embed = nn.Parameter(torch.zeros(dim))
@@ -529,8 +555,11 @@ class NeuralMemory(Module):
         seq,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]],
         return_aux_kv_loss = False,
-        chunk_size = None
+        chunk_size = None,
+        value_residual = None
     ):
+        assert xnor(exists(value_residual), exists(self.learned_value_residual))
         seq_len, chunk_size = seq.shape[-2], default(chunk_size, self.store_chunk_size)
         # handle edge case
@@ -585,9 +614,17 @@ class NeuralMemory(Module):
         keys = self.k_norm(keys)
+        # maybe value residual learning
+        orig_values = values
+        if exists(self.learned_value_residual):
+            mix = self.learned_value_residual(seq)
+            values = values.lerp(value_residual, mix)
         # take care of chunking
-        keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = chunk_size) for t in (keys, values))
+        keys, values = tuple(rearrange(t, 'b h (n c) d -> (b h n) c d', c = chunk_size) for t in (keys, values))
         adaptive_lr = rearrange(adaptive_lr, 'b (n c) -> (b n) c', c = chunk_size)
@@ -645,15 +682,17 @@ class NeuralMemory(Module):
         last_update = updates.apply(lambda t: t[:, -1])
+        output = (updates, orig_values)
         if not return_aux_kv_loss:
-            return updates
+            return output
-        return updates, aux_kv_recon_loss.mean()
+        return output, aux_kv_recon_loss.mean()
     def retrieve_memories(
         self,
         seq,
-        past_weights: dict[str, Tensor] | None = None,
+        past_weights: dict[str, Tensor],
         chunk_size = None
     ):
         chunk_size = default(chunk_size, self.retrieve_chunk_size)
@@ -675,13 +714,7 @@ class NeuralMemory(Module):
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
-        curr_weights = TensorDict(dict(self.memory_model.named_parameters()))
-        if exists(past_weights):
-            past_weights = TensorDict(past_weights)
-            assert past_weights.keys() == curr_weights.keys()
-            curr_weights = curr_weights + past_weights
+        curr_weights = TensorDict(past_weights)
         # sequence Float['b n d'] to queries
@@ -698,7 +731,7 @@ class NeuralMemory(Module):
         # fetch values from memory model
         curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
-        queries = rearrange(queries, 'b (n c) d -> (b n) c d', c = chunk_size)
+        queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)
         # forward functional call
@@ -728,6 +761,56 @@ class NeuralMemory(Module):
         return values[:, :seq_len]
+    def forward_inference(
+        self,
+        token: Tensor,
+        seq_index = None, # the index of the token in the sequence, starts at 0
+        mem_model_state = None,
+        cache_store_seq = None
+    ):
+        seq_index = default(seq_index, 0)
+        curr_seq_len = seq_index + 1
+        batch = token.shape[0]
+        if token.ndim == 2:
+            token = rearrange(token, 'b d -> b 1 d')
+        # init memory model if needed
+        if not exists(mem_model_state):
+            mem_model_state = self.init_weights_and_momentum()
+        # increment the sequence cache which is at most the chunk size
+        cache_store_seq = safe_cat((cache_store_seq, token), dim = -2)
+        # early return empty memory, when no memories are stored for steps < first chunk size
+        if curr_seq_len < self.chunk_size:
+            empty_mem = self.init_empty_memory_embed(batch, 1)
+            return empty_mem, cache_store_seq, mem_model_state
+        # store if storage sequence cache hits the chunk size
+        store_seq_cache_len = cache_store_seq.shape[-2]
+        if store_seq_cache_len == self.chunk_size:
+            updates, _ = self.store_memories(cache_store_seq, mem_model_state)
+            past_weights, past_momentum = mem_model_state
+            mem_model_state = (past_weights + updates, past_momentum)
+            cache_store_seq = None
+        # retrieve
+        past_weights, _ = mem_model_state
+        retrieved = self.retrieve_memories(token, past_weights, chunk_size = 1)
+        return retrieved, cache_store_seq, mem_model_state
     def forward(
         self,
         seq,
@@ -735,7 +818,8 @@ class NeuralMemory(Module):
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
-        store_chunk_size = None
+        store_chunk_size = None,
+        return_values = False
     ):
         batch, seq_len = seq.shape[:2]
@@ -756,13 +840,18 @@ class NeuralMemory(Module):
         store_seq = default(store_seq, seq)
         store_chunk_size = default(store_chunk_size, chunk_size)
-        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, chunk_size = store_chunk_size, return_aux_kv_loss = True)
+        (updates, values), aux_kv_recon_loss = self.store_memories(store_seq, past_state, chunk_size = store_chunk_size, return_aux_kv_loss = True)
         past_weights, _ = past_state
         retrieved = self.retrieve_memories(seq, past_weights + updates, chunk_size = chunk_size)
+        output = retrieved
+        if return_values:
+            output = (retrieved, values)
         if not return_aux_kv_loss:
-            return retrieved
+            return output
-        return retrieved, aux_kv_recon_loss
+        return output, aux_kv_recon_loss