titans-pytorch 0.1.22__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.22
+Version: 0.1.26
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -78,7 +78,7 @@ from titans_pytorch import NeuralMemory
 
 mem = NeuralMemory(
     dim = 384,
-    chunk_size = 64
+    chunk_size = 64 # set to smaller chunk size for better perf on smaller sequence lengths (but more memory usage)
 ).cuda()
 
 seq = torch.randn(2, 1024, 384).cuda()
@@ -24,7 +24,7 @@ from titans_pytorch import NeuralMemory
 
 mem = NeuralMemory(
     dim = 384,
-    chunk_size = 64
+    chunk_size = 64 # set to smaller chunk size for better perf on smaller sequence lengths (but more memory usage)
 ).cuda()
 
 seq = torch.randn(2, 1024, 384).cuda()
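
The comment added in the two identical README hunks above describes a tradeoff: a smaller chunk_size folds tokens into the neural memory weights more often, which helps on short sequences at the cost of more update steps and memory. A minimal sketch of dialing it down (the value 16 is illustrative, not from this release):

import torch
from titans_pytorch import NeuralMemory

# smaller chunks -> more frequent neural memory weight updates,
# better suited to short sequences, but more compute / memory per token
mem = NeuralMemory(
    dim = 384,
    chunk_size = 16 # illustrative; the README example keeps 64
).cuda()

seq = torch.randn(2, 1024, 384).cuda()
retrieved = mem(seq)

assert retrieved.shape == seq.shape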
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.1.22"
+version = "0.1.26"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -1,3 +1,5 @@
+from contextlib import contextmanager
+
 import torch
 from torch import nn
 
@@ -5,12 +7,25 @@ import pytest
 from titans_pytorch import NeuralMemory
 from titans_pytorch.mac_transformer import flex_attention, SegmentedAttention, MemoryAsContextTransformer
 
+# functions
+
 def exists(v):
     return v is not None
 
+def diff(x, y):
+    return (x - y).abs().amax()
+
+@contextmanager
+def torch_default_dtype(dtype):
+    prev_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(prev_dtype)
+
+# main test
+
 @pytest.mark.parametrize('seq_len', (32, 1024, 77))
 @pytest.mark.parametrize('silu', (False, True))
-@pytest.mark.parametrize('learned_mem_model_weights', (False, True))
 @pytest.mark.parametrize('attn_pool_chunks', (False, True))
 @pytest.mark.parametrize('momentum', (False, True))
 @pytest.mark.parametrize('qk_rmsnorm', (False, True))
@@ -19,7 +34,6 @@ def exists(v):
 def test_titans(
     seq_len,
     silu,
-    learned_mem_model_weights,
     attn_pool_chunks,
     momentum,
     qk_rmsnorm,
@@ -35,7 +49,6 @@ def test_titans(
         momentum = momentum,
         qk_rmsnorm = qk_rmsnorm,
         per_parameter_lr_modulation = per_parameter_lr_modulation,
-        learned_mem_model_weights = learned_mem_model_weights
     )
 
     seq = torch.randn(2, seq_len, 384)
@@ -111,7 +124,11 @@ def test_mac(
     assert logits.shape == (1, seq_len, 256)
 
 @pytest.mark.parametrize('sliding', (False, True))
-def test_mac_sampling(sliding):
+@pytest.mark.parametrize('mem_layers', ((), None, (4,)))
+def test_mac_sampling(
+    sliding,
+    mem_layers
+):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
         dim = 256,
@@ -120,7 +137,7 @@ def test_mac_sampling(sliding):
         num_persist_mem_tokens = 4,
         num_longterm_mem_tokens = 0,
         sliding_window_attn = sliding,
-        neural_memory_layers = (),
+        neural_memory_layers = mem_layers,
         neural_mem_gate_attn_output = False
     )
@@ -133,6 +150,38 @@ def test_mac_sampling(sliding):
 
     assert torch.allclose(sampled, sampled_with_cache)
 
+@pytest.mark.parametrize('seq_len', (2, 64, 256))
+@torch_default_dtype(torch.float64)
+def test_neural_mem_inference(
+    seq_len
+):
+    mem = NeuralMemory(
+        dim = 384,
+        chunk_size = 64,
+    )
+
+    seq = torch.randn(2, seq_len, 384)
+    parallel_retrieved = mem(seq)
+
+    assert seq.shape == parallel_retrieved.shape
+
+    state = None
+    sequential_retrieved = []
+
+    for ind, token in enumerate(seq.unbind(dim = 1)):
+
+        one_retrieved, state = mem.forward_inference(
+            token,
+            seq_index = ind,
+            state = state,
+        )
+
+        sequential_retrieved.append(one_retrieved)
+
+    sequential_retrieved = torch.cat(sequential_retrieved, dim = -2)
+
+    assert torch.allclose(parallel_retrieved, sequential_retrieved, atol = 1e-6)
+
 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('sliding', (True, False))
 def test_flex(
@@ -157,3 +206,28 @@ def test_flex(
     out_non_flex, _ = attn(seq, disable_flex_attn = True)
 
     assert torch.allclose(out_flex, out_non_flex, atol = 1e-5)
+
+@torch_default_dtype(torch.float64)
+def test_assoc_scan():
+    from titans_pytorch.titans import AssocScan
+    torch.set_default_dtype(torch.float64)
+
+    scan = AssocScan()
+
+    seq_len = 128
+    mid_point = seq_len // 2
+
+    gates = torch.randn(2, seq_len, 512).sigmoid()
+    inputs = torch.randn(2, seq_len, 512)
+
+    output = scan(gates, inputs)
+
+    gates1, gates2 = gates[:, :mid_point], gates[:, mid_point:]
+    inputs1, inputs2 = inputs[:, :mid_point], inputs[:, mid_point:]
+
+    first_half = scan(gates1, inputs1)
+
+    second_half = scan(gates2, inputs2, prev = first_half[:, -1])
+    assert second_half.shape == inputs2.shape
+
+    assert torch.allclose(output[:, -1], second_half[:, -1], atol = 1e-6)
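
A note on what test_assoc_scan (and the new prev argument) exercises: AssocScan parallelizes the first-order linear recurrence out[t] = gates[t] * out[t-1] + inputs[t], and prev seeds out[-1] so a sequence can be scanned in segments with identical results. A naive sequential reference (my sketch, assuming that standard recurrence; not part of the package):

import torch

def assoc_scan_reference(gates, inputs, prev = None):
    # out[t] = gates[t] * out[t-1] + inputs[t], with out[-1] = prev (or zeros)
    out = torch.zeros_like(inputs[:, 0]) if prev is None else prev
    outs = []
    for t in range(inputs.shape[1]):
        out = gates[:, t] * out + inputs[:, t]
        outs.append(out)
    return torch.stack(outs, dim = 1)

gates = torch.randn(2, 128, 512).sigmoid()
inputs = torch.randn(2, 128, 512)

full = assoc_scan_reference(gates, inputs)

# continuing from the midpoint state reproduces the full scan,
# mirroring the split / continue assertions in test_assoc_scan
second = assoc_scan_reference(gates[:, 64:], inputs[:, 64:], prev = full[:, 63])
assert torch.allclose(full[:, 64:], second)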
@@ -510,10 +510,7 @@ class MemoryAsContextTransformer(Module):
 
         layers = tuple(range(1, depth + 1))
 
-        if not exists(neural_memory_layers):
-            neural_memory_layers = layers if has_longterm_mems else ()
-
-        assert not (num_longterm_mem_tokens > 0 and len(neural_memory_layers) == 0), 'empty `neural_memory_layers` when longterm memory tokens are present'
+        neural_memory_layers = default(neural_memory_layers, layers)
 
         # mem, attn, and feedforward layers
 
@@ -535,9 +532,10 @@ class MemoryAsContextTransformer(Module):
             )
 
             mem = None
+            mem_hyper_conn = None
 
             if layer in neural_memory_layers:
-                assert has_longterm_mems, '`num_longterm_mem_tokens` must be greater than 0'
+                mem_hyper_conn = init_hyper_conn(dim = dim, add_branch_out_to_residual = not neural_mem_gate_attn_output)
 
                 mem = NeuralMemory(
                     dim = dim,
@@ -545,10 +543,12 @@ class MemoryAsContextTransformer(Module):
                     **neural_memory_kwargs
                 )
 
+
             ff = FeedForward(dim = dim, mult = ff_mult)
 
             self.layers.append(ModuleList([
-                init_hyper_conn(dim = dim, branch = mem, add_branch_out_to_residual = not neural_mem_gate_attn_output) if exists(mem) else None,
+                mem_hyper_conn,
+                mem,
                 init_hyper_conn(dim = dim, branch = attn),
                 init_hyper_conn(dim = dim, branch = ff)
             ]))
@@ -691,8 +691,18 @@ class MemoryAsContextTransformer(Module):
         # kv caching
 
         is_inferencing = exists(cache)
-        cache = iter(default(cache, []))
+        assert not (is_inferencing and self.num_longterm_mem_tokens > 0)
+
+        if not exists(cache):
+            cache = (None, None)
+
+        kv_caches, neural_mem_caches = cache
+
+        kv_caches = iter(default(kv_caches, []))
+        neural_mem_caches = iter(default(neural_mem_caches, []))
+
         next_kv_caches = []
+        next_neural_mem_caches = []
 
         # value residual
 
@@ -711,21 +721,37 @@ class MemoryAsContextTransformer(Module):
 
         x = self.expand_streams(x)
 
-        for mem, attn, ff in self.layers:
+        for mem_hyper_conn, mem, attn, ff in self.layers:
 
             retrieved = None
             attn_out_gates = None
+            next_neural_mem_cache = None
 
             # maybe neural memory
 
             if exists(mem):
-                retrieved, mem_kv_aux_loss = mem(x, return_aux_kv_loss = True)
-                kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
+
+                mem_input, add_residual = mem_hyper_conn(x)
+
+                if not is_inferencing:
+                    retrieved, mem_kv_aux_loss = mem(
+                        mem_input,
+                        return_aux_kv_loss = True
+                    )
+
+                    kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
+
+                else:
+                    retrieved, next_neural_mem_cache = mem.forward_inference(
+                        mem_input,
+                        seq_index = seq_len - 1,
+                        state = next(neural_mem_caches, None)
+                    )
 
                 if self.gate_attn_output:
                     attn_out_gates = retrieved.sigmoid()
                 else:
-                    seq = retrieved
+                    x = add_residual(retrieved)
 
             # attention
 
@@ -735,12 +761,15 @@ class MemoryAsContextTransformer(Module):
                 disable_flex_attn = disable_flex_attn,
                 flex_attn_fn = flex_attn_fn,
                 output_gating = attn_out_gates,
-                cache = next(cache, None)
+                cache = next(kv_caches, None)
             )
 
             value_residual = default(value_residual, values)
 
+            # caches
+
             next_kv_caches.append(next_kv_cache)
+            next_neural_mem_caches.append(next_neural_mem_cache)
 
             # feedforward
 
@@ -775,7 +804,7 @@ class MemoryAsContextTransformer(Module):
             if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
                 next_kv_caches = next_kv_caches[..., 0:0, :]
 
-            return logits, next_kv_caches
+            return logits, (next_kv_caches, next_neural_mem_caches)
 
         ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
 
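
With this change the cache threaded through MemoryAsContextTransformer.forward during decoding is a 2-tuple of per-layer attention kv caches and per-layer neural memory states, rather than a flat list of kv caches. A hypothetical greedy decode loop (the call convention is assumed from the return statement above, not confirmed by this diff; `transformer` is one constructed as in test_mac_sampling):

import torch

prompt = torch.randint(0, 256, (1, 4))

out = prompt
cache = None # becomes (kv_caches, neural_mem_caches) after the first forward

for _ in range(32):
    logits, cache = transformer(out, cache = cache) # assumed signature
    next_token = logits[:, -1].argmax(dim = -1, keepdim = True)
    out = torch.cat((out, next_token), dim = -1)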
@@ -44,6 +44,16 @@ def default(v, d):
 def xnor(x, y):
     return not (x ^ y)
 
+def safe_cat(inputs, dim = -2):
+    inputs = tuple(filter(exists, inputs))
+
+    if len(inputs) == 0:
+        return None
+    elif len(inputs) == 1:
+        return inputs[0]
+
+    return cat(inputs, dim = dim)
+
 def identity(t):
     return t
 
@@ -314,11 +324,26 @@ class AssocScan(Module):
         super().__init__()
         self.use_accelerated = use_accelerated
 
-    def forward(self, gates, inputs):
+    def forward(
+        self,
+        gates,
+        inputs,
+        prev = None,
+        remove_prev = None
+    ):
+        remove_prev = default(remove_prev, exists(prev))
+
+        if exists(prev):
+            inputs, _ = pack([prev, inputs], 'b * d')
+            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
 
         if not self.use_accelerated:
-            _, outputs = associative_scan(binary_operator, (gates, inputs))
-            return outputs
+            _, out = associative_scan(binary_operator, (gates, inputs))
+
+            if remove_prev:
+                out = out[:, 1:]
+
+            return out
 
         from accelerated_scan.triton import scan as triton_scan
         from accelerated_scan.warp import scan as warp_scan
@@ -341,7 +366,12 @@ class AssocScan(Module):
             outputs = rearrange(outputs, 'b d n -> b n d')
             return outputs
 
-        return accelerate_scan_fn(gates, inputs)
+        out = accelerate_scan_fn(gates, inputs)
+
+        if remove_prev:
+            out = out[:, 1:]
+
+        return out
 
 # main neural memory
 
@@ -370,7 +400,6 @@ class NeuralMemory(Module):
         post_rmsnorm = True,
         qk_rmsnorm = False,
         accept_value_residual = False,
-        learned_mem_model_weights = True,
         max_grad_norm: float | None = None,
         use_accelerated_scan = False,
         activation: Module | None = None,
@@ -418,9 +447,6 @@ class NeuralMemory(Module):
         if not exists(model):
             model = MemoryMLP(dim_head, **default_model_kwargs)
 
-        if not learned_mem_model_weights:
-            model.requires_grad_(False)
-
         assert not exists(next(model.buffers(), None)), 'model cannot have buffers for now'
 
         # the memory is the weights of the model
@@ -522,16 +548,9 @@ class NeuralMemory(Module):
 
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
 
-    def init_weights_and_momentum(self, zero_weights = False):
-        params = TensorDict(dict(self.memory_model.named_parameters()))
-
-        init_weights = params
-        init_momentum = params.clone().zero_()
-
-        if zero_weights:
-            init_weights = params.clone().zero_()
-
-        return init_weights, init_momentum
+    def init_weights(self):
+        weights = TensorDict(dict(self.memory_model.named_parameters()))
+        return weights
 
     def init_empty_memory_embed(self, batch, seq_len):
         return repeat(self.empty_memory_embed, 'd -> b n d', b = batch, n = seq_len)
@@ -539,7 +558,8 @@ class NeuralMemory(Module):
     def store_memories(
         self,
         seq,
-        past_state: tuple[dict[str, Tensor], dict[str, Tensor]],
+        weights: dict[str, Tensor],
+        past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
         value_residual = None
@@ -551,8 +571,7 @@ class NeuralMemory(Module):
         # handle edge case
 
         if seq_len < chunk_size:
-            past_weight, _ = past_state
-            return TensorDict(past_weight).clone().zero_(), self.zero
+            return TensorDict(weights).clone().zero_(), self.zero
 
         seq = self.store_norm(seq)
 
@@ -563,10 +582,9 @@ class NeuralMemory(Module):
 
         seq = seq[:, :round_down_seq_len]
 
-        # get the weights of the memory network
+        # weights of the memory network
 
-        past_state = tuple(TensorDict(d) for d in past_state)
-        curr_weights, past_momentum = past_state
+        weights = TensorDict(weights)
 
         # derive learned hparams for optimization of memory network
 
@@ -616,7 +634,7 @@ class NeuralMemory(Module):
 
         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)
 
-        grads, aux_kv_recon_loss = self.per_sample_grad_fn(dict(curr_weights), keys, adaptive_lr, values)
+        grads, aux_kv_recon_loss = self.per_sample_grad_fn(dict(weights), keys, adaptive_lr, values)
 
         grads = TensorDict(grads)
 
@@ -638,12 +656,23 @@ class NeuralMemory(Module):
 
         surprises = grads.apply(lambda t: -t)
 
+        # past states
+
+        if not exists(past_state):
+            empty_dict = {key: None for key in weights.keys()}
+            past_state = (empty_dict, empty_dict)
+
+        past_last_update, past_last_momentum = past_state
+
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
 
         next_momentum = TensorDict() if has_momentum else None
         updates = TensorDict()
 
-        for param_name, surprise in surprises.items():
+        next_last_update = TensorDict()
+        next_last_momentum = TensorDict()
+
+        for (param_name, surprise), (_, last_update), (_, last_momentum) in zip(surprises.items(), past_last_update.items(), past_last_momentum.items()):
 
             surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')
 
@@ -652,23 +681,27 @@ class NeuralMemory(Module):
             # derive momentum with associative scan - eq (10)
 
             if has_momentum:
-                update = self.assoc_scan(adaptive_momentum, surprise) # momentum is S / surprise in the paper
+                update = self.assoc_scan(adaptive_momentum, surprise, prev = last_momentum) # momentum is S / surprise in the paper
                 momentum = update
+                next_last_momentum[param_name] = momentum[:, -1]
 
             # use associative scan again for learned forgetting (weight decay) - eq (13)
 
-            update = self.assoc_scan(1. - decay_factor, update)
+            update = self.assoc_scan(1. - decay_factor, update, prev = last_update)
+            next_last_update[param_name] = update[:, -1]
 
             updates[param_name] = inverse_pack(update)
 
             if has_momentum:
                 next_momentum[param_name] = inverse_pack(momentum)
 
-        # compute the next weight per batch
+        # compute next states for inference, or titans-xl like training
 
-        last_update = updates.apply(lambda t: t[:, -1])
+        next_state = (next_last_update, next_last_momentum)
 
-        output = (updates, orig_values)
+        # returns
+
+        output = (updates, next_state, orig_values)
 
         if not return_aux_kv_loss:
             return output
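
The two chained scans realize the paper's update rule: eq (10) is the momentum recurrence S_t = eta_t * S_{t-1} + u_t (u_t being the surprise, i.e. the negated gradient), and eq (13) is the forgetting / weight-decay recurrence W_t = (1 - alpha_t) * W_{t-1} + S_t. The new next_last_update / next_last_momentum dicts simply carry W and S at a chunk's final position forward, so the next chunk (or the next inference step) continues the recurrence instead of restarting from zero. A per-parameter reference sketch of one chunk (mine, with flattened (batch, n, dim) shapes assumed for readability):

import torch

def chunk_update_reference(surprise, adaptive_momentum, decay_factor, last_update = None, last_momentum = None):
    # surprise: (b, n, d) negated gradients; gates: (b, n, 1), broadcastable
    S = torch.zeros_like(surprise[:, 0]) if last_momentum is None else last_momentum
    W = torch.zeros_like(surprise[:, 0]) if last_update is None else last_update
    updates = []
    for t in range(surprise.shape[1]):
        S = adaptive_momentum[:, t] * S + surprise[:, t]  # eq (10), momentum
        W = (1. - decay_factor[:, t]) * W + S             # eq (13), forgetting
        updates.append(W)
    # (W, S) at the last position is what gets threaded to the next chunk
    return torch.stack(updates, dim = 1), (W, S)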
@@ -678,7 +711,7 @@ class NeuralMemory(Module):
     def retrieve_memories(
         self,
         seq,
-        past_weights: dict[str, Tensor] | None = None,
+        past_weights: dict[str, Tensor],
         chunk_size = None
     ):
        chunk_size = default(chunk_size, self.retrieve_chunk_size)
@@ -700,13 +733,7 @@ class NeuralMemory(Module):
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
 
-        curr_weights = TensorDict(dict(self.memory_model.named_parameters()))
-
-        if exists(past_weights):
-            past_weights = TensorDict(past_weights)
-            assert past_weights.keys() == curr_weights.keys()
-
-            curr_weights = curr_weights + past_weights
+        curr_weights = TensorDict(past_weights)
 
         # sequence Float['b n d'] to queries
 
@@ -753,10 +780,77 @@ class NeuralMemory(Module):
 
         return values[:, :seq_len]
 
+    def forward_inference(
+        self,
+        token: Tensor,
+        seq_index = None, # the index of the token in the sequence, starts at 0
+        state = None,
+    ):
+
+        # unpack previous state
+
+        if not exists(state):
+            state = (None, None, None)
+
+        cache_store_seq, past_states, updates = state
+
+        seq_index = default(seq_index, 0)
+        curr_seq_len = seq_index + 1
+        batch = token.shape[0]
+
+        if token.ndim == 2:
+            token = rearrange(token, 'b d -> b 1 d')
+
+        # get memory model weights
+
+        weights = self.init_weights()
+
+        # increment the sequence cache which is at most the chunk size
+
+        cache_store_seq = safe_cat((cache_store_seq, token), dim = -2)
+
+        # early return empty memory, when no memories are stored for steps < first chunk size
+
+        if curr_seq_len < self.chunk_size:
+            empty_mem = self.init_empty_memory_embed(batch, 1)
+
+            return empty_mem, (cache_store_seq, past_states, updates)
+
+        # store if storage sequence cache hits the chunk size
+
+        next_states = past_states
+        store_seq_cache_len = cache_store_seq.shape[-2]
+
+        if not exists(updates):
+            updates = weights.clone().zero_()
+            updates = updates.apply(lambda t: repeat(t, '... -> b 1 ...', b = batch))
+
+        if store_seq_cache_len == self.chunk_size:
+
+            next_updates, next_states, _ = self.store_memories(
+                cache_store_seq,
+                weights,
+                past_state = past_states
+            )
+
+            updates = next_updates
+            cache_store_seq = None
+
+        # retrieve
+
+        retrieved = self.retrieve_memories(token, updates + weights, chunk_size = 1)
+
+        # next state tuple
+
+        next_state = (cache_store_seq, next_states, updates)
+
+        return retrieved, next_state
+
     def forward(
         self,
         seq,
         store_seq = None,
+        mem_model_weights: dict[str, Tensor] | None = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
@@ -773,20 +867,15 @@ class NeuralMemory(Module):
 
             return out, self.zero
 
-        if exists(past_state):
-            past_state = tuple(TensorDict(d) for d in past_state)
-
-        if not exists(past_state):
-            past_state = self.init_weights_and_momentum()
+        if not exists(mem_model_weights):
+            mem_model_weights = self.init_weights()
 
         store_seq = default(store_seq, seq)
         store_chunk_size = default(store_chunk_size, chunk_size)
 
-        (updates, values), aux_kv_recon_loss = self.store_memories(store_seq, past_state, chunk_size = store_chunk_size, return_aux_kv_loss = True)
-
-        past_weights, _ = past_state
+        (updates, next_state, values), aux_kv_recon_loss = self.store_memories(store_seq, mem_model_weights, chunk_size = store_chunk_size, return_aux_kv_loss = True)
 
-        retrieved = self.retrieve_memories(seq, past_weights + updates, chunk_size = chunk_size)
+        retrieved = self.retrieve_memories(seq, mem_model_weights + updates, chunk_size = chunk_size)
 
         output = retrieved
 
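
Taken together with the new test above: the state that forward_inference threads from step to step is the 3-tuple (cache_store_seq, past_states, updates), i.e. the pending tokens not yet folded into a chunk, the (last_update, last_momentum) scan states, and the accumulated per-parameter weight updates. Usage mirrors test_neural_mem_inference:

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 384, chunk_size = 64)

state = None # (cache_store_seq, past_states, updates)

for ind in range(256):
    token = torch.randn(2, 384) # one token per step: (batch, dim)

    # below the first full chunk this returns the learned empty-memory embedding;
    # once the cached tokens reach chunk_size they are stored and the cache resets
    retrieved, state = mem.forward_inference(token, seq_index = ind, state = state)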