titans-pytorch 0.1.23__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- titans_pytorch/mac_transformer.py +42 -13
- titans_pytorch/titans.py +83 -54
- {titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/METADATA +2 -2
- titans_pytorch-0.1.27.dist-info/RECORD +8 -0
- titans_pytorch-0.1.23.dist-info/RECORD +0 -8
- {titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/licenses/LICENSE +0 -0
titans_pytorch/mac_transformer.py CHANGED
@@ -510,10 +510,7 @@ class MemoryAsContextTransformer(Module):

         layers = tuple(range(1, depth + 1))

-
-        neural_memory_layers = layers if has_longterm_mems else ()
-
-        assert not (num_longterm_mem_tokens > 0 and len(neural_memory_layers) == 0), 'empty `neural_memory_layers` when longterm memory tokens are present'
+        neural_memory_layers = default(neural_memory_layers, layers)

         # mem, attn, and feedforward layers

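With this change, leaving `neural_memory_layers` unset places a neural memory module in every layer (i.e. `tuple(range(1, depth + 1))`) rather than only when long-term memory tokens are present. A minimal construction sketch, assuming the constructor arguments shown in the project README; the exact values are illustrative only:

import torch
from titans_pytorch import MemoryAsContextTransformer

transformer = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 256,
    depth = 2,
    segment_len = 128,
    num_persist_mem_tokens = 4,
    num_longterm_mem_tokens = 16,
    # neural_memory_layers is omitted: it now defaults to all layers
)

ids = torch.randint(0, 256, (1, 1023))

loss = transformer(ids, return_loss = True)
loss.backward()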
@@ -535,9 +532,10 @@ class MemoryAsContextTransformer(Module):
             )

             mem = None
+            mem_hyper_conn = None

             if layer in neural_memory_layers:
-
+                mem_hyper_conn = init_hyper_conn(dim = dim, add_branch_out_to_residual = not neural_mem_gate_attn_output)

                 mem = NeuralMemory(
                     dim = dim,
@@ -545,10 +543,12 @@ class MemoryAsContextTransformer(Module):
                     **neural_memory_kwargs
                 )

+
             ff = FeedForward(dim = dim, mult = ff_mult)

             self.layers.append(ModuleList([
-                mem,
+                mem_hyper_conn,
+                mem,
                 init_hyper_conn(dim = dim, branch = attn),
                 init_hyper_conn(dim = dim, branch = ff)
             ]))
@@ -691,8 +691,18 @@ class MemoryAsContextTransformer(Module):
         # kv caching

         is_inferencing = exists(cache)
-
+        assert not (is_inferencing and self.num_longterm_mem_tokens > 0)
+
+        if not exists(cache):
+            cache = (None, None)
+
+        kv_caches, neural_mem_caches = cache
+
+        kv_caches = iter(default(kv_caches, []))
+        neural_mem_caches = iter(default(neural_mem_caches, []))
+
         next_kv_caches = []
+        next_neural_mem_caches = []

         # value residual

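The cache handed back to the caller is now a pair, and each half is turned into an iterator so that every layer can pull its own entry with `next(..., None)`. A standalone sketch of that pattern, using local stand-ins for the repo's `exists`/`default` helpers:

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

cache = None  # what a caller passes on the very first step

if not exists(cache):
    cache = (None, None)

kv_caches, neural_mem_caches = cache

# per-layer consumption: missing caches simply yield None
kv_caches = iter(default(kv_caches, []))
neural_mem_caches = iter(default(neural_mem_caches, []))

for layer_index in range(4):
    kv_cache = next(kv_caches, None)              # None on the first step
    neural_mem_cache = next(neural_mem_caches, None)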
@@ -711,21 +721,37 @@ class MemoryAsContextTransformer(Module):

         x = self.expand_streams(x)

-        for mem, attn, ff in self.layers:
+        for mem_hyper_conn, mem, attn, ff in self.layers:

             retrieved = None
             attn_out_gates = None
+            next_neural_mem_cache = None

             # maybe neural memory

             if exists(mem):
-
-
+
+                mem_input, add_residual = mem_hyper_conn(x)
+
+                if not is_inferencing:
+                    retrieved, mem_kv_aux_loss = mem(
+                        mem_input,
+                        return_aux_kv_loss = True
+                    )
+
+                    kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
+
+                    next_neural_mem_cache = (seq_len, None, None, None)
+                else:
+                    retrieved, next_neural_mem_cache = mem.forward_inference(
+                        mem_input,
+                        state = next(neural_mem_caches, None)
+                    )

                 if self.gate_attn_output:
                     attn_out_gates = retrieved.sigmoid()
                 else:
-
+                    x = add_residual(retrieved)

             # attention

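Depending on `neural_mem_gate_attn_output`, the retrieved memories either gate the attention output through a sigmoid or are added back onto the residual stream via the hyper-connection's `add_residual`. A toy tensor sketch (shapes illustrative; the plain addition is a simplified stand-in for `add_residual`):

import torch

retrieved = torch.randn(2, 16, 64)   # output of the neural memory branch
attn_out = torch.randn(2, 16, 64)    # output of the attention branch

# gate path: memories modulate the attention output elementwise, gates in (0, 1)
gated_attn_out = attn_out * retrieved.sigmoid()

# residual path: memories are added to the stream instead (simplified stand-in)
x = torch.randn(2, 16, 64)
x = x + retrieved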
@@ -735,12 +761,15 @@ class MemoryAsContextTransformer(Module):
                 disable_flex_attn = disable_flex_attn,
                 flex_attn_fn = flex_attn_fn,
                 output_gating = attn_out_gates,
-                cache = next(
+                cache = next(kv_caches, None)
             )

             value_residual = default(value_residual, values)

+            # caches
+
             next_kv_caches.append(next_kv_cache)
+            next_neural_mem_caches.append(next_neural_mem_cache)

             # feedforward

@@ -775,7 +804,7 @@ class MemoryAsContextTransformer(Module):
         if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
             next_kv_caches = next_kv_caches[..., 0:0, :]

-        return logits, next_kv_caches
+        return logits, (next_kv_caches, next_neural_mem_caches)

         ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)

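The forward pass now returns `(logits, (next_kv_caches, next_neural_mem_caches))` instead of `(logits, next_kv_caches)`, so a sampling loop has to thread the whole tuple back in. A schematic sketch with a hypothetical `step` stand-in (only the cache plumbing mirrors the diff):

import torch

def step(token_ids, cache = None):
    # hypothetical stand-in for a call like transformer(token_ids, cache = cache);
    # only the shape of the return value mirrors the change above
    logits = torch.randn(token_ids.shape[0], token_ids.shape[1], 256)
    next_kv_caches = []          # placeholder per-layer kv caches
    next_neural_mem_caches = []  # placeholder per-layer neural memory caches
    return logits, (next_kv_caches, next_neural_mem_caches)

ids = torch.randint(0, 256, (1, 1))
cache = None

for _ in range(4):
    logits, cache = step(ids, cache = cache)               # cache is now a 2-tuple
    kv_caches, neural_mem_caches = cache                   # unpack both halves
    ids = logits[:, -1].argmax(dim = -1, keepdim = True)   # greedy next token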
titans_pytorch/titans.py CHANGED
@@ -324,15 +324,26 @@ class AssocScan(Module):
         super().__init__()
         self.use_accelerated = use_accelerated

-    def forward(
+    def forward(
+        self,
+        gates,
+        inputs,
+        prev = None,
+        remove_prev = None
+    ):
+        remove_prev = default(remove_prev, exists(prev))

         if exists(prev):
             inputs, _ = pack([prev, inputs], 'b * d')
             gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)

         if not self.use_accelerated:
-            _,
-
+            _, out = associative_scan(binary_operator, (gates, inputs))
+
+            if remove_prev:
+                out = out[:, 1:]
+
+            return out

         from accelerated_scan.triton import scan as triton_scan
         from accelerated_scan.warp import scan as warp_scan
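`AssocScan.forward` now accepts a `prev` element, which is prepended to the inputs (with a gate of 1) so that a scan can be resumed across chunks, and `remove_prev` strips that seed from the output again. A sequential reference loop sketching the same recurrence h_t = g_t * h_{t-1} + x_t (names and shapes are illustrative, not the library API):

import torch

def reference_scan(gates, inputs, prev = None, remove_prev = None):
    # gates, inputs: (batch, seq, dim); prev: (batch, dim) carried from a previous chunk
    remove_prev = (prev is not None) if remove_prev is None else remove_prev

    if prev is not None:
        inputs = torch.cat((prev.unsqueeze(1), inputs), dim = 1)
        gates = torch.nn.functional.pad(gates, (0, 0, 1, 0), value = 1.)

    h = torch.zeros_like(inputs[:, 0])
    out = []
    for t in range(inputs.shape[1]):
        h = gates[:, t] * h + inputs[:, t]
        out.append(h)
    out = torch.stack(out, dim = 1)

    if remove_prev:
        out = out[:, 1:]          # drop the seeded position, as in the diff

    return out

# resuming chunk by chunk gives the same result as one long scan
gates, inputs = torch.rand(2, 8, 4), torch.randn(2, 8, 4)
full = reference_scan(gates, inputs)
first = reference_scan(gates[:, :4], inputs[:, :4])
second = reference_scan(gates[:, 4:], inputs[:, 4:], prev = first[:, -1])
assert torch.allclose(full, torch.cat((first, second), dim = 1), atol = 1e-6)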
@@ -355,7 +366,12 @@ class AssocScan(Module):
             outputs = rearrange(outputs, 'b d n -> b n d')
             return outputs

-
+        out = accelerate_scan_fn(gates, inputs)
+
+        if remove_prev:
+            out = out[:, 1:]
+
+        return out

 # main neural memory

@@ -384,7 +400,6 @@ class NeuralMemory(Module):
         post_rmsnorm = True,
         qk_rmsnorm = False,
         accept_value_residual = False,
-        learned_mem_model_weights = True,
         max_grad_norm: float | None = None,
         use_accelerated_scan = False,
         activation: Module | None = None,
@@ -432,9 +447,6 @@ class NeuralMemory(Module):
         if not exists(model):
             model = MemoryMLP(dim_head, **default_model_kwargs)

-        if not learned_mem_model_weights:
-            model.requires_grad_(False)
-
         assert not exists(next(model.buffers(), None)), 'model cannot have buffers for now'

         # the memory is the weights of the model
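The `learned_mem_model_weights` flag is gone; if the memory network's initial weights should not be trained, the same effect can be had by freezing whatever module is passed in as `model`, as the removed branch did internally. A small sketch, assuming a custom `model` is passed in as the code above allows (the MLP below is an arbitrary stand-in, not `MemoryMLP`, and the commented call is hypothetical):

import torch.nn as nn

dim_head = 64

mlp = nn.Sequential(
    nn.Linear(dim_head, dim_head),
    nn.SiLU(),
    nn.Linear(dim_head, dim_head),
)

# what the removed flag used to do internally: freeze the memory model's initial weights
mlp.requires_grad_(False)

# mem = NeuralMemory(dim = 384, model = mlp, ...)   # hypothetical: then pass it in as `model`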
@@ -536,16 +548,9 @@ class NeuralMemory(Module):

         self.register_buffer('zero', torch.tensor(0.), persistent = False)

-    def
-
-
-        init_weights = params
-        init_momentum = params.clone().zero_()
-
-        if zero_weights:
-            init_weights = params.clone().zero_()
-
-        return init_weights, init_momentum
+    def init_weights(self):
+        weights = TensorDict(dict(self.memory_model.named_parameters()))
+        return weights

     def init_empty_memory_embed(self, batch, seq_len):
         return repeat(self.empty_memory_embed, 'd -> b n d', b = batch, n = seq_len)
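`init_weights` now just snapshots the memory model's parameters into a `TensorDict` keyed by parameter name, which is what the rest of the module treats as "the memory". A minimal sketch of that structure, assuming a tensordict version that, like the code above, allows omitting `batch_size`:

import torch.nn as nn
from tensordict import TensorDict

memory_model = nn.Linear(64, 64)

# the memory is the weights of the model, addressable by name
weights = TensorDict(dict(memory_model.named_parameters()))

# same structure, zeroed out - this is how per-chunk updates start
updates = weights.clone().zero_()

print(sorted(weights.keys()))   # ['bias', 'weight']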
@@ -553,7 +558,8 @@ class NeuralMemory(Module):
     def store_memories(
         self,
         seq,
-
+        weights: dict[str, Tensor],
+        past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
         value_residual = None
@@ -565,8 +571,7 @@ class NeuralMemory(Module):
         # handle edge case

         if seq_len < chunk_size:
-
-            return TensorDict(past_weight).clone().zero_(), self.zero
+            return TensorDict(weights).clone().zero_(), self.zero

         seq = self.store_norm(seq)

@@ -577,10 +582,9 @@ class NeuralMemory(Module):

         seq = seq[:, :round_down_seq_len]

-        #
+        # weights of the memory network

-
-        curr_weights, past_momentum = past_state
+        weights = TensorDict(weights)

         # derive learned hparams for optimization of memory network

@@ -630,7 +634,7 @@ class NeuralMemory(Module):

         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

-        grads, aux_kv_recon_loss = self.per_sample_grad_fn(dict(
+        grads, aux_kv_recon_loss = self.per_sample_grad_fn(dict(weights), keys, adaptive_lr, values)

         grads = TensorDict(grads)

@@ -652,12 +656,23 @@ class NeuralMemory(Module):

         surprises = grads.apply(lambda t: -t)

+        # past states
+
+        if not exists(past_state):
+            empty_dict = {key: None for key in weights.keys()}
+            past_state = (empty_dict, empty_dict)
+
+        past_last_update, past_last_momentum = past_state
+
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

         next_momentum = TensorDict() if has_momentum else None
         updates = TensorDict()

-
+        next_last_update = TensorDict()
+        next_last_momentum = TensorDict()
+
+        for (param_name, surprise), (_, last_update), (_, last_momentum) in zip(surprises.items(), past_last_update.items(), past_last_momentum.items()):

             surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')

@@ -666,23 +681,27 @@ class NeuralMemory(Module):
             # derive momentum with associative scan - eq (10)

             if has_momentum:
-                update = self.assoc_scan(adaptive_momentum, surprise) # momentum is S / surprise in the paper
+                update = self.assoc_scan(adaptive_momentum, surprise, prev = last_momentum) # momentum is S / surprise in the paper
                 momentum = update
+                next_last_momentum[param_name] = momentum[:, -1]

             # use associative scan again for learned forgetting (weight decay) - eq (13)

-            update = self.assoc_scan(1. - decay_factor, update)
+            update = self.assoc_scan(1. - decay_factor, update, prev = last_update)
+            next_last_update[param_name] = update[:, -1]

             updates[param_name] = inverse_pack(update)

             if has_momentum:
                 next_momentum[param_name] = inverse_pack(momentum)

-        # compute
+        # compute next states for inference, or titans-xl like training

-
+        next_state = (next_last_update, next_last_momentum)

-
+        # returns
+
+        output = (updates, next_state, orig_values)

         if not return_aux_kv_loss:
             return output
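Both scans now take a `prev` seed and also emit their last position, so the momentum state (eq. 10) and the decayed update state (eq. 13) can be carried from one chunk to the next at inference time. A self-contained toy sketch of that chaining (shapes and the local `scan` helper are illustrative only):

import torch

def scan(gates, x, prev = None):
    # sequential stand-in for the associative scan: h_t = g_t * h_{t-1} + x_t
    h = prev if prev is not None else torch.zeros_like(x[:, 0])
    outs = []
    for t in range(x.shape[1]):
        h = gates[:, t] * h + x[:, t]
        outs.append(h)
    return torch.stack(outs, dim = 1)

b, n, d = 2, 8, 4
surprise = torch.randn(b, n, d)              # negated grads for one parameter, flattened
adaptive_momentum = torch.rand(b, n, 1)
decay_factor = torch.rand(b, n, 1) * 0.1

last_momentum = torch.zeros(b, d)            # carried in via past_state
last_update = torch.zeros(b, d)

momentum = scan(adaptive_momentum, surprise, prev = last_momentum)   # eq (10)
update = scan(1. - decay_factor, momentum, prev = last_update)       # eq (13)

# what gets stashed in next_state for the following chunk
next_last_momentum, next_last_update = momentum[:, -1], update[:, -1]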
@@ -764,21 +783,25 @@ class NeuralMemory(Module):
     def forward_inference(
         self,
         token: Tensor,
-
-        mem_model_state = None,
-        cache_store_seq = None
+        state = None,
     ):
-
+
+        # unpack previous state
+
+        if not exists(state):
+            state = (0, None, None, None)
+
+        seq_index, cache_store_seq, past_states, updates = state
+
         curr_seq_len = seq_index + 1
         batch = token.shape[0]

         if token.ndim == 2:
             token = rearrange(token, 'b d -> b 1 d')

-        #
+        # get memory model weights

-
-        mem_model_state = self.init_weights_and_momentum()
+        weights = self.init_weights()

         # increment the sequence cache which is at most the chunk size

@@ -789,32 +812,43 @@ class NeuralMemory(Module):
         if curr_seq_len < self.chunk_size:
             empty_mem = self.init_empty_memory_embed(batch, 1)

-            return empty_mem, cache_store_seq,
+            return empty_mem, (curr_seq_len, cache_store_seq, past_states, updates)

         # store if storage sequence cache hits the chunk size

+        next_states = past_states
         store_seq_cache_len = cache_store_seq.shape[-2]

+        if not exists(updates):
+            updates = weights.clone().zero_()
+            updates = updates.apply(lambda t: repeat(t, '... -> b 1 ...', b = batch))
+
         if store_seq_cache_len == self.chunk_size:
-            updates, _ = self.store_memories(cache_store_seq, mem_model_state)

-
-
+            next_updates, next_states, _ = self.store_memories(
+                cache_store_seq,
+                weights,
+                past_state = past_states
+            )

+            updates = next_updates
             cache_store_seq = None

         # retrieve

-
+        retrieved = self.retrieve_memories(token, updates + weights, chunk_size = 1)
+
+        # next state tuple

-
+        next_state = (curr_seq_len, cache_store_seq, next_states, updates)

-        return retrieved,
+        return retrieved, next_state

     def forward(
         self,
         seq,
         store_seq = None,
+        mem_model_weights: dict[str, Tensor] | None = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
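`forward_inference` now takes and returns a single state tuple `(seq_index, cache_store_seq, past_states, updates)`, so token-by-token decoding just threads one object through. A usage sketch, assuming the `dim`/`chunk_size` constructor arguments from the project README:

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 384, chunk_size = 64)

state = None   # becomes (seq_index, cache_store_seq, past_states, updates)

for _ in range(8):
    token = torch.randn(2, 384)                       # one token per step, (batch, dim)
    retrieved, state = mem.forward_inference(token, state = state)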
@@ -831,20 +865,15 @@ class NeuralMemory(Module):

             return out, self.zero

-        if exists(
-
-
-        if not exists(past_state):
-            past_state = self.init_weights_and_momentum()
+        if not exists(mem_model_weights):
+            mem_model_weights = self.init_weights()

         store_seq = default(store_seq, seq)
         store_chunk_size = default(store_chunk_size, chunk_size)

-        (updates, values), aux_kv_recon_loss = self.store_memories(store_seq,
-
-        past_weights, _ = past_state
+        (updates, next_state, values), aux_kv_recon_loss = self.store_memories(store_seq, mem_model_weights, chunk_size = store_chunk_size, return_aux_kv_loss = True)

-        retrieved = self.retrieve_memories(seq,
+        retrieved = self.retrieve_memories(seq, mem_model_weights + updates, chunk_size = chunk_size)

         output = retrieved

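`forward` now resolves `mem_model_weights` to `init_weights()` when the caller passes nothing, so ordinary usage is unchanged; the new argument only matters when the memory-network weights are supplied explicitly. A sketch of the default path, again assuming the README-style constructor:

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 384, chunk_size = 64)

seq = torch.randn(2, 1024, 384)

# mem_model_weights is omitted, so the module's own parameters are used
retrieved = mem(seq)

assert retrieved.shape == seq.shape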
{titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.23
+Version: 0.1.27
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -35,7 +35,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
-Requires-Dist: axial-positional-embedding>=0.3.
+Requires-Dist: axial-positional-embedding>=0.3.10
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.9
titans_pytorch-0.1.27.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=Staf9hRQ44QAL23bSGh4VSB8NeGtMri-JdiZdgirJiU,23587
+titans_pytorch/titans.py,sha256=gjoDcTsvw5X2d1I2xq4cM45YJIBqtLFuws8_jVylW_4,25746
+titans_pytorch-0.1.27.dist-info/METADATA,sha256=AZ5-_d9o_khm6jaky1zoKyXB1hDQNifbS061v_b4McQ,6815
+titans_pytorch-0.1.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.1.27.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.1.27.dist-info/RECORD,,
titans_pytorch-0.1.23.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=Ejq1r3GQQnlT1Fo4McaOOie19t1HjwVlYbD90GLQCYI,22859
-titans_pytorch/titans.py,sha256=WbagKMYDs-3NoW2j_pAyHEnvR9QzH3A9WntHuV_FKOo,25109
-titans_pytorch-0.1.23.dist-info/METADATA,sha256=H7QbLscawNObHGeoTbnKbf-NOqkMqWCu4yWeZJ0yKMA,6814
-titans_pytorch-0.1.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.1.23.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.1.23.dist-info/RECORD,,
{titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/WHEEL: file without changes
{titans_pytorch-0.1.23.dist-info → titans_pytorch-0.1.27.dist-info}/licenses/LICENSE: file without changes