titans-pytorch 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- titans_pytorch/mac_transformer.py +26 -4
- titans_pytorch/neural_memory.py +45 -11
- {titans_pytorch-0.1.32.dist-info → titans_pytorch-0.1.34.dist-info}/METADATA +1 -1
- titans_pytorch-0.1.34.dist-info/RECORD +8 -0
- titans_pytorch-0.1.32.dist-info/RECORD +0 -8
- {titans_pytorch-0.1.32.dist-info → titans_pytorch-0.1.34.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.1.32.dist-info → titans_pytorch-0.1.34.dist-info}/licenses/LICENSE +0 -0
titans_pytorch/mac_transformer.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 from typing import Callable

 from math import ceil
+from copy import deepcopy
 from functools import partial
 from collections import namedtuple

@@ -485,11 +486,13 @@ class MemoryAsContextTransformer(Module):
     heads = 8,
     ff_mult = 4,
     num_residual_streams = 4,
+    neural_memory_model: Module | None = None,
     neural_memory_kwargs: dict = dict(),
     neural_memory_layers: tuple[int, ...] | None = None,
     aux_kv_recon_loss_weight = 0.,
     use_flex_attn = False,
-    sliding_window_attn = False
+    sliding_window_attn = False,
+    weight_tie_memory_model = False
 ):
     super().__init__()

@@ -523,6 +526,15 @@ class MemoryAsContextTransformer(Module):

 neural_memory_layers = default(neural_memory_layers, layers)

+# weight tying neural memory model
+
+maybe_copy = deepcopy if not weight_tie_memory_model else identity
+
+if weight_tie_memory_model:
+    assert exists(neural_memory_model), '`neural_memory_model` must be explicitly set'
+
+self.weight_tie_memory_model = weight_tie_memory_model
+
 # mem, attn, and feedforward layers

 for layer in layers:
@@ -551,6 +563,7 @@ class MemoryAsContextTransformer(Module):
 mem = NeuralMemory(
     dim = dim,
     chunk_size = self.neural_memory_segment_len,
+    model = maybe_copy(neural_memory_model),
     **neural_memory_kwargs
 )

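The two hunks above cover the construction-time side of weight tying: `maybe_copy` is `deepcopy` when each layer should own an independent memory model and `identity` when one instance is shared across layers. Below is a minimal, library-free sketch of that behaviour; the module, feature size, and layer count are placeholders, not the package's defaults.

```python
# Sketch only: shows what `maybe_copy = deepcopy if not weight_tie_memory_model else identity`
# achieves when building one memory model per layer.
from copy import deepcopy
import torch.nn as nn

def identity(x):
    return x

weight_tie_memory_model = True

# placeholder memory model, not the package's default architecture
neural_memory_model = nn.Sequential(nn.Linear(64, 64), nn.SiLU(), nn.Linear(64, 64))

maybe_copy = deepcopy if not weight_tie_memory_model else identity

# one memory model per memory layer (4 is an arbitrary example depth)
mem_models = [maybe_copy(neural_memory_model) for _ in range(4)]

# tied: every layer holds the same instance; untied: each layer holds its own deep copy
assert all(m is neural_memory_model for m in mem_models) == weight_tie_memory_model
```

Note that with `weight_tie_memory_model = True` the constructor now requires `neural_memory_model` to be passed explicitly, per the assert added above, since there is no per-layer default left to copy.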
@@ -683,7 +696,7 @@ class MemoryAsContextTransformer(Module):

 # math

-batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size
+batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, weight_tie_memory_model = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.weight_tie_memory_model

 seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)

@@ -736,6 +749,10 @@ class MemoryAsContextTransformer(Module):
 next_kv_caches = []
 next_neural_mem_caches = []

+# weight tied neural memory
+
+neural_memory_updates = None
+
 # value residual

 value_residual = None
@@ -769,7 +786,8 @@ class MemoryAsContextTransformer(Module):
 if not is_inferencing:
     (retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
         mem_input,
-        return_aux_kv_loss = True
+        return_aux_kv_loss = True,
+        prev_layer_updates = neural_memory_updates
     )

     kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
@@ -777,9 +795,13 @@ class MemoryAsContextTransformer(Module):
 else:
     retrieved, next_neural_mem_cache = mem.forward_inference(
         mem_input,
-        state = next(neural_mem_caches, None)
+        state = next(neural_mem_caches, None),
+        prev_layer_updates = neural_memory_updates
     )

+if weight_tie_memory_model:
+    neural_memory_updates = next_neural_mem_cache.updates
+
 if self.gate_attn_output:
     attn_out_gates = retrieved.sigmoid()
 else:
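In the forward pass, the diff threads a single `neural_memory_updates` value through the layer loop: each memory layer receives the previous layer's accumulated updates as `prev_layer_updates`, and when the memory model is weight tied it publishes its own `next_neural_mem_cache.updates` for the next layer. A toy, self-contained sketch of that threading follows; the layer function is a stand-in, not `NeuralMemory`.

```python
# Sketch only: the update-threading pattern from the hunks above, with a toy memory layer.
from collections import namedtuple
import torch

NeuralMemCache = namedtuple('NeuralMemCache', ['seq', 'cache_store_segment', 'states', 'updates'])

def toy_memory_layer(x, prev_layer_updates = None):
    # stand-in for NeuralMemory: "updates" here are just a running per-token statistic
    updates = x.mean(dim = -1, keepdim = True)
    if prev_layer_updates is not None:
        updates = updates + prev_layer_updates
    retrieved = x  # stand-in for retrieved memories
    return retrieved, NeuralMemCache(x.shape[1], None, None, updates)

weight_tie_memory_model = True
x = torch.randn(2, 16, 8)

neural_memory_updates = None  # threaded across layers, as in the diff

for _ in range(4):  # four toy "layers"
    retrieved, cache = toy_memory_layer(x, prev_layer_updates = neural_memory_updates)

    if weight_tie_memory_model:
        # the next layer's memory starts from this layer's accumulated updates
        neural_memory_updates = cache.updates
```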
titans_pytorch/neural_memory.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Callable

 import math
 from functools import partial
+from collections import namedtuple

 import torch
 from torch import nn, cat, Tensor
@@ -33,6 +34,8 @@ w - num memory network weight parameters

 LinearNoBias = partial(Linear, bias = False)

+NeuralMemCache = namedtuple('NeuralMemCache', ['seq', 'cache_store_segment', 'states', 'updates'])
+
 # functions

 def exists(v):
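Replacing the bare state tuple with a namedtuple lets callers address cache fields by name, which is what the MAC transformer now relies on via `next_neural_mem_cache.updates`. A quick illustration:

```python
# Sketch only: named access to the memory cache fields.
from collections import namedtuple

NeuralMemCache = namedtuple('NeuralMemCache', ['seq', 'cache_store_segment', 'states', 'updates'])

cache = NeuralMemCache(seq = 128, cache_store_segment = None, states = None, updates = {'w1': None})

# name and position refer to the same field, so existing tuple-unpacking code keeps working
assert cache.updates is cache[3]
```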
@@ -605,7 +608,7 @@ class NeuralMemory(Module):
 # improvise (or perhaps correcting to) a solution

 if exists(prev_layer_updates):
-    prev_layer_updates = TensorDict(
+    prev_layer_updates = TensorDict(prev_layer_updates)

     weights = weights + prev_layer_updates

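When `prev_layer_updates` is supplied, the previous layer's per-parameter deltas are added onto the current memory weights before surprise gradients are computed. In the library both operands are `TensorDict`s; the plain-dict sketch below shows the equivalent key-wise addition, with illustrative shapes.

```python
# Sketch only: key-wise addition of previous-layer updates onto the current memory weights.
import torch

weights = {'w1': torch.zeros(4, 4), 'w2': torch.zeros(4)}
prev_layer_updates = {'w1': torch.ones(4, 4) * 0.1, 'w2': torch.ones(4) * 0.1}

# equivalent of `weights + prev_layer_updates` on TensorDicts: add parameter by parameter
weights = {name: weights[name] + prev_layer_updates[name] for name in weights}
```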
@@ -657,6 +660,11 @@ class NeuralMemory(Module):

 adaptive_lr = rearrange(adaptive_lr, 'b (n c) -> (b n) c', c = chunk_size)

+# flatten batch and time if surprise depends on previous layer memory model
+
+if exists(prev_layer_updates):
+    weights = weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+
 # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

 grads, aux_kv_recon_loss = per_sample_grad_fn(dict(weights), keys, adaptive_lr, values)
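The flattening step merges the batch and chunk dimensions so the per-sample gradient function sees one entry per (batch, chunk) pair once the weights carry a time axis from the previous layer's updates. A shape-only sketch with hypothetical sizes:

```python
# Sketch only: effect of rearrange(t, 'b n ... -> (b n) ...') on a per-chunk weight tensor.
import torch
from einops import rearrange

w = torch.randn(2, 3, 16, 16)            # (batch, chunks, *param_shape) -- hypothetical sizes
flat = rearrange(w, 'b n ... -> (b n) ...')
print(flat.shape)                          # torch.Size([6, 16, 16])
```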
@@ -737,7 +745,8 @@ class NeuralMemory(Module):
     self,
     seq,
     past_weights: dict[str, Tensor],
-    chunk_size = None
+    chunk_size = None,
+    prev_layer_updates: dict[str, Tensor] | None = None
 ):
     chunk_size = default(chunk_size, self.retrieve_chunk_size)
     batch, seq_len = seq.shape[:2]
@@ -760,6 +769,9 @@ class NeuralMemory(Module):

 curr_weights = TensorDict(past_weights)

+if exists(prev_layer_updates):
+    curr_weights = curr_weights + TensorDict(prev_layer_updates)
+
 # sequence Float['b n d'] to queries

 queries = self.to_queries(seq)
@@ -810,6 +822,7 @@ class NeuralMemory(Module):
     self,
     token: Tensor,
     state = None,
+    prev_layer_updates: dict[str, Tensor] | None = None
 ):

     # unpack previous state
@@ -838,7 +851,7 @@ class NeuralMemory(Module):
 if curr_seq_len < self.chunk_size:
     empty_mem = self.init_empty_memory_embed(batch, 1)

-    return empty_mem, (curr_seq_len, cache_store_seq, past_states, updates)
+    return empty_mem, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)

 # store if storage sequence cache hits the chunk size

@@ -848,13 +861,20 @@ class NeuralMemory(Module):
 if not exists(updates):
     updates = weights.clone().zero_()
     updates = updates.apply(lambda t: repeat(t, '... -> b 1 ...', b = batch))
+else:
+    updates = updates.apply(lambda t: t[:, -1:])
+
+if exists(prev_layer_updates):
+    prev_layer_updates = TensorDict(prev_layer_updates)
+    prev_layer_updates = prev_layer_updates.apply(lambda t: t[:, -1:])

 if store_seq_cache_len == self.chunk_size:

     next_updates, next_states, _ = self.store_memories(
         cache_store_seq,
         weights,
-        past_state = past_states
+        past_state = past_states,
+        prev_layer_updates = prev_layer_updates,
     )

     updates = next_updates
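At inference time, previously accumulated updates (and any incoming `prev_layer_updates`) are sliced down to their most recent chunk before the next store step. A shape-only sketch with hypothetical sizes:

```python
# Sketch only: effect of .apply(lambda t: t[:, -1:]) on accumulated updates.
import torch

updates = torch.randn(2, 5, 16, 16)   # (batch, chunks stored so far, *param_shape) -- hypothetical
latest = updates[:, -1:]               # keep only the most recent chunk's update
print(latest.shape)                    # torch.Size([2, 1, 16, 16])
```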
@@ -866,7 +886,7 @@ class NeuralMemory(Module):

 # next state tuple

-next_state = (curr_seq_len, cache_store_seq, next_states, updates)
+next_state = NeuralMemCache(curr_seq_len, cache_store_seq, next_states, updates)

 return retrieved, next_state

@@ -880,7 +900,8 @@ class NeuralMemory(Module):
     chunk_size = None,
     store_chunk_size = None,
     return_values = False,
-    return_next_state = False
+    return_next_state = False,
+    prev_layer_updates: dict[str, Tensor] | None = None
 ):
     batch, seq_len = seq.shape[:2]

@@ -899,15 +920,30 @@ class NeuralMemory(Module):
 if not exists(mem_model_weights):
     mem_model_weights = self.init_weights()

+# store
+
 store_seq = default(store_seq, seq)

 store_seq_len = store_seq.shape[-2]
 store_chunk_size = default(store_chunk_size, chunk_size, self.store_chunk_size)
 remainder = store_seq_len % store_chunk_size

-(updates, next_state, values), aux_kv_recon_loss = self.store_memories(
+(updates, next_state, values), aux_kv_recon_loss = self.store_memories(
+    store_seq,
+    mem_model_weights,
+    chunk_size = store_chunk_size,
+    prev_layer_updates = prev_layer_updates,
+    return_aux_kv_loss = True
+)
+
+# retrieve

-retrieved = self.retrieve_memories(
+retrieved = self.retrieve_memories(
+    seq,
+    mem_model_weights + updates,
+    chunk_size = chunk_size,
+    prev_layer_updates = prev_layer_updates
+)

 # determine state for the storing of memories
 # for transformer-xl like training with neural memory as well as inferencing with initial prompt
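The reorganized `forward` stores first and then retrieves against the base weights plus the updates just written, passing `prev_layer_updates` into both steps. A toy, library-free sketch of that ordering follows; the two helpers are stand-ins, not the package API.

```python
# Sketch only: store-then-retrieve ordering, with previous-layer updates folded into both steps.
import torch

def toy_store(seq, weights, prev_layer_updates = None):
    # fold the previous layer's deltas into the weights before computing "surprise" updates
    base = weights if prev_layer_updates is None else {
        k: weights[k] + prev_layer_updates[k] for k in weights
    }
    # stand-in update: a tiny constant delta per parameter
    return {k: torch.full_like(v, 0.01) for k, v in base.items()}

def toy_retrieve(seq, weights):
    return seq  # stand-in for retrieval from the memory model

weights = {'w1': torch.zeros(8, 8)}
prev = {'w1': torch.ones(8, 8) * 0.5}
seq = torch.randn(2, 16, 8)

updates = toy_store(seq, weights, prev_layer_updates = prev)

# retrieve against base weights plus the freshly stored updates (mem_model_weights + updates)
retrieved = toy_retrieve(seq, {k: weights[k] + updates[k] for k in weights})
```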
@@ -917,9 +953,7 @@ class NeuralMemory(Module):
 if remainder > 0:
     cache_store_seq = store_seq[:, -remainder:]

-
-
-next_store_state = (seq_len, cache_store_seq, next_state, updates)
+next_store_state = NeuralMemCache(seq_len, cache_store_seq, next_state, updates)

 output = (retrieved, next_store_state)

titans_pytorch-0.1.34.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=rMT99CPQFH4Gudp0FmVPWGKfhBf6xksGEaFEcOVdqjs,230
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=JvA4mhQaW9LD4j6boRUfLfjyzDCtjqybIr4Ajeio8n8,25708
+titans_pytorch/neural_memory.py,sha256=nNAxhkubuHCGs3bty_eA_yBhWqepPZJgKKvkWXO6IK4,28653
+titans_pytorch-0.1.34.dist-info/METADATA,sha256=pVgjCX_YTT9_5WPcFfXpoaBvzrg1-esvwS0kPpeJAYU,6826
+titans_pytorch-0.1.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.1.34.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.1.34.dist-info/RECORD,,
titans_pytorch-0.1.32.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=rMT99CPQFH4Gudp0FmVPWGKfhBf6xksGEaFEcOVdqjs,230
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=Cui-hCl6X4UVGmuyoKCSKWbag9Yrc-a2MrfVkHM-z0A,24828
-titans_pytorch/neural_memory.py,sha256=Vfo1z1VztPDDXgFxjkiyOP29daDE7KTdnZeWXifvCJI,27456
-titans_pytorch-0.1.32.dist-info/METADATA,sha256=_HPPht8nhLwH9GzLyZI-fh8JBSEoSxkENCSU2xuU_6A,6826
-titans_pytorch-0.1.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.1.32.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.1.32.dist-info/RECORD,,
{titans_pytorch-0.1.32.dist-info → titans_pytorch-0.1.34.dist-info}/WHEEL
File without changes

{titans_pytorch-0.1.32.dist-info → titans_pytorch-0.1.34.dist-info}/licenses/LICENSE
File without changes