titans-pytorch 0.3.15__tar.gz → 0.3.19__tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.3.15
+Version: 0.3.19
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -38,7 +38,7 @@ Requires-Dist: accelerated-scan>=0.2.0
 Requires-Dist: axial-positional-embedding>=0.3.10
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
-Requires-Dist: hyper-connections>=0.1.10
+Requires-Dist: hyper-connections>=0.1.11
 Requires-Dist: ninja
 Requires-Dist: rotary-embedding-torch
 Requires-Dist: tensordict

@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.3.15"
+version = "0.3.19"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -29,7 +29,7 @@ dependencies = [
     "axial_positional_embedding>=0.3.10",
     "einops>=0.8.0",
     "einx>=0.3.0",
-    "hyper-connections>=0.1.10",
+    "hyper-connections>=0.1.11",
     "Ninja",
     "rotary-embedding-torch",
     "tensordict",

@@ -200,6 +200,7 @@ def test_neural_mem_chaining_with_batch_size():
 @pytest.mark.parametrize('neural_mem_segment_len', (8, 16))
 @pytest.mark.parametrize('neural_mem_weight_residual', (False, True))
 @pytest.mark.parametrize('neural_mem_batch_size', (None, 64))
+@pytest.mark.parametrize('neural_mem_kv_receives_diff_views', (False, True))
 @pytest.mark.parametrize('neural_mem_momentum', (False, True))
 def test_mac(
     seq_len,
@@ -209,6 +210,7 @@ def test_mac(
     neural_mem_segment_len,
     neural_mem_weight_residual,
     neural_mem_batch_size,
+    neural_mem_kv_receives_diff_views,
     neural_mem_momentum
 ):
     transformer = MemoryAsContextTransformer(
@@ -221,6 +223,7 @@ def test_mac(
         neural_mem_gate_attn_output = neural_mem_gate_attn_output,
         neural_memory_segment_len = neural_mem_segment_len,
         neural_memory_batch_size = neural_mem_batch_size,
+        neural_memory_kv_receives_diff_views = neural_mem_kv_receives_diff_views,
         neural_mem_weight_residual = neural_mem_weight_residual,
         neural_memory_kwargs = dict(
            momentum = neural_mem_momentum
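
For orientation, a minimal end-to-end sketch of how the new flag is meant to be used, mirroring the test above. Apart from `neural_memory_kv_receives_diff_views`, the constructor arguments and their values are illustrative assumptions, not taken from this diff:

    import torch
    from titans_pytorch import MemoryAsContextTransformer

    # all hyperparameters below are small, assumed values for illustration;
    # only neural_memory_kv_receives_diff_views is the option added in this release
    transformer = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 64,
        depth = 2,
        segment_len = 32,
        num_persist_mem_tokens = 4,
        num_longterm_mem_tokens = 4,
        neural_memory_kv_receives_diff_views = True
    )

    ids = torch.randint(0, 256, (1, 512))
    logits = transformer(ids)   # expected shape: (1, 512, 256)
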
@@ -483,6 +483,7 @@ class MemoryAsContextTransformer(Module):
         num_longterm_mem_tokens = 0,
         num_persist_mem_tokens = 0,
         neural_memory_batch_size = None,
+        neural_memory_kv_receives_diff_views = False,
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
@@ -560,13 +561,14 @@ class MemoryAsContextTransformer(Module):
             mem_hyper_conn = None

             if layer in neural_memory_layers:
-                mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output)
+                mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output, num_input_views = 2 if neural_memory_kv_receives_diff_views else 1)

                 mem = NeuralMemory(
                     dim = dim,
                     chunk_size = self.neural_memory_segment_len,
                     batch_size = neural_memory_batch_size,
                     model = deepcopy(neural_memory_model),
+                    kv_receives_diff_views = neural_memory_kv_receives_diff_views,
                     accept_weight_residual = neural_mem_weight_residual and not is_first_neural_mem,
                     **neural_memory_kwargs
                 )
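
Conceptually, passing `num_input_views = 2` asks the hyper connection to hand the memory branch two differently mixed views of the residual stream(s), so that keys and values need not be derived from the same tensor; this is presumably also why the dependency floor on hyper-connections moved to 0.1.11 in this release. The sketch below illustrates the idea only; it is not the hyper-connections package's implementation, and every name in it is made up:

    import torch

    # assume the hyper connection maintains several residual streams and
    # learns, per view, a weighting over them (purely illustrative)
    num_streams, batch, seq_len, dim = 4, 2, 64, 32
    residual_streams = torch.randn(num_streams, batch, seq_len, dim)

    num_input_views = 2
    mix = torch.randn(num_input_views, num_streams).softmax(dim = -1)

    # each view is its own weighted combination of the residual streams,
    # stacked along a leading "view" dimension for the memory branch
    views = torch.einsum('v s, s b n d -> v b n d', mix, residual_streams)
    assert views.shape == (num_input_views, batch, seq_len, dim)
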
@@ -231,6 +231,7 @@ class NeuralMemory(Module):
         momentum_order = 1,
         learned_momentum_combine = False,
         learned_combine_include_zeroth = False,
+        kv_receives_diff_views = False, # to address an issue raised by a phd student (who will be credited if experiments are green). basically the issue raised is that the memory MLP is only learning Wk @ Wv linear mapping and that may not be expressive enough. we will use hyper connections to allow the network to choose different previous layer inputs as keys / values and see if that does anything
         pre_rmsnorm = True,
         post_rmsnorm = False,
         qk_rmsnorm = False,
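
A rough reading of the concern in the comment above, stated slightly more explicitly (this is an interpretation, and the comment itself flags the idea as experimental): if keys and values are projections of the same input x, say k = W_k x and v = W_v x, then the map M the memory is trained to fit must satisfy

    M(W_k x) ≈ W_v x   for all x,

and for a linear memory the best such M is essentially W_v composed with a (pseudo)inverse of W_k, a single fixed matrix that does not depend on the sequence content. Sourcing keys and values from different previous-layer views removes that constraint, which is what `kv_receives_diff_views` is meant to test.
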
@@ -265,6 +266,10 @@ class NeuralMemory(Module):

         self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)

+        # key values receiving different views
+
+        self.kv_receives_diff_views = kv_receives_diff_views
+
         # norms

         self.retrieve_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
@@ -358,7 +363,9 @@ class NeuralMemory(Module):

         # keys and values for storing to the model

-        self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
+        self.to_keys = Sequential(LinearNoBias(dim, dim_inner), activation)
+        self.to_values = Sequential(LinearNoBias(dim, dim_inner), activation)
+
         self.store_memory_loss_fn = store_memory_loss_fn

         # `chunk_size` refers to chunk size used for storing to memory model weights
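
Splitting the fused `to_keys_values` projection into `to_keys` and `to_values` is weight-for-weight equivalent to the old code when both receive the same input; the point of the split is that they can now receive different inputs. A small sketch of that equivalence (the activation from the diff is omitted for brevity, and the layer names here are stand-ins):

    import torch
    from torch import nn

    dim, dim_inner = 32, 64
    x = torch.randn(2, 16, dim)

    # old: one fused projection, chunked into keys and values
    fused = nn.Linear(dim, dim_inner * 2, bias = False)
    keys_old, values_old = fused(x).chunk(2, dim = -1)

    # new: two independent projections, which may receive different inputs
    to_keys   = nn.Linear(dim, dim_inner, bias = False)
    to_values = nn.Linear(dim, dim_inner, bias = False)

    # copying the fused weights over reproduces the old behaviour exactly
    with torch.no_grad():
        to_keys.weight.copy_(fused.weight[:dim_inner])
        to_values.weight.copy_(fused.weight[dim_inner:])

    assert torch.allclose(to_keys(x), keys_old, atol = 1e-6)
    assert torch.allclose(to_values(x), values_old, atol = 1e-6)
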
@@ -504,7 +511,14 @@ class NeuralMemory(Module):
         seq_index = 0,
         prev_weights = None
     ):
-        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, self.store_chunk_size
+        if self.kv_receives_diff_views:
+            _, batch, seq_len = seq.shape[:3]
+        else:
+            batch, seq_len = seq.shape[:2]
+
+        # shapes and variables
+
+        heads, chunk_size = self.heads, self.store_chunk_size

         # curtail sequence by multiple of the chunk size
         # only a complete chunk of the sequence provides the memory for the next chunk
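
When the flag is set, `store_memories` evidently receives the two views stacked along a leading dimension, which is why batch and sequence sizes are now read from `shape[:3]` and why the next hunk switches to ellipsis-based slicing. A shape-only sketch (all sizes arbitrary):

    import torch

    batch, seq_len, dim, chunk = 2, 50, 32, 16

    single  = torch.randn(batch, seq_len, dim)       # kv_receives_diff_views = False
    stacked = torch.randn(2, batch, seq_len, dim)    # kv_receives_diff_views = True

    # shape handling mirroring the hunk above
    _, b, n = stacked.shape[:3]
    assert (b, n) == (batch, seq_len)

    # ellipsis slicing truncates the sequence axis regardless of the extra leading dim
    n_rounded = (seq_len // chunk) * chunk
    assert single[..., :n_rounded, :].shape  == (batch, n_rounded, dim)
    assert stacked[..., :n_rounded, :].shape == (2, batch, n_rounded, dim)
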
@@ -512,7 +526,7 @@ class NeuralMemory(Module):
         round_down_seq_len = round_down_multiple(seq_len, chunk_size)
         num_chunks = round_down_seq_len // chunk_size

-        seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]
+        seq, remainder = seq[..., :round_down_seq_len, :], seq[..., round_down_seq_len:, :]

         next_seq_len_index = seq_index + round_down_seq_len

@@ -528,10 +542,19 @@ class NeuralMemory(Module):

         weights_for_surprise = repeat_dict_values(weights, 'b ... -> b n ...', n = num_chunks)

-        # derive learned hparams for optimization of memory network
+        # initial norm

         seq = self.store_norm(seq)

+        # handle keys and values coming from different sequences from hyper connection
+
+        values_seq = seq
+
+        if self.kv_receives_diff_views:
+            seq, values_seq = seq
+
+        # derive learned hparams for optimization of memory network
+
         adaptive_lr = self.to_adaptive_step(seq)
         adaptive_lr = self.adaptive_step_transform(adaptive_lr)

@@ -555,7 +578,8 @@ class NeuralMemory(Module):

         # keys and values

-        keys, values = self.to_keys_values(seq).chunk(2, dim = -1)
+        keys = self.to_keys(seq)
+        values = self.to_values(values_seq)

         # maybe multi head
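
Taken together with the previous hunk, the single-view path is unchanged (keys and values are projected from the same normalized sequence), while the two-view path unpacks the stack and projects keys and values from different tensors. A compressed sketch of just that control flow, with plain linear layers standing in for the package's `to_keys` / `to_values`:

    import torch
    from torch import nn

    dim, dim_inner = 32, 64
    to_keys   = nn.Linear(dim, dim_inner, bias = False)   # stand-in for self.to_keys
    to_values = nn.Linear(dim, dim_inner, bias = False)   # stand-in for self.to_values

    def project_kv(seq, kv_receives_diff_views):
        # values default to the same view as keys, mirroring the diff
        values_seq = seq
        if kv_receives_diff_views:
            seq, values_seq = seq    # unpack the (2, b, n, d) stack along dim 0
        return to_keys(seq), to_values(values_seq)

    one_view  = torch.randn(2, 16, dim)
    two_views = torch.stack((one_view, torch.randn(2, 16, dim)))

    k1, v1 = project_kv(one_view, kv_receives_diff_views = False)
    k2, v2 = project_kv(two_views, kv_receives_diff_views = True)
    assert k1.shape == v1.shape == k2.shape == v2.shape == (2, 16, dim_inner)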
 
@@ -915,6 +939,9 @@ class NeuralMemory(Module):
         last_update, _ = next_neural_mem_state.states
         updates = rearrange_dict_values(last_update, 'b ... -> b 1 ...')

+        if self.kv_receives_diff_views:
+            seq = seq[0]
+
         retrieved = self.retrieve_memories(
             seq,
             updates
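
On retrieval only one sequence is needed as the query source, so when two views were stacked for storage the first one is used. A one-line illustration (shapes arbitrary):

    import torch

    stacked = torch.randn(2, 4, 128, 64)   # (views, batch, seq, dim)
    query_source = stacked[0]              # retrieval queries come from the first view
    assert query_source.shape == (4, 128, 64)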