titans-pytorch 0.3.19__tar.gz → 0.3.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/PKG-INFO +1 -1
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/pyproject.toml +1 -1
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/tests/test_titans.py +3 -3
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/titans_pytorch/mac_transformer.py +5 -3
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/titans_pytorch/neural_memory.py +21 -13
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/.gitignore +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/LICENSE +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/README.md +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/data/README.md +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/data/enwik8.gz +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/fig1.png +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/fig2.png +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/titans_pytorch/memory_models.py +0 -0
- {titans_pytorch-0.3.19 → titans_pytorch-0.3.21}/train_mac.py +0 -0
tests/test_titans.py

@@ -200,7 +200,7 @@ def test_neural_mem_chaining_with_batch_size():
 @pytest.mark.parametrize('neural_mem_segment_len', (8, 16))
 @pytest.mark.parametrize('neural_mem_weight_residual', (False, True))
 @pytest.mark.parametrize('neural_mem_batch_size', (None, 64))
-@pytest.mark.parametrize('
+@pytest.mark.parametrize('neural_mem_qkv_receives_diff_views', (False, True))
 @pytest.mark.parametrize('neural_mem_momentum', (False, True))
 def test_mac(
     seq_len,
@@ -210,7 +210,7 @@ def test_mac(
     neural_mem_segment_len,
     neural_mem_weight_residual,
     neural_mem_batch_size,
-
+    neural_mem_qkv_receives_diff_views,
     neural_mem_momentum
 ):
     transformer = MemoryAsContextTransformer(
@@ -223,7 +223,7 @@ def test_mac(
         neural_mem_gate_attn_output = neural_mem_gate_attn_output,
         neural_memory_segment_len = neural_mem_segment_len,
         neural_memory_batch_size = neural_mem_batch_size,
-
+        neural_memory_qkv_receives_diff_views = neural_mem_qkv_receives_diff_views,
         neural_mem_weight_residual = neural_mem_weight_residual,
         neural_memory_kwargs = dict(
             momentum = neural_mem_momentum
titans_pytorch/mac_transformer.py

@@ -483,7 +483,7 @@ class MemoryAsContextTransformer(Module):
         num_longterm_mem_tokens = 0,
         num_persist_mem_tokens = 0,
         neural_memory_batch_size = None,
-
+        neural_memory_qkv_receives_diff_views = False,
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
@@ -523,6 +523,8 @@ class MemoryAsContextTransformer(Module):
 
         # hyper connection
 
+        assert not (num_residual_streams <= 1 and neural_memory_qkv_receives_diff_views), 'allowing neural memory queries, keys, values to be derived from different combinations of the residual streams can only work if hyper connections has greater than 1 residual stream'
+
         init_hyper_conn, self.expand_streams, self.reduce_streams = get_init_and_expand_reduce_stream_functions(num_residual_streams, dim = dim, add_stream_embed = True, disable = num_residual_streams == 1)
 
         self.layers = ModuleList([])
@@ -561,14 +563,14 @@ class MemoryAsContextTransformer(Module):
             mem_hyper_conn = None
 
             if layer in neural_memory_layers:
-                mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output, num_input_views =
+                mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output, num_input_views = 3 if neural_memory_qkv_receives_diff_views else 1)
 
                 mem = NeuralMemory(
                     dim = dim,
                     chunk_size = self.neural_memory_segment_len,
                     batch_size = neural_memory_batch_size,
                     model = deepcopy(neural_memory_model),
-
+                    qkv_receives_diff_views = neural_memory_qkv_receives_diff_views,
                     accept_weight_residual = neural_mem_weight_residual and not is_first_neural_mem,
                     **neural_memory_kwargs
                 )
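Taken together, these changes thread the new flag from the MemoryAsContextTransformer constructor down into each per-layer NeuralMemory, and require more than one hyper-connection residual stream when it is enabled. A minimal usage sketch follows; the num_tokens, dim, depth and segment_len arguments and the return_loss call come from the package README rather than this diff, and num_residual_streams is assumed to be exposed as a constructor argument.

    import torch
    from titans_pytorch import MemoryAsContextTransformer

    # sketch only: constructor arguments not shown in this diff are assumptions
    transformer = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 256,
        depth = 2,
        segment_len = 128,
        num_persist_mem_tokens = 4,
        num_longterm_mem_tokens = 4,
        num_residual_streams = 4,                      # must be > 1 per the new assertion
        neural_memory_qkv_receives_diff_views = True,  # new in 0.3.21
        neural_memory_kwargs = dict(
            momentum = True
        )
    )

    ids = torch.randint(0, 256, (1, 1023))
    loss = transformer(ids, return_loss = True)
    loss.backward()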
titans_pytorch/neural_memory.py

@@ -231,7 +231,7 @@ class NeuralMemory(Module):
         momentum_order = 1,
         learned_momentum_combine = False,
         learned_combine_include_zeroth = False,
-
+        qkv_receives_diff_views = False, # to address an issue raised by a phd student (who will be credited if experiments are green). basically the issue raised is that the memory MLP is only learning Wk @ Wv linear mapping and that may not be expressive enough. we will use hyper connections to allow the network to choose different previous layer inputs as keys / values and see if that does anything
         pre_rmsnorm = True,
         post_rmsnorm = False,
         qk_rmsnorm = False,
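The comment above is the motivation for this release: if the keys and values handed to the neural memory are both linear projections of the same input, then whenever the key projection is invertible the key-to-value association the memory has to learn is itself a single linear map, so a deeper memory MLP buys no extra expressivity for that association. A small self-contained illustration of the point, using hypothetical names that are not part of the package:

    import torch

    # hypothetical illustration, not package code
    d = 8
    x  = torch.randn(16, d, dtype = torch.float64)   # shared input to the memory branch
    Wk = torch.randn(d, d, dtype = torch.float64)    # key projection
    Wv = torch.randn(d, d, dtype = torch.float64)    # value projection

    k = x @ Wk   # keys fed to the memory
    v = x @ Wv   # values the memory is trained to associate with those keys

    # with Wk invertible, v is an exactly linear function of k
    M = torch.linalg.inv(Wk) @ Wv
    assert torch.allclose(k @ M, v)

With qkv_receives_diff_views = True, hyper connections instead feed the memory three different combinations of the residual streams (the num_input_views = 3 wiring above), so keys and values are no longer forced to be linear functions of one another.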
@@ -268,7 +268,7 @@ class NeuralMemory(Module):
 
         # key values receiving different views
 
-        self.
+        self.qkv_receives_diff_views = qkv_receives_diff_views
 
         # norms
 
@@ -511,7 +511,7 @@ class NeuralMemory(Module):
         seq_index = 0,
         prev_weights = None
     ):
-        if self.
+        if self.qkv_receives_diff_views:
             _, batch, seq_len = seq.shape[:3]
         else:
             batch, seq_len = seq.shape[:2]
@@ -550,7 +550,7 @@ class NeuralMemory(Module):
 
         values_seq = seq
 
-        if self.
+        if self.qkv_receives_diff_views:
             seq, values_seq = seq
 
         # derive learned hparams for optimization of memory network
@@ -820,10 +820,23 @@ class NeuralMemory(Module):
         state: NeuralMemState | None = None,
         prev_weights = None
     ):
-
-        seq = rearrange(seq, 'b d -> b 1 d')
+        is_multi_input = self.qkv_receives_diff_views
 
-
+        # handle single token
+
+        if seq.ndim == 2 or (is_multi_input and seq.ndim == 3):
+            seq = rearrange(seq, '... b d -> ... b 1 d')
+
+        is_single_token = seq.shape[-2] == 1
+
+        # if different views for qkv, then the first view is used for retrieval
+
+        if is_multi_input:
+            retrieve_seq, seq = seq[0], seq[1:]
+        else:
+            retrieve_seq = seq
+
+        # handle previous state init
 
         if not exists(state):
             state = (0, None, None, None, None)
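The shape handling above implies that a standalone NeuralMemory with qkv_receives_diff_views = True expects a leading view dimension of size 3, with the first view used for retrieval and the remaining two handed to store_memories as the key and value sources (the ordering is inferred from the seq[0] / seq[1:] split). A hedged sketch, assuming the (retrieved, next_state) return shown in the package README:

    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(
        dim = 384,
        chunk_size = 64,
        qkv_receives_diff_views = True   # new flag from this release
    )

    # three views of the same underlying sequence stacked on a leading dim:
    # view 0 -> retrieval (queries), view 1 -> keys, view 2 -> values
    views = torch.randn(3, 2, 1024, 384)

    retrieved, state = mem(views)

    # single-token step: (3, batch, dim) is expanded to (3, batch, 1, dim)
    # by the new ndim == 3 branch above
    step_views = torch.randn(3, 2, 384)
    retrieved_step, state = mem(step_views, state = state)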
@@ -839,8 +852,6 @@ class NeuralMemory(Module):
         if exists(cache_store_seq):
             store_seq = safe_cat((cache_store_seq, store_seq))
 
-        # functions
-
         # compute split sizes of sequence
         # for now manually update weights to last update at the correct boundaries
 
@@ -939,11 +950,8 @@ class NeuralMemory(Module):
         last_update, _ = next_neural_mem_state.states
         updates = rearrange_dict_values(last_update, 'b ... -> b 1 ...')
 
-        if self.kv_receives_diff_views:
-            seq = seq[0]
-
         retrieved = self.retrieve_memories(
-
+            retrieve_seq,
             updates
         )
 