titans-pytorch 0.1.11__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/PKG-INFO +49 -7
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/README.md +48 -6
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/pyproject.toml +1 -1
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/tests/test_titans.py +16 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/titans_pytorch/titans.py +65 -32
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/train_mac.py +2 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/.gitignore +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/LICENSE +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/data/README.md +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/data/enwik8.gz +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/fig1.png +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/fig2.png +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/titans_pytorch/mac_transformer.py +0 -0
{titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.11
+Version: 0.1.14
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -137,12 +137,13 @@ $ python train_mac.py
 ```
 
 ```bibtex
-@
-
-
-
-
-
+@article{Sun2024LearningT,
+    title = {Learning to (Learn at Test Time): RNNs with Expressive Hidden States},
+    author = {Yu Sun and Xinhao Li and Karan Dalal and Jiarui Xu and Arjun Vikram and Genghan Zhang and Yann Dubois and Xinlei Chen and Xiaolong Wang and Oluwasanmi Koyejo and Tatsunori Hashimoto and Carlos Guestrin},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2407.04620},
+    url = {https://api.semanticscholar.org/CorpusID:271039606}
 }
 ```
 
@@ -154,3 +155,44 @@ $ python train_mac.py
     url = {https://api.semanticscholar.org/CorpusID:274598177}
 }
 ```
+
+```bibtex
+@inproceedings{Nguyen2024TurningUT,
+    title = {Turning Up the Heat: Min-p Sampling for Creative and Coherent LLM Outputs},
+    author = {Minh Nguyen and Andrew Baker and Clement Neo and Allen Roush and Andreas Kirsch and Ravid Shwartz-Ziv},
+    year = {2024},
+    url = {https://api.semanticscholar.org/CorpusID:270870613}
+}
+```
+
+```bibtex
+@article{Zhu2024HyperConnections,
+    title = {Hyper-Connections},
+    author = {Defa Zhu and Hongzhi Huang and Zihao Huang and Yutao Zeng and Yunyao Mao and Banggu Wu and Qiyang Min and Xun Zhou},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2409.19606},
+    url = {https://api.semanticscholar.org/CorpusID:272987528}
+}
+```
+
+```bibtex
+@article{Zhou2024ValueRL,
+    title = {Value Residual Learning For Alleviating Attention Concentration In Transformers},
+    author = {Zhanchao Zhou and Tianyi Wu and Zhiyun Jiang and Zhenzhong Lan},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2410.17897},
+    url = {https://api.semanticscholar.org/CorpusID:273532030}
+}
+```
+
+```bibtex
+@software{Kyrylov_Accelerated_Scan_2024,
+    author = {Kyrylov, Volodymyr},
+    doi = {10.5281/zenodo.10600962},
+    title = {Accelerated Scan},
+    version = {0.1.2},
+    year = {2024}
+}
+```
{titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/README.md

@@ -83,12 +83,13 @@ $ python train_mac.py
 ```
 
 ```bibtex
-@
-
-
-
-
-
+@article{Sun2024LearningT,
+    title = {Learning to (Learn at Test Time): RNNs with Expressive Hidden States},
+    author = {Yu Sun and Xinhao Li and Karan Dalal and Jiarui Xu and Arjun Vikram and Genghan Zhang and Yann Dubois and Xinlei Chen and Xiaolong Wang and Oluwasanmi Koyejo and Tatsunori Hashimoto and Carlos Guestrin},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2407.04620},
+    url = {https://api.semanticscholar.org/CorpusID:271039606}
 }
 ```
 
@@ -100,3 +101,44 @@ $ python train_mac.py
     url = {https://api.semanticscholar.org/CorpusID:274598177}
 }
 ```
+
+```bibtex
+@inproceedings{Nguyen2024TurningUT,
+    title = {Turning Up the Heat: Min-p Sampling for Creative and Coherent LLM Outputs},
+    author = {Minh Nguyen and Andrew Baker and Clement Neo and Allen Roush and Andreas Kirsch and Ravid Shwartz-Ziv},
+    year = {2024},
+    url = {https://api.semanticscholar.org/CorpusID:270870613}
+}
+```
+
+```bibtex
+@article{Zhu2024HyperConnections,
+    title = {Hyper-Connections},
+    author = {Defa Zhu and Hongzhi Huang and Zihao Huang and Yutao Zeng and Yunyao Mao and Banggu Wu and Qiyang Min and Xun Zhou},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2409.19606},
+    url = {https://api.semanticscholar.org/CorpusID:272987528}
+}
+```
+
+```bibtex
+@article{Zhou2024ValueRL,
+    title = {Value Residual Learning For Alleviating Attention Concentration In Transformers},
+    author = {Zhanchao Zhou and Tianyi Wu and Zhiyun Jiang and Zhenzhong Lan},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2410.17897},
+    url = {https://api.semanticscholar.org/CorpusID:273532030}
+}
+```
+
+```bibtex
+@software{Kyrylov_Accelerated_Scan_2024,
+    author = {Kyrylov, Volodymyr},
+    doi = {10.5281/zenodo.10600962},
+    title = {Accelerated Scan},
+    version = {0.1.2},
+    year = {2024}
+}
+```
{titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/tests/test_titans.py

@@ -12,6 +12,7 @@ def exists(v):
 @pytest.mark.parametrize('silu', (False, True))
 @pytest.mark.parametrize('learned_mem_model_weights', (False, True))
 @pytest.mark.parametrize('attn_pool_chunks', (False, True))
+@pytest.mark.parametrize('momentum', (False, True))
 @pytest.mark.parametrize('max_grad_norm', (None, 2.))
 @pytest.mark.parametrize('per_parameter_lr_modulation', (False, True))
 def test_titans(
@@ -19,6 +20,7 @@ def test_titans(
     silu,
     learned_mem_model_weights,
     attn_pool_chunks,
+    momentum,
     max_grad_norm,
     per_parameter_lr_modulation
 ):
@@ -28,6 +30,7 @@ def test_titans(
         activation = nn.SiLU() if silu else None,
         attn_pool_chunks = attn_pool_chunks,
         max_grad_norm = max_grad_norm,
+        momentum = momentum,
         per_parameter_lr_modulation = per_parameter_lr_modulation,
         learned_mem_model_weights = learned_mem_model_weights
     )
@@ -66,6 +69,19 @@ def test_retrieve_store_diff_seq():
 
     assert retrieve_seq.shape == retrieved.shape
 
+def test_overriding_chunk_size():
+    mem = NeuralMemory(
+        dim = 384,
+        chunk_size = 64,
+    )
+
+    seq = torch.randn(2, 128 * 16, 384)
+    store_seq = torch.randn(2, 128 * 8, 384)
+
+    retrieved = mem(seq, store_seq, chunk_size = 16, store_chunk_size = 8)
+
+    assert seq.shape == retrieved.shape
+
 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('num_persist_mem_tokens', (0, 16))
 @pytest.mark.parametrize('num_longterm_mem_tokens', (0, 16))
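The `test_overriding_chunk_size` test above exercises the per-call chunk size override added in these releases: `chunk_size` and `store_chunk_size` can now be passed at forward time instead of being fixed at construction. A minimal usage sketch, assuming `NeuralMemory` is imported from `titans_pytorch` as in the test suite and using illustrative tensor shapes:

```python
# Sketch of the per-call chunk size override shown in test_overriding_chunk_size.
# Shapes are illustrative; sequence lengths are chosen to be divisible by the
# chunk sizes passed at call time.
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,  # default chunk size, used when no override is passed
)

seq = torch.randn(2, 128 * 16, 384)       # sequence to retrieve against
store_seq = torch.randn(2, 128 * 8, 384)  # sequence whose memories are stored

# override retrieval and storage chunk sizes for this forward call only
retrieved = mem(seq, store_seq, chunk_size = 16, store_chunk_size = 8)

assert retrieved.shape == seq.shape
```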
{titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/titans_pytorch/titans.py

@@ -99,7 +99,23 @@ class MultiheadRMSNorm(Module):
     def forward(self, x):
         return self.rmsnorm(x) * (self.gamma + 1.)
 
-#
+# chunk pooling
+
+class AveragePool(Module):
+    def __init__(
+        self,
+        chunk_size
+    ):
+        super().__init__()
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        x,
+        chunk_size = None
+    ):
+        chunk_size = default(chunk_size, self.chunk_size)
+        return reduce(x, 'b (n c) d -> b n d', 'mean', c = chunk_size)
 
 class AttentionPool(Module):
     def __init__(
@@ -111,7 +127,7 @@ class AttentionPool(Module):
         taken from Enformer https://www.nature.com/articles/s41592-021-01252-x , in turn taken from somewhere else
         """
         super().__init__()
-        self.
+        self.chunk_size = chunk_size
         self.to_attn_logits = nn.Linear(dim, dim)
 
         # default to average pool
@@ -121,9 +137,13 @@ class AttentionPool(Module):
 
     def forward(
         self,
-        x
+        x,
+        chunk_size = None
     ):
-
+        chunk_size = default(chunk_size, self.chunk_size)
+
+        x = rearrange(x, 'b (n c) d -> b n c d', c = chunk_size)
+
         attn_logits = self.to_attn_logits(x)
 
         attn = attn_logits.softmax(dim = -2)
@@ -303,6 +323,7 @@ class NeuralMemory(Module):
         per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
         max_mem_layer_modulation = 1e1, # max of 10.
         attn_pool_chunks = False,
+        momentum = True,
         pre_rmsnorm = True,
         post_rmsnorm = True,
         learned_mem_model_weights = True,
@@ -394,17 +415,16 @@ class NeuralMemory(Module):
         assert not (attn_pool_chunks and chunk_size == 1), '`attn_pool_chunks` cannot be set to True if `chunk_size` is set to 1'
 
         if not attn_pool_chunks:
-
+            self.reduce_to_chunk_rep = AveragePool(chunk_size = chunk_size)
         else:
-
+            self.reduce_to_chunk_rep = AttentionPool(dim, chunk_size = chunk_size)
 
         # learned adaptive learning rate and momentum
 
         self.to_momentum = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads),
             Rearrange('b n h -> (b h) n 1')
-        )
+        ) if momentum else None
 
         self.to_adaptive_step = Sequential(
             LinearNoBias(dim, heads),
@@ -419,7 +439,6 @@ class NeuralMemory(Module):
         # per layer learning rate modulation
 
         self.to_layer_modulation = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads * self.num_memory_parameter_tensors),
             Rearrange('b n (h w) -> w (b h) n', h = heads),
             nn.Sigmoid()
@@ -434,7 +453,6 @@ class NeuralMemory(Module):
         # weight decay factor
 
         self.to_decay_factor = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads),
             Rearrange('b n h -> (b h) n 1')
         )
@@ -445,12 +463,15 @@ class NeuralMemory(Module):
 
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
 
-    def init_weights_and_momentum(self):
+    def init_weights_and_momentum(self, zero_weights = False):
         params = TensorDict(dict(self.memory_model.named_parameters()))
 
-        init_weights = params
+        init_weights = params
         init_momentum = params.clone().zero_()
 
+        if zero_weights:
+            init_weights = params.clone().zero_()
+
         return init_weights, init_momentum
 
     def init_empty_memory_embed(self, batch, seq_len):
@@ -460,9 +481,10 @@ class NeuralMemory(Module):
         self,
         seq,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]],
-        return_aux_kv_loss = False
+        return_aux_kv_loss = False,
+        chunk_size = None
     ):
-        seq_len, chunk_size = seq.shape[-2], self.store_chunk_size
+        seq_len, chunk_size = seq.shape[-2], default(chunk_size, self.store_chunk_size)
 
         # handle edge case
 
@@ -479,27 +501,28 @@ class NeuralMemory(Module):
 
         seq = seq[:, :round_down_seq_len]
 
-        #
-
-        curr_weights = TensorDict(dict(self.memory_model.named_parameters()))
+        # get the weights of the memory network
 
         past_state = tuple(TensorDict(d) for d in past_state)
-
-
-        curr_weights = curr_weights + past_weights
+        curr_weights, past_momentum = past_state
 
-        #
+        # derive learned hparams for optimization of memory network
 
         adaptive_lr = self.to_adaptive_step(seq)
         adaptive_lr = self.adaptive_step_transform(adaptive_lr)
 
-
-
+        chunked_seq = self.reduce_to_chunk_rep(seq, chunk_size = chunk_size)
+
+        decay_factor = self.to_decay_factor(chunked_seq).sigmoid()
 
         need_layer_lr_mod = exists(self.to_layer_modulation)
+        has_momentum = exists(self.to_momentum)
+
+        if has_momentum:
+            adaptive_momentum = self.to_momentum(chunked_seq).sigmoid()
 
         if need_layer_lr_mod:
-            layer_lr_mod = self.to_layer_modulation(
+            layer_lr_mod = self.to_layer_modulation(chunked_seq) * self.max_mem_layer_modulation
 
         # keys and values
 
@@ -575,23 +598,29 @@ class NeuralMemory(Module):
 
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
 
-        next_momentum = TensorDict()
+        next_momentum = TensorDict() if has_momentum else None
         updates = TensorDict()
 
         for param_name, surprise in surprises.items():
 
             surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')
 
+            update = surprise
+
             # derive momentum with associative scan - eq (10)
 
-
+            if has_momentum:
+                update = scan_fn(adaptive_momentum, surprise) # momentum is S / surprise in the paper
+                momentum = update
 
             # use associative scan again for learned forgetting (weight decay) - eq (13)
 
-            update = scan_fn(1. - decay_factor,
+            update = scan_fn(1. - decay_factor, update)
 
             updates[param_name] = inverse_pack(update)
-
+
+            if has_momentum:
+                next_momentum[param_name] = inverse_pack(momentum)
 
             # compute the next weight per batch
 
@@ -606,8 +635,9 @@ class NeuralMemory(Module):
         self,
         seq,
         past_weights: dict[str, Tensor] | None = None,
+        chunk_size = None
     ):
-        chunk_size = self.retrieve_chunk_size
+        chunk_size = default(chunk_size, self.retrieve_chunk_size)
         batch, seq_len = seq.shape[:2]
 
         seq = self.retrieve_norm(seq)
@@ -680,7 +710,9 @@ class NeuralMemory(Module):
         seq,
         store_seq = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        return_aux_kv_loss = False
+        return_aux_kv_loss = False,
+        chunk_size = None,
+        store_chunk_size = None
     ):
         batch, seq_len = seq.shape[:2]
 
@@ -699,12 +731,13 @@ class NeuralMemory(Module):
             past_state = self.init_weights_and_momentum()
 
         store_seq = default(store_seq, seq)
+        store_chunk_size = default(store_chunk_size, chunk_size)
 
-        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, return_aux_kv_loss = True)
+        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, chunk_size = store_chunk_size, return_aux_kv_loss = True)
 
         past_weights, _ = past_state
 
-        retrieved = self.retrieve_memories(seq, past_weights + updates)
+        retrieved = self.retrieve_memories(seq, past_weights + updates, chunk_size = chunk_size)
 
         if not return_aux_kv_loss:
             return retrieved
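The titans.py changes above make the momentum path optional: `to_momentum` is only built when `momentum = True`, and `store_memories` applies the eq. (10) momentum scan only when it exists, otherwise the surprise feeds directly into the weight decay scan of eq. (13). A rough sketch of toggling the new flag, assuming the same constructor arguments used in the updated tests:

```python
# Sketch: constructing NeuralMemory with the new `momentum` flag.
# With momentum = False, the learned momentum projection (to_momentum) is never
# created, so the surprise update skips the eq. (10) scan and only passes
# through the learned weight decay scan.
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    momentum = False,  # disable the momentum contribution
)

seq = torch.randn(2, 1024, 384)  # illustrative shape, divisible by chunk_size
retrieved = mem(seq)             # store_seq defaults to seq

assert retrieved.shape == seq.shape
```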
{titans_pytorch-0.1.11 → titans_pytorch-0.1.14}/train_mac.py

@@ -31,6 +31,7 @@ NUM_PERSIST_MEM = 4
 NUM_LONGTERM_MEM = 4
 NEURAL_MEM_LAYERS = (2, 4)
 NEURAL_MEM_GATE_ATTN_OUTPUT = True
+NEURAL_MEM_MOMENTUM = True
 WINDOW_SIZE = 32
 NEURAL_MEM_SEGMENT_LEN = WINDOW_SIZE // 2 # set smaller for more granularity for learning rate / momentum etc
 SLIDING_WINDOWS = True
@@ -88,6 +89,7 @@ model = MemoryAsContextTransformer(
         dim_head = 64,
         heads = 4,
         attn_pool_chunks = STORE_ATTN_POOL_CHUNKS,
+        momentum = NEURAL_MEM_MOMENTUM,
         use_accelerated_scan = USE_ACCELERATED_SCAN,
         learned_mem_model_weights = LEARNED_MEM_MODEL_WEIGHTS,
         default_model_kwargs = dict(