titans-pytorch 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
titans_pytorch/titans.py CHANGED
@@ -99,7 +99,23 @@ class MultiheadRMSNorm(Module):
     def forward(self, x):
         return self.rmsnorm(x) * (self.gamma + 1.)
 
-# attention pool
+# chunk pooling
+
+class AveragePool(Module):
+    def __init__(
+        self,
+        chunk_size
+    ):
+        super().__init__()
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        x,
+        chunk_size = None
+    ):
+        chunk_size = default(chunk_size, self.chunk_size)
+        return reduce(x, 'b (n c) d -> b n d', 'mean', c = chunk_size)
 
 class AttentionPool(Module):
     def __init__(
@@ -111,7 +127,7 @@ class AttentionPool(Module):
         taken from Enformer https://www.nature.com/articles/s41592-021-01252-x , in turn taken from somewhere else
         """
         super().__init__()
-        self.split_chunks = Rearrange('b (n c) d -> b n c d', c = chunk_size)
+        self.chunk_size = chunk_size
         self.to_attn_logits = nn.Linear(dim, dim)
 
         # default to average pool
@@ -121,9 +137,13 @@ class AttentionPool(Module):
 
     def forward(
         self,
-        x
+        x,
+        chunk_size = None
    ):
-        x = self.split_chunks(x)
+        chunk_size = default(chunk_size, self.chunk_size)
+
+        x = rearrange(x, 'b (n c) d -> b n c d', c = chunk_size)
+
         attn_logits = self.to_attn_logits(x)
 
         attn = attn_logits.softmax(dim = -2)
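
For orientation, here is a minimal, self-contained sketch (not part of the package) of what the chunk pooling introduced above does: the new `AveragePool` collapses every chunk of `chunk_size` tokens to its mean vector, exactly as the `reduce` pattern in its `forward` describes. The tensor shapes and chunk size below are illustrative.

```python
# illustrative sketch only -- mirrors the einops reduce used by the new AveragePool
import torch
from einops import reduce

x = torch.randn(2, 64, 128)   # (batch, seq, dim)
chunk_size = 16               # hypothetical chunk size; seq length must be divisible by it

# mean-pool every chunk of 16 tokens down to a single representation per chunk
pooled = reduce(x, 'b (n c) d -> b n d', 'mean', c = chunk_size)
assert pooled.shape == (2, 4, 128)
```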
@@ -394,14 +414,13 @@ class NeuralMemory(Module):
         assert not (attn_pool_chunks and chunk_size == 1), '`attn_pool_chunks` cannot be set to True if `chunk_size` is set to 1'
 
         if not attn_pool_chunks:
-            chunk_reduce_module = Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size)
+            self.reduce_to_chunk_rep = AveragePool(chunk_size = chunk_size)
         else:
-            chunk_reduce_module = AttentionPool(dim, chunk_size = chunk_size)
+            self.reduce_to_chunk_rep = AttentionPool(dim, chunk_size = chunk_size)
 
         # learned adaptive learning rate and momentum
 
         self.to_momentum = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads),
             Rearrange('b n h -> (b h) n 1')
         )
@@ -419,7 +438,6 @@ class NeuralMemory(Module):
         # per layer learning rate modulation
 
         self.to_layer_modulation = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads * self.num_memory_parameter_tensors),
             Rearrange('b n (h w) -> w (b h) n', h = heads),
             nn.Sigmoid()
@@ -434,7 +452,6 @@ class NeuralMemory(Module):
         # weight decay factor
 
         self.to_decay_factor = Sequential(
-            chunk_reduce_module,
             LinearNoBias(dim, heads),
             Rearrange('b n h -> (b h) n 1')
         )
@@ -460,9 +477,10 @@ class NeuralMemory(Module):
         self,
         seq,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]],
-        return_aux_kv_loss = False
+        return_aux_kv_loss = False,
+        chunk_size = None
     ):
-        seq_len, chunk_size = seq.shape[-2], self.store_chunk_size
+        seq_len, chunk_size = seq.shape[-2], default(chunk_size, self.store_chunk_size)
 
         # handle edge case
 
@@ -488,18 +506,20 @@ class NeuralMemory(Module):
 
         curr_weights = curr_weights + past_weights
 
-        # pack batch and sequence dimension
+        # derive learned hparams for optimization of memory network
 
         adaptive_lr = self.to_adaptive_step(seq)
         adaptive_lr = self.adaptive_step_transform(adaptive_lr)
 
-        adaptive_momentum = self.to_momentum(seq).sigmoid()
-        decay_factor = self.to_decay_factor(seq).sigmoid()
+        chunked_seq = self.reduce_to_chunk_rep(seq, chunk_size = chunk_size)
+
+        adaptive_momentum = self.to_momentum(chunked_seq).sigmoid()
+        decay_factor = self.to_decay_factor(chunked_seq).sigmoid()
 
         need_layer_lr_mod = exists(self.to_layer_modulation)
 
         if need_layer_lr_mod:
-            layer_lr_mod = self.to_layer_modulation(seq) * self.max_mem_layer_modulation
+            layer_lr_mod = self.to_layer_modulation(chunked_seq) * self.max_mem_layer_modulation
 
         # keys and values
 
@@ -606,8 +626,9 @@ class NeuralMemory(Module):
         self,
         seq,
         past_weights: dict[str, Tensor] | None = None,
+        chunk_size = None
     ):
-        chunk_size = self.retrieve_chunk_size
+        chunk_size = default(chunk_size, self.retrieve_chunk_size)
         batch, seq_len = seq.shape[:2]
 
         seq = self.retrieve_norm(seq)
@@ -680,7 +701,9 @@ class NeuralMemory(Module):
         seq,
         store_seq = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        return_aux_kv_loss = False
+        return_aux_kv_loss = False,
+        chunk_size = None,
+        store_chunk_size = None
     ):
         batch, seq_len = seq.shape[:2]
 
@@ -699,12 +722,13 @@ class NeuralMemory(Module):
             past_state = self.init_weights_and_momentum()
 
         store_seq = default(store_seq, seq)
+        store_chunk_size = default(store_chunk_size, chunk_size)
 
-        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, return_aux_kv_loss = True)
+        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, chunk_size = store_chunk_size, return_aux_kv_loss = True)
 
         past_weights, _ = past_state
 
-        retrieved = self.retrieve_memories(seq, past_weights + updates)
+        retrieved = self.retrieve_memories(seq, past_weights + updates, chunk_size = chunk_size)
 
         if not return_aux_kv_loss:
             return retrieved
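
Taken together, the titans.py changes expose the chunk size as a per-call override instead of a value fixed at construction: `store_memories`, `retrieve_memories`, and `forward` all accept an optional `chunk_size` (and `forward` additionally a `store_chunk_size`), falling back to the sizes set at init. A hedged usage sketch follows; the constructor arguments mirror the project README's example, the override values are illustrative, and it assumes the sequence length is divisible by the chosen chunk sizes and that storage and retrieval use the same chunking.

```python
import torch
from titans_pytorch import NeuralMemory

# illustrative hyperparameters, following the README example
mem = NeuralMemory(dim = 384, chunk_size = 64)

seq = torch.randn(2, 1024, 384)

# as of this version, the chunk sizes can be overridden per forward call;
# omitting the keywords keeps the behavior of the values set at construction
retrieved = mem(seq, chunk_size = 32, store_chunk_size = 32)
```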
{titans_pytorch-0.1.10.dist-info → titans_pytorch-0.1.12.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.10
+Version: 0.1.12
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -35,7 +35,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
-Requires-Dist: axial-positional-embedding>=0.3.6
+Requires-Dist: axial-positional-embedding>=0.3.7
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.8
@@ -137,12 +137,13 @@ $ python train_mac.py
 ```
 
 ```bibtex
-@software{Kyrylov_Accelerated_Scan_2024,
-    author = {Kyrylov, Volodymyr},
-    doi = {10.5281/zenodo.10600962},
-    title = {Accelerated Scan},
-    version = {0.1.2},
-    year = {2024}
+@article{Sun2024LearningT,
+    title = {Learning to (Learn at Test Time): RNNs with Expressive Hidden States},
+    author = {Yu Sun and Xinhao Li and Karan Dalal and Jiarui Xu and Arjun Vikram and Genghan Zhang and Yann Dubois and Xinlei Chen and Xiaolong Wang and Oluwasanmi Koyejo and Tatsunori Hashimoto and Carlos Guestrin},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2407.04620},
+    url = {https://api.semanticscholar.org/CorpusID:271039606}
 }
 ```
 
@@ -154,3 +155,44 @@ $ python train_mac.py
     url = {https://api.semanticscholar.org/CorpusID:274598177}
 }
 ```
+
+```bibtex
+@inproceedings{Nguyen2024TurningUT,
+    title = {Turning Up the Heat: Min-p Sampling for Creative and Coherent LLM Outputs},
+    author = {Minh Nguyen and Andrew Baker and Clement Neo and Allen Roush and Andreas Kirsch and Ravid Shwartz-Ziv},
+    year = {2024},
+    url = {https://api.semanticscholar.org/CorpusID:270870613}
+}
+```
+
+```bibtex
+@article{Zhu2024HyperConnections,
+    title = {Hyper-Connections},
+    author = {Defa Zhu and Hongzhi Huang and Zihao Huang and Yutao Zeng and Yunyao Mao and Banggu Wu and Qiyang Min and Xun Zhou},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2409.19606},
+    url = {https://api.semanticscholar.org/CorpusID:272987528}
+}
+```
+
+```bibtex
+@article{Zhou2024ValueRL,
+    title = {Value Residual Learning For Alleviating Attention Concentration In Transformers},
+    author = {Zhanchao Zhou and Tianyi Wu and Zhiyun Jiang and Zhenzhong Lan},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2410.17897},
+    url = {https://api.semanticscholar.org/CorpusID:273532030}
+}
+```
+
+```bibtex
+@software{Kyrylov_Accelerated_Scan_2024,
+    author = {Kyrylov, Volodymyr},
+    doi = {10.5281/zenodo.10600962},
+    title = {Accelerated Scan},
+    version = {0.1.2},
+    year = {2024}
+}
+```
titans_pytorch-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=zxknstaI_Uz47Y8WvZ3S7geJ-TNdqKV5Rvj0Jlw8njs,19271
+titans_pytorch/titans.py,sha256=eDTqAIDZjSLd34t8M-dCaqVf_s0wZ9jhVIOfXF7E9ts,21887
+titans_pytorch-0.1.12.dist-info/METADATA,sha256=dL8HpHt6V5gN8p8px7sc2IgJGqXthE7rULKIrRFCwF8,6340
+titans_pytorch-0.1.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.1.12.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.1.12.dist-info/RECORD,,
titans_pytorch-0.1.10.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=zxknstaI_Uz47Y8WvZ3S7geJ-TNdqKV5Rvj0Jlw8njs,19271
-titans_pytorch/titans.py,sha256=gZvYk1j6aBMp0uE6l1a2GH_4ea9W2uXKytJb3CDPTlk,21162
-titans_pytorch-0.1.10.dist-info/METADATA,sha256=o2D4Zau9GLBZmsj2qzq7agWckPnBJhDtIeTj2cMgy7Q,4769
-titans_pytorch-0.1.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.1.10.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.1.10.dist-info/RECORD,,