PyPI - titans-pytorch - Versions diffs - 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl - Mend

titans-pytorch 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

titans_pytorch/neural_memory.py CHANGED Viewed

@@ -152,6 +152,30 @@ def softclamp_grad_norm(t, max_value):
     t = t * (clamped_norm / norm)
     return inverse(t)
+# spectral norming the surprise update w/ newton schulz matrix iter
+# Keller Jordan et al. from OSS w/ nanogpt, now being used for two works, Atlas and 'TTT done right'
+def newtonschulz5(
+    t,
+    steps = 5,
+    eps = 1e-7,
+    coefs = (3.4445, -4.7750, 2.0315)
+):
+    if t.ndim <= 3:
+        return t
+    t, inv_pack = pack_one_with_inverse(t, '* i j')
+    t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)
+    a, b, c = coefs
+    for _ in range(steps):
+        A = t @ t.transpose(-1, -2)
+        B = b * A + c * A @ A
+        t = a * t + B @ t
+    return inv_pack(t)
 # multi head rmsnorm
 class MultiheadRMSNorm(Module):
@@ -254,6 +278,7 @@ class NeuralMemory(Module):
         init_momentum_bias = None,
         init_decay_bias = None,
         accept_weight_residual = False,
+        spectral_norm_surprises = False,
         gated_transition = False,
         mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
         default_model_kwargs: dict = dict(
@@ -465,6 +490,10 @@ class NeuralMemory(Module):
         self.max_grad_norm = max_grad_norm
+        # spectral norming the surprises before update, a la Muon from Jordan et al.
+        self.spectral_norm_surprises = spectral_norm_surprises
         # weight decay factor
         self.to_decay_factor = Sequential(
@@ -748,6 +777,11 @@ class NeuralMemory(Module):
                 else:
                     update = einsum(combine_momentums, momentums, 'o b n, o b n ... -> b n ...')
+            # maybe spectral norm surprises
+            if self.spectral_norm_surprises:
+                update = newtonschulz5(update)
             # use associative scan again for learned forgetting (weight decay) - eq (13)
             update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)

{titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.4.7
+Version: 0.4.8
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -207,3 +207,31 @@ $ python train_mac.py
     url     = {https://arxiv.org/abs/2501.12352},
 }
 ```
+```bibtex
+@misc{jordan2024muon,
+    author  = {Keller Jordan and Yuchen Jin and Vlado Boza and Jiacheng You and
+                    Franz Cesista and Laker Newhouse and Jeremy Bernstein},
+    title   = {Muon: An optimizer for hidden layers in neural networks},
+    year    = {2024},
+    url     = {https://kellerjordan.github.io/posts/muon/}
+}
+```
+```bibtex
+@inproceedings{Zhang2025TestTimeTD,
+    title   = {Test-Time Training Done Right},
+    author  = {Tianyuan Zhang and Sai Bi and Yicong Hong and Kai Zhang and Fujun Luan and Songlin Yang and Kalyan Sunkavalli and William T. Freeman and Hao Tan},
+    year    = {2025},
+    url     = {https://api.semanticscholar.org/CorpusID:279071244}
+}
+```
+```bibtex
+@inproceedings{Behrouz2025ATLASLT,
+    title  = {ATLAS: Learning to Optimally Memorize the Context at Test Time},
+    author = {Ali Behrouz and Ze-Minghui Li and Praneeth Kacham and Majid Daliri and Yuan Deng and Peilin Zhong and Meisam Razaviyayn and Vahab S. Mirrokni},
+    year   = {2025},
+    url    = {https://api.semanticscholar.org/CorpusID:278996373}
+}
+```

titans_pytorch-0.4.8.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
+titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
+titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
+titans_pytorch/neural_memory.py,sha256=ypWXN8koY8pXt7IvlcMR1QM7cYJnWK_iYLEHy2pjx88,34277
+titans_pytorch-0.4.8.dist-info/METADATA,sha256=BbhF0oiPGdcgxrBGJziZvbvXUmS5lVlGVxpUGPmP0O8,7873
+titans_pytorch-0.4.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.4.8.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.4.8.dist-info/RECORD,,

titans_pytorch-0.4.7.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
-titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
-titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
-titans_pytorch/neural_memory.py,sha256=EhHptv-9q3PUTJwX9kKAdYMfWueM-JB_kZ3SmRoAdjM,33356
-titans_pytorch-0.4.7.dist-info/METADATA,sha256=MP0qHzoAM0AZuWg0gL2VOnmpx9HXdHwo5xx2CL0ugso,6797
-titans_pytorch-0.4.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.4.7.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.4.7.dist-info/RECORD,,

{titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.8.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

titans-pytorch 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

titans-pytorch 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl