titans_pytorch-0.4.7-py3-none-any.whl → titans_pytorch-0.4.9-py3-none-any.whl

--- a/titans_pytorch/neural_memory.py
+++ b/titans_pytorch/neural_memory.py
@@ -152,6 +152,39 @@ def softclamp_grad_norm(t, max_value):
     t = t * (clamped_norm / norm)
     return inverse(t)
 
+# spectral norming the surprise update w/ newton schulz matrix iter
+# Keller Jordan et al. from OSS w/ nanogpt, now being used for two works, Atlas and 'TTT done right'
+
+def newtonschulz5(
+    t,
+    steps = 5,
+    eps = 1e-7,
+    coefs = (3.4445, -4.7750, 2.0315)
+):
+    if t.ndim <= 3:
+        return t
+
+    shape = t.shape
+    should_transpose = shape[-2] > shape[-1]
+
+    if should_transpose:
+        t = t.transpose(-1, -2)
+
+    t, inv_pack = pack_one_with_inverse(t, '* i j')
+    t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)
+
+    a, b, c = coefs
+
+    for _ in range(steps):
+        A = t @ t.transpose(-1, -2)
+        B = b * A + c * A @ A
+        t = a * t + B @ t
+
+    if should_transpose:
+        t = t.transpose(-1, -2)
+
+    return inv_pack(t)
+
 # multi head rmsnorm
 
 class MultiheadRMSNorm(Module):
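
For readers skimming the diff: `newtonschulz5` pushes each matrix toward the nearest (semi-)orthogonal matrix, the same quintic Newton-Schulz iteration Muon uses to orthogonalize its weight updates. Below is a minimal, self-contained sketch of the core iteration on a single matrix, dropping the `pack_one_with_inverse` einops helper; the name `demo_orthogonalize` and the test shapes are illustrative, not part of the package:

```python
import torch

# a minimal sketch of the quintic Newton-Schulz orthogonalization above,
# for a single (m, n) matrix; coefficients match the diff, a la Muon

def demo_orthogonalize(t, steps = 5, eps = 1e-7, coefs = (3.4445, -4.7750, 2.0315)):
    a, b, c = coefs

    transpose = t.shape[-2] > t.shape[-1]   # iterate in the wide orientation, so the Gram matrix is the smaller one
    if transpose:
        t = t.transpose(-1, -2)

    t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)  # bring spectral norm to <= 1

    for _ in range(steps):
        A = t @ t.transpose(-1, -2)
        B = b * A + c * A @ A
        t = a * t + B @ t

    if transpose:
        t = t.transpose(-1, -2)
    return t

t = torch.randn(64, 32)
out = demo_orthogonalize(t)
print(torch.linalg.svdvals(out)[:5])  # singular values should all be close to 1
```

With these coefficients the singular values do not converge exactly to 1 but oscillate near it; that is good enough for the optimizer-style use here, and it buys a much faster iteration than an exact polar decomposition.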
@@ -254,6 +287,7 @@ class NeuralMemory(Module):
         init_momentum_bias = None,
         init_decay_bias = None,
         accept_weight_residual = False,
+        spectral_norm_surprises = False,
         gated_transition = False,
         mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
         default_model_kwargs: dict = dict(
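
With the diff applied, the new flag would be passed at construction. A hedged usage sketch, following the shape of the README's `NeuralMemory` example (the dims and sequence length are illustrative):

```python
import torch
from titans_pytorch import NeuralMemory

# illustrative only: enable the flag introduced in this diff;
# the other kwargs mirror the project's README example
mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    spectral_norm_surprises = True   # new in 0.4.9: newton-schulz the surprise updates
)

seq = torch.randn(2, 1024, 384)
retrieved, mem_state = mem(seq)
assert retrieved.shape == seq.shape
```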
@@ -465,6 +499,10 @@ class NeuralMemory(Module):
 
         self.max_grad_norm = max_grad_norm
 
+        # spectral norming the surprises before update, a la Muon from Jordan et al.
+
+        self.spectral_norm_surprises = spectral_norm_surprises
+
         # weight decay factor
 
         self.to_decay_factor = Sequential(
@@ -748,6 +786,11 @@ class NeuralMemory(Module):
         else:
             update = einsum(combine_momentums, momentums, 'o b n, o b n ... -> b n ...')
 
+        # maybe spectral norm surprises
+
+        if self.spectral_norm_surprises:
+            update = newtonschulz5(update)
+
         # use associative scan again for learned forgetting (weight decay) - eq (13)
 
         update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)
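
To make the ordering explicit: the Newton-Schulz step is applied to the momentum-combined update before the learned forgetting (weight decay) scan. A simplified, self-contained sketch of that ordering, where `ns5` is an illustrative stand-in for `newtonschulz5` and a plain Python loop stands in for `assoc_scan`:

```python
import torch

def ns5(t, steps = 5, eps = 1e-7, coefs = (3.4445, -4.7750, 2.0315)):
    # same quintic newton-schulz iteration as the diff, for one matrix
    a, b, c = coefs
    t = t / t.norm().clamp(min = eps)
    for _ in range(steps):
        A = t @ t.T
        t = a * t + (b * A + c * A @ A) @ t
    return t

# hypothetical shapes: per-chunk surprise updates for a single weight matrix
updates = torch.randn(4, 64, 32)   # (num chunks, dim out, dim in)
decay = torch.rand(4)              # learned forgetting gate per chunk

# maybe spectral norm surprises, before the decay scan
updates = torch.stack([ns5(u) for u in updates])

# plain loop standing in for the associative scan: W_t = (1 - decay_t) * W_{t-1} + update_t
weight = torch.zeros(64, 32)
for u, d in zip(updates, decay):
    weight = (1. - d) * weight + u
```

Note that `newtonschulz5` returns its input unchanged for `ndim <= 3`, so only matrix-shaped weight updates get orthogonalized; bias-like updates pass through untouched.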
--- a/titans_pytorch-0.4.7.dist-info/METADATA
+++ b/titans_pytorch-0.4.9.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.4.7
+Version: 0.4.9
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -207,3 +207,31 @@ $ python train_mac.py
     url = {https://arxiv.org/abs/2501.12352},
 }
 ```
+
+```bibtex
+@misc{jordan2024muon,
+    author = {Keller Jordan and Yuchen Jin and Vlado Boza and Jiacheng You and
+              Franz Cesista and Laker Newhouse and Jeremy Bernstein},
+    title  = {Muon: An optimizer for hidden layers in neural networks},
+    year   = {2024},
+    url    = {https://kellerjordan.github.io/posts/muon/}
+}
+```
+
+```bibtex
+@inproceedings{Zhang2025TestTimeTD,
+    title  = {Test-Time Training Done Right},
+    author = {Tianyuan Zhang and Sai Bi and Yicong Hong and Kai Zhang and Fujun Luan and Songlin Yang and Kalyan Sunkavalli and William T. Freeman and Hao Tan},
+    year   = {2025},
+    url    = {https://api.semanticscholar.org/CorpusID:279071244}
+}
+```
+
+```bibtex
+@inproceedings{Behrouz2025ATLASLT,
+    title  = {ATLAS: Learning to Optimally Memorize the Context at Test Time},
+    author = {Ali Behrouz and Ze-Minghui Li and Praneeth Kacham and Majid Daliri and Yuan Deng and Peilin Zhong and Meisam Razaviyayn and Vahab S. Mirrokni},
+    year   = {2025},
+    url    = {https://api.semanticscholar.org/CorpusID:278996373}
+}
+```
--- /dev/null
+++ b/titans_pytorch-0.4.9.dist-info/RECORD
@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
+titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
+titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
+titans_pytorch/neural_memory.py,sha256=JCK9t0dAYB6estqw9rrWENkI6qpsKF9QQf_MwXnWuJ0,34458
+titans_pytorch-0.4.9.dist-info/METADATA,sha256=ony2yYgXUfdwP6QyM9o3BFSbdmH0HvSwA-BuClBacpQ,7873
+titans_pytorch-0.4.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.4.9.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.4.9.dist-info/RECORD,,
--- a/titans_pytorch-0.4.7.dist-info/RECORD
+++ /dev/null
@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
-titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
-titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
-titans_pytorch/neural_memory.py,sha256=EhHptv-9q3PUTJwX9kKAdYMfWueM-JB_kZ3SmRoAdjM,33356
-titans_pytorch-0.4.7.dist-info/METADATA,sha256=MP0qHzoAM0AZuWg0gL2VOnmpx9HXdHwo5xx2CL0ugso,6797
-titans_pytorch-0.4.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.4.7.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.4.7.dist-info/RECORD,,