titans-pytorch 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- titans_pytorch/neural_memory.py +43 -0
- {titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.9.dist-info}/METADATA +29 -1
- titans_pytorch-0.4.9.dist-info/RECORD +8 -0
- titans_pytorch-0.4.7.dist-info/RECORD +0 -8
- {titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.9.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.4.7.dist-info → titans_pytorch-0.4.9.dist-info}/licenses/LICENSE +0 -0
titans_pytorch/neural_memory.py
CHANGED
@@ -152,6 +152,39 @@ def softclamp_grad_norm(t, max_value):
|
|
152
152
|
t = t * (clamped_norm / norm)
|
153
153
|
return inverse(t)
|
154
154
|
|
155
|
+
# spectral norming the surprise update w/ newton schulz matrix iter
|
156
|
+
# Keller Jordan et al. from OSS w/ nanogpt, now being used for two works, Atlas and 'TTT done right'
|
157
|
+
|
158
|
+
def newtonschulz5(
|
159
|
+
t,
|
160
|
+
steps = 5,
|
161
|
+
eps = 1e-7,
|
162
|
+
coefs = (3.4445, -4.7750, 2.0315)
|
163
|
+
):
|
164
|
+
if t.ndim <= 3:
|
165
|
+
return t
|
166
|
+
|
167
|
+
shape = t.shape
|
168
|
+
should_transpose = shape[2] > shape[-1]
|
169
|
+
|
170
|
+
if should_transpose:
|
171
|
+
t = t.transpose(-1, -2)
|
172
|
+
|
173
|
+
t, inv_pack = pack_one_with_inverse(t, '* i j')
|
174
|
+
t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)
|
175
|
+
|
176
|
+
a, b, c = coefs
|
177
|
+
|
178
|
+
for _ in range(steps):
|
179
|
+
A = t @ t.transpose(-1, -2)
|
180
|
+
B = b * A + c * A @ A
|
181
|
+
t = a * t + B @ t
|
182
|
+
|
183
|
+
if should_transpose:
|
184
|
+
t = t.transpose(-1, -2)
|
185
|
+
|
186
|
+
return inv_pack(t)
|
187
|
+
|
155
188
|
# multi head rmsnorm
|
156
189
|
|
157
190
|
class MultiheadRMSNorm(Module):
|
@@ -254,6 +287,7 @@ class NeuralMemory(Module):
|
|
254
287
|
init_momentum_bias = None,
|
255
288
|
init_decay_bias = None,
|
256
289
|
accept_weight_residual = False,
|
290
|
+
spectral_norm_surprises = False,
|
257
291
|
gated_transition = False,
|
258
292
|
mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
|
259
293
|
default_model_kwargs: dict = dict(
|
@@ -465,6 +499,10 @@ class NeuralMemory(Module):
|
|
465
499
|
|
466
500
|
self.max_grad_norm = max_grad_norm
|
467
501
|
|
502
|
+
# spectral norming the surprises before update, a la Muon from Jordan et al.
|
503
|
+
|
504
|
+
self.spectral_norm_surprises = spectral_norm_surprises
|
505
|
+
|
468
506
|
# weight decay factor
|
469
507
|
|
470
508
|
self.to_decay_factor = Sequential(
|
@@ -748,6 +786,11 @@ class NeuralMemory(Module):
|
|
748
786
|
else:
|
749
787
|
update = einsum(combine_momentums, momentums, 'o b n, o b n ... -> b n ...')
|
750
788
|
|
789
|
+
# maybe spectral norm surprises
|
790
|
+
|
791
|
+
if self.spectral_norm_surprises:
|
792
|
+
update = newtonschulz5(update)
|
793
|
+
|
751
794
|
# use associative scan again for learned forgetting (weight decay) - eq (13)
|
752
795
|
|
753
796
|
update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: titans-pytorch
|
3
|
-
Version: 0.4.7
|
3
|
+
Version: 0.4.9
|
4
4
|
Summary: Titans
|
5
5
|
Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
|
6
6
|
Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
|
@@ -207,3 +207,31 @@ $ python train_mac.py
|
|
207
207
|
url = {https://arxiv.org/abs/2501.12352},
|
208
208
|
}
|
209
209
|
```
|
210
|
+
|
211
|
+
```bibtex
|
212
|
+
@misc{jordan2024muon,
|
213
|
+
author = {Keller Jordan and Yuchen Jin and Vlado Boza and Jiacheng You and
|
214
|
+
Franz Cesista and Laker Newhouse and Jeremy Bernstein},
|
215
|
+
title = {Muon: An optimizer for hidden layers in neural networks},
|
216
|
+
year = {2024},
|
217
|
+
url = {https://kellerjordan.github.io/posts/muon/}
|
218
|
+
}
|
219
|
+
```
|
220
|
+
|
221
|
+
```bibtex
|
222
|
+
@inproceedings{Zhang2025TestTimeTD,
|
223
|
+
title = {Test-Time Training Done Right},
|
224
|
+
author = {Tianyuan Zhang and Sai Bi and Yicong Hong and Kai Zhang and Fujun Luan and Songlin Yang and Kalyan Sunkavalli and William T. Freeman and Hao Tan},
|
225
|
+
year = {2025},
|
226
|
+
url = {https://api.semanticscholar.org/CorpusID:279071244}
|
227
|
+
}
|
228
|
+
```
|
229
|
+
|
230
|
+
```bibtex
|
231
|
+
@inproceedings{Behrouz2025ATLASLT,
|
232
|
+
title = {ATLAS: Learning to Optimally Memorize the Context at Test Time},
|
233
|
+
author = {Ali Behrouz and Ze-Minghui Li and Praneeth Kacham and Majid Daliri and Yuan Deng and Peilin Zhong and Meisam Razaviyayn and Vahab S. Mirrokni},
|
234
|
+
year = {2025},
|
235
|
+
url = {https://api.semanticscholar.org/CorpusID:278996373}
|
236
|
+
}
|
237
|
+
```
|
@@ -0,0 +1,8 @@
|
|
1
|
+
titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
|
2
|
+
titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
|
3
|
+
titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
|
4
|
+
titans_pytorch/neural_memory.py,sha256=JCK9t0dAYB6estqw9rrWENkI6qpsKF9QQf_MwXnWuJ0,34458
|
5
|
+
titans_pytorch-0.4.9.dist-info/METADATA,sha256=ony2yYgXUfdwP6QyM9o3BFSbdmH0HvSwA-BuClBacpQ,7873
|
6
|
+
titans_pytorch-0.4.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
7
|
+
titans_pytorch-0.4.9.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
|
8
|
+
titans_pytorch-0.4.9.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
titans_pytorch/__init__.py,sha256=sVTOuRUkaIYabFExdLY6s1qXm1UwHHz_J19H8ZV-X74,338
|
2
|
-
titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
|
3
|
-
titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
|
4
|
-
titans_pytorch/neural_memory.py,sha256=EhHptv-9q3PUTJwX9kKAdYMfWueM-JB_kZ3SmRoAdjM,33356
|
5
|
-
titans_pytorch-0.4.7.dist-info/METADATA,sha256=MP0qHzoAM0AZuWg0gL2VOnmpx9HXdHwo5xx2CL0ugso,6797
|
6
|
-
titans_pytorch-0.4.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
7
|
-
titans_pytorch-0.4.7.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
|
8
|
-
titans_pytorch-0.4.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|