titans-pytorch 0.4.7__tar.gz → 0.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.4.7
+ Version: 0.4.8
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -207,3 +207,31 @@ $ python train_mac.py
  url = {https://arxiv.org/abs/2501.12352},
  }
  ```
+
+ ```bibtex
+ @misc{jordan2024muon,
+ author = {Keller Jordan and Yuchen Jin and Vlado Boza and Jiacheng You and
+ Franz Cesista and Laker Newhouse and Jeremy Bernstein},
+ title = {Muon: An optimizer for hidden layers in neural networks},
+ year = {2024},
+ url = {https://kellerjordan.github.io/posts/muon/}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{Zhang2025TestTimeTD,
+ title = {Test-Time Training Done Right},
+ author = {Tianyuan Zhang and Sai Bi and Yicong Hong and Kai Zhang and Fujun Luan and Songlin Yang and Kalyan Sunkavalli and William T. Freeman and Hao Tan},
+ year = {2025},
+ url = {https://api.semanticscholar.org/CorpusID:279071244}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{Behrouz2025ATLASLT,
+ title = {ATLAS: Learning to Optimally Memorize the Context at Test Time},
+ author = {Ali Behrouz and Ze-Minghui Li and Praneeth Kacham and Majid Daliri and Yuan Deng and Peilin Zhong and Meisam Razaviyayn and Vahab S. Mirrokni},
+ year = {2025},
+ url = {https://api.semanticscholar.org/CorpusID:278996373}
+ }
+ ```
@@ -153,3 +153,31 @@ $ python train_mac.py
  url = {https://arxiv.org/abs/2501.12352},
  }
  ```
+
+ ```bibtex
+ @misc{jordan2024muon,
+ author = {Keller Jordan and Yuchen Jin and Vlado Boza and Jiacheng You and
+ Franz Cesista and Laker Newhouse and Jeremy Bernstein},
+ title = {Muon: An optimizer for hidden layers in neural networks},
+ year = {2024},
+ url = {https://kellerjordan.github.io/posts/muon/}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{Zhang2025TestTimeTD,
+ title = {Test-Time Training Done Right},
+ author = {Tianyuan Zhang and Sai Bi and Yicong Hong and Kai Zhang and Fujun Luan and Songlin Yang and Kalyan Sunkavalli and William T. Freeman and Hao Tan},
+ year = {2025},
+ url = {https://api.semanticscholar.org/CorpusID:279071244}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{Behrouz2025ATLASLT,
+ title = {ATLAS: Learning to Optimally Memorize the Context at Test Time},
+ author = {Ali Behrouz and Ze-Minghui Li and Praneeth Kacham and Majid Daliri and Yuan Deng and Peilin Zhong and Meisam Razaviyayn and Vahab S. Mirrokni},
+ year = {2025},
+ url = {https://api.semanticscholar.org/CorpusID:278996373}
+ }
+ ```
@@ -1,6 +1,6 @@
  [project]
  name = "titans-pytorch"
- version = "0.4.7"
+ version = "0.4.8"
  description = "Titans"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -152,6 +152,30 @@ def softclamp_grad_norm(t, max_value):
      t = t * (clamped_norm / norm)
      return inverse(t)

+ # spectral norming the surprise update w/ newton schulz matrix iter
+ # Keller Jordan et al. from OSS w/ nanogpt, now being used for two works, Atlas and 'TTT done right'
+
+ def newtonschulz5(
+     t,
+     steps = 5,
+     eps = 1e-7,
+     coefs = (3.4445, -4.7750, 2.0315)
+ ):
+     if t.ndim <= 3:
+         return t
+
+     t, inv_pack = pack_one_with_inverse(t, '* i j')
+     t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)
+
+     a, b, c = coefs
+
+     for _ in range(steps):
+         A = t @ t.transpose(-1, -2)
+         B = b * A + c * A @ A
+         t = a * t + B @ t
+
+     return inv_pack(t)
+
  # multi head rmsnorm

  class MultiheadRMSNorm(Module):
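
The `newtonschulz5` added above is the quintic Newton-Schulz iteration popularized by Muon, used here to approximately orthogonalize (spectrally norm) each surprise-update matrix. A rough self-contained sketch of the same iteration in plain torch follows; the standalone function name and the `svdvals` check are illustrative only and not part of the package, and the sketch omits the package's `pack_one_with_inverse` flattening of leading batch dims.

```python
# minimal sketch of the Muon-style quintic Newton-Schulz iteration, assuming plain torch;
# the package's newtonschulz5 additionally flattens leading dims via pack_one_with_inverse

import torch

def newton_schulz_sketch(t, steps = 5, eps = 1e-7, coefs = (3.4445, -4.7750, 2.0315)):
    # dividing by the Frobenius norm bounds the spectral norm by 1, a precondition of the iteration
    t = t / t.norm(dim = (-1, -2), keepdim = True).clamp(min = eps)

    a, b, c = coefs

    for _ in range(steps):
        A = t @ t.transpose(-1, -2)
        B = b * A + c * A @ A
        t = a * t + B @ t

    return t

x = torch.randn(4, 64, 64)
y = newton_schulz_sketch(x)

# after a few steps the singular values approach 1, i.e. the matrices are roughly orthogonal
print(torch.linalg.svdvals(y).mean())
```

Note the `t.ndim <= 3` early return in the package's version: orthogonalization only makes sense for weight matrices, so vector-shaped parameters (biases, norms) pass through untouched.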
@@ -254,6 +278,7 @@ class NeuralMemory(Module):
  init_momentum_bias = None,
  init_decay_bias = None,
  accept_weight_residual = False,
+ spectral_norm_surprises = False,
  gated_transition = False,
  mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
  default_model_kwargs: dict = dict(
@@ -465,6 +490,10 @@ class NeuralMemory(Module):

  self.max_grad_norm = max_grad_norm

+ # spectral norming the surprises before update, a la Muon from Jordan et al.
+
+ self.spectral_norm_surprises = spectral_norm_surprises
+
  # weight decay factor

  self.to_decay_factor = Sequential(
@@ -748,6 +777,11 @@ class NeuralMemory(Module):
  else:
      update = einsum(combine_momentums, momentums, 'o b n, o b n ... -> b n ...')

+ # maybe spectral norm surprises
+
+ if self.spectral_norm_surprises:
+     update = newtonschulz5(update)
+
  # use associative scan again for learned forgetting (weight decay) - eq (13)

  update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)
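
Taken together, the hunks above expose the feature as an opt-in constructor flag. A hedged usage sketch, assuming the `NeuralMemory` interface shown in the repository README; `dim` and `chunk_size` values below are placeholders, and only `spectral_norm_surprises` is the new option in this release.

```python
# illustrative only: dim / chunk_size are placeholder hyperparameters,
# spectral_norm_surprises is the option added in 0.4.8

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    spectral_norm_surprises = True   # surprise updates pass through newtonschulz5 before the forgetting scan
)

seq = torch.randn(2, 1024, 384)

retrieved, mem_state = mem(seq)
```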
@@ -49,6 +49,7 @@ STORE_ATTN_POOL_CHUNKS = True # whether to use attention pooli
  MEMORY_MODEL_PER_LAYER_LEARNED_LR = True
  NEURAL_MEM_WEIGHT_RESIDUAL = True # learning to accept contributions from the weights of the previous neural mem layer brings about significant improvements. this was improvised and not in the paper, but inspired by the value residual learning free lunch paper
  NEURAL_MEM_QKV_RECEIVES_DIFF_VIEW = True # will allow the neural memory to select what layers from which to derive queries / keys / values, effectively allowing it to graft itself to the transformer in any way to be beneficial. this is to address an issue from a phd student who noted that the mem network is learning nothing more than wk @ wv. this also generalizes all possible ways to connect the neural memory to a transformer, a sort of NAS
+ NEURAL_MEM_SPEC_NORM_SURPRISES = True # applying lessons from Muon optimizer to surprise updates, by spectral norming the surprises

  # experiment related

@@ -121,7 +122,8 @@ model = MemoryAsContextTransformer(
          momentum_order = NEURAL_MEM_MOMENTUM_ORDER,
          default_step_transform_max_lr = NEURAL_MEM_MAX_LR,
          use_accelerated_scan = USE_ACCELERATED_SCAN,
-         per_parameter_lr_modulation = MEMORY_MODEL_PER_LAYER_LEARNED_LR
+         per_parameter_lr_modulation = MEMORY_MODEL_PER_LAYER_LEARNED_LR,
+         spectral_norm_surprises = NEURAL_MEM_SPEC_NORM_SURPRISES
      )
  ).cuda()

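
For the training script, a sketch of how the new constant presumably reaches the memory module, assuming the `MemoryAsContextTransformer(..., neural_memory_kwargs = dict(...))` wiring shown in the repository README; every hyperparameter below other than the new flag is a placeholder.

```python
# hedged sketch: neural_memory_kwargs is the README's path for per-memory options;
# all values except the new flag are placeholders

from titans_pytorch import MemoryAsContextTransformer

NEURAL_MEM_SPEC_NORM_SURPRISES = True

model = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 384,
    depth = 8,
    segment_len = 32,
    num_persist_mem_tokens = 4,
    num_longterm_mem_tokens = 16,
    neural_memory_kwargs = dict(
        dim_head = 64,
        heads = 4,
        spectral_norm_surprises = NEURAL_MEM_SPEC_NORM_SURPRISES
    )
)
```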