titans-pytorch 0.0.47__py3-none-any.whl → 0.0.49__py3-none-any.whl
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- titans_pytorch/mac_transformer.py +28 -4
- titans_pytorch/titans.py +2 -2
- {titans_pytorch-0.0.47.dist-info → titans_pytorch-0.0.49.dist-info}/METADATA +1 -1
- titans_pytorch-0.0.49.dist-info/RECORD +8 -0
- titans_pytorch-0.0.47.dist-info/RECORD +0 -8
- {titans_pytorch-0.0.47.dist-info → titans_pytorch-0.0.49.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.0.47.dist-info → titans_pytorch-0.0.49.dist-info}/licenses/LICENSE +0 -0
|
@@ -95,7 +95,7 @@ class SegmentedAttention(Module):
|
|
|
95
95
|
dim_head = 64,
|
|
96
96
|
heads = 8,
|
|
97
97
|
accept_value_residual = False,
|
|
98
|
-
attend_kwargs: dict = dict()
|
|
98
|
+
attend_kwargs: dict = dict(),
|
|
99
99
|
):
|
|
100
100
|
super().__init__()
|
|
101
101
|
self.norm = nn.RMSNorm(dim)
|
|
@@ -201,6 +201,7 @@ class MemoryAsContextTransformer(Module):
|
|
|
201
201
|
num_residual_streams = 4,
|
|
202
202
|
neural_memory_kwargs: dict = dict(),
|
|
203
203
|
neural_memory_layers: tuple[int, ...] | None = None,
|
|
204
|
+
aux_kv_recon_loss_weight = 0.
|
|
204
205
|
):
|
|
205
206
|
super().__init__()
|
|
206
207
|
|
|
@@ -276,10 +277,18 @@ class MemoryAsContextTransformer(Module):
|
|
|
276
277
|
|
|
277
278
|
self.to_logits = LinearNoBias(dim, num_tokens)
|
|
278
279
|
|
|
280
|
+
# auxiliary loss on kv recon
|
|
281
|
+
|
|
282
|
+
self.has_aux_kv_recon_loss = aux_kv_recon_loss_weight > 0.
|
|
283
|
+
self.aux_kv_recon_loss_weight = aux_kv_recon_loss_weight
|
|
284
|
+
|
|
285
|
+
self.register_buffer('zero', torch.tensor(0.), persistent = False)
|
|
286
|
+
|
|
279
287
|
def forward(
|
|
280
288
|
self,
|
|
281
289
|
x,
|
|
282
|
-
return_loss = False
|
|
290
|
+
return_loss = False,
|
|
291
|
+
return_loss_breakdown = False
|
|
283
292
|
):
|
|
284
293
|
|
|
285
294
|
if return_loss:
|
|
@@ -317,6 +326,10 @@ class MemoryAsContextTransformer(Module):
|
|
|
317
326
|
|
|
318
327
|
value_residual = None
|
|
319
328
|
|
|
329
|
+
# aux losses
|
|
330
|
+
|
|
331
|
+
kv_recon_losses = self.zero
|
|
332
|
+
|
|
320
333
|
# expand and reduce streams for hyper connections
|
|
321
334
|
|
|
322
335
|
x = self.expand_streams(x)
|
|
@@ -324,7 +337,8 @@ class MemoryAsContextTransformer(Module):
|
|
|
324
337
|
for (attn, ff), maybe_neural_mem in zip(self.layers, self.neural_mem_layers):
|
|
325
338
|
|
|
326
339
|
if exists(maybe_neural_mem):
|
|
327
|
-
x = maybe_neural_mem(x)
|
|
340
|
+
x, aux_kv_loss = maybe_neural_mem(x, return_aux_kv_loss = True)
|
|
341
|
+
kv_recon_losses = kv_recon_losses + aux_kv_loss
|
|
328
342
|
|
|
329
343
|
x, values = attn(x, value_residual = value_residual)
|
|
330
344
|
|
|
@@ -351,4 +365,14 @@ class MemoryAsContextTransformer(Module):
|
|
|
351
365
|
if not return_loss:
|
|
352
366
|
return logits
|
|
353
367
|
|
|
354
|
-
|
|
368
|
+
ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
|
|
369
|
+
|
|
370
|
+
losses = ar_loss
|
|
371
|
+
|
|
372
|
+
if self.has_aux_kv_recon_loss:
|
|
373
|
+
losses = losses + kv_recon_losses * self.aux_kv_recon_loss_weight
|
|
374
|
+
|
|
375
|
+
if not return_loss_breakdown:
|
|
376
|
+
return losses
|
|
377
|
+
|
|
378
|
+
return losses, (ar_loss, kv_recon_losses)
|
titans_pytorch/titans.py
CHANGED
|
@@ -218,7 +218,7 @@ class NeuralMemory(Module):
|
|
|
218
218
|
pred = functional_call(self.memory_model, params, inputs)
|
|
219
219
|
loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
|
|
220
220
|
weighted_loss = loss * loss_weights
|
|
221
|
-
return weighted_loss.sum(),
|
|
221
|
+
return weighted_loss.sum(), weighted_loss.mean()
|
|
222
222
|
|
|
223
223
|
self.per_sample_grad_fn = vmap(grad(forward_and_loss, has_aux = True), in_dims = (None, 0, 0, 0))
|
|
224
224
|
|
|
@@ -409,7 +409,7 @@ class NeuralMemory(Module):
|
|
|
409
409
|
if not return_aux_kv_loss:
|
|
410
410
|
return updates, next_state
|
|
411
411
|
|
|
412
|
-
return updates, next_state, aux_kv_recon_loss
|
|
412
|
+
return updates, next_state, aux_kv_recon_loss.mean()
|
|
413
413
|
|
|
414
414
|
def retrieve_memories(
|
|
415
415
|
self,
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
titans_pytorch/__init__.py,sha256=wnv_Cjdjqh_h5IqLkQ8xrTtA2K663ITEn-1JeeHofTo,150
|
|
2
|
+
titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
|
|
3
|
+
titans_pytorch/mac_transformer.py,sha256=EMhxPt86Vr6LFvPm0OLMFYLaIY19khU9yIHkIhl2EMA,10316
|
|
4
|
+
titans_pytorch/titans.py,sha256=tV2ej2PGUhMjSmDFV_wowX5q9hyp4SM4Jv3eJNu7cy8,15518
|
|
5
|
+
titans_pytorch-0.0.49.dist-info/METADATA,sha256=hEpYHDqm_gffXybcotEmsK6o-siKrE7HwT_UgbOd-4o,4210
|
|
6
|
+
titans_pytorch-0.0.49.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
7
|
+
titans_pytorch-0.0.49.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
|
|
8
|
+
titans_pytorch-0.0.49.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
titans_pytorch/__init__.py,sha256=wnv_Cjdjqh_h5IqLkQ8xrTtA2K663ITEn-1JeeHofTo,150
|
|
2
|
-
titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
|
|
3
|
-
titans_pytorch/mac_transformer.py,sha256=mF8PYAjeAjLas1gkYybgzZX1AVK82A_ps_LY00ofYYs,9565
|
|
4
|
-
titans_pytorch/titans.py,sha256=ZcWxx6n-f8ttojRnK9fExavmT1bS-QSCRHQn7ldv7J0,15502
|
|
5
|
-
titans_pytorch-0.0.47.dist-info/METADATA,sha256=HjZxbJlnqsSgbioQz6KHWJb--8n18WDdL2T-jz-CFKc,4210
|
|
6
|
-
titans_pytorch-0.0.47.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
7
|
-
titans_pytorch-0.0.47.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
|
|
8
|
-
titans_pytorch-0.0.47.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|