x-transformers 2.7.3__tar.gz → 2.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {x_transformers-2.7.3 → x_transformers-2.7.5}/PKG-INFO +1 -1
  2. {x_transformers-2.7.3 → x_transformers-2.7.5}/pyproject.toml +1 -1
  3. {x_transformers-2.7.3 → x_transformers-2.7.5}/tests/test_x_transformers.py +26 -0
  4. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/x_transformers.py +26 -3
  5. {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/FUNDING.yml +0 -0
  6. {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-publish.yml +0 -0
  7. {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-test.yaml +0 -0
  8. {x_transformers-2.7.3 → x_transformers-2.7.5}/.gitignore +0 -0
  9. {x_transformers-2.7.3 → x_transformers-2.7.5}/LICENSE +0 -0
  10. {x_transformers-2.7.3 → x_transformers-2.7.5}/README.md +0 -0
  11. {x_transformers-2.7.3 → x_transformers-2.7.5}/data/README.md +0 -0
  12. {x_transformers-2.7.3 → x_transformers-2.7.5}/data/enwik8.gz +0 -0
  13. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/all-attention.png +0 -0
  14. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/deepnorm.png +0 -0
  17. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/fcm.png +0 -0
  23. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/ffglu.png +0 -0
  24. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/flash-attention.png +0 -0
  25. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gate_values.png +0 -0
  26. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gating.png +0 -0
  27. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-1.png +0 -0
  29. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-2.png +0 -0
  30. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/normformer.png +0 -0
  32. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/pia.png +0 -0
  33. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/resi_dual.png +0 -0
  35. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/residual_attn.png +0 -0
  36. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rezero.png +0 -0
  37. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rotary.png +0 -0
  38. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich.png +0 -0
  40. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/scalenorm.png +0 -0
  42. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/talking-heads.png +0 -0
  43. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/topk-attention.png +0 -0
  44. {x_transformers-2.7.3 → x_transformers-2.7.5}/images/xval.png +0 -0
  45. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_belief_state.py +0 -0
  46. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_copy.py +0 -0
  47. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_entropy_tokenizer.py +0 -0
  48. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_enwik8.py +0 -0
  49. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_length_extrapolate.py +0 -0
  50. {x_transformers-2.7.3 → x_transformers-2.7.5}/train_parity.py +0 -0
  51. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/__init__.py +0 -0
  52. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/attend.py +0 -0
  53. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/autoregressive_wrapper.py +0 -0
  54. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/belief_state_wrapper.py +0 -0
  55. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/continuous.py +0 -0
  56. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/dpo.py +0 -0
  57. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/entropy_based_tokenizer.py +0 -0
  58. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/multi_input.py +0 -0
  59. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/neo_mlp.py +0 -0
  60. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/nonautoregressive_wrapper.py +0 -0
  61. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/up_wrapper.py +0 -0
  62. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  63. {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xval.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.7.3
+Version: 2.7.5
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.7.3"
+version = "2.7.5"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -1314,3 +1314,29 @@ def test_simple_mdlm(
 
     loss = nar(seq)
     loss.loss.backward()
+
+def test_qk_clip_attn():
+    from x_transformers import Attention
+
+    x = torch.randn(1, 1024, 512)
+
+    attn = Attention(dim = 512, dim_out = 384)
+
+    out, intermediates = attn(x, return_intermediates = True)
+
+    attn.qk_clip_(intermediates, tau = 100)
+
+def test_qk_clip_attn_layers():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(dim = 512, depth = 2)
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    out, intermediates = model(seq, return_intermediates = True)
+
+    model.attn_qk_clip_(intermediates)
@@ -1637,10 +1637,12 @@ class Attention(Module):
         q_weight = self.to_q.weight
         k_weight = self.to_k.weight
 
-        q_dim, k_dim, heads = q_weight.shape[0], k_weight.shape[0], qk_weight_scale.numel()
+        qk_dim, heads = q_weight.shape[0], qk_weight_scale.numel()
 
-        q_weight.mul_(repeat(qk_weight_scale, 'h -> (h expand)', expand = q_dim // heads))
-        k_weight.mul_(repeat(qk_weight_scale, 'h -> (h expand)', expand = k_dim // heads))
+        qk_weight_scale = repeat(qk_weight_scale, 'h -> (h expand)', expand = qk_dim // heads)
+
+        q_weight.mul_(qk_weight_scale)
+        k_weight.mul_(qk_weight_scale)
 
     def forward(
         self,
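Note on the refactor above: qk_clip_ applies the QK-Clip scheme proposed by the Kimi (Moonshot AI) team, in which any head whose maximum pre-softmax attention logit exceeded a threshold tau has its query and key projection weights shrunk in place; the change here only collapses the two identical repeat calls into a single broadcast of the per-head scale. As a rough, illustrative sketch of how such a per-head scale is commonly derived (not a copy of the library's internals; the function name and signature are made up for illustration):

import torch

def qk_clip_scale(max_logit_per_head: torch.Tensor, tau: float = 100.) -> torch.Tensor:
    # gamma_h = min(1, tau / max_logit_h): heads that stayed under the threshold are untouched
    gamma = (tau / max_logit_per_head.clamp(min = 1e-6)).clamp(max = 1.)
    # split the correction evenly between the q and k weights, so the
    # q . k dot product (and hence the attention logit) ends up scaled by gamma
    return gamma.sqrt()

# a head whose largest logit hit 400 with tau = 100 gets its q and k weights
# multiplied by sqrt(0.25) = 0.5, while a head that peaked at 50 is left alone
qk_clip_scale(torch.tensor([400., 50.]))   # tensor([0.5000, 1.0000])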
@@ -2460,6 +2462,20 @@ class AttentionLayers(Module):
 
         self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
+
+        for (_, layer, _), layer_type, attn_inter in zip(self.layers, self.layer_types, intermediates.attn_intermediates):
+
+            if layer_type not in ('a', 'c'):
+                continue
+
+            layer.qk_clip_(attn_inter, tau = tau)
+
     def forward(
         self,
         x,
@@ -3190,6 +3206,13 @@ class TransformerWrapper(Module):
         if not isinstance(self.pos_emb, always):
             nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        self.attn_layers.attn_qk_clip_(intermediates, tau = tau)
+
     def forward(
         self,
         x,
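Putting the new hooks together, the call pattern mirrored from the tests above is: run the forward pass with return_intermediates = True, take the optimizer step, then apply the clip in place. A minimal training-step sketch under that assumption; the loss and optimizer below are placeholders, and only the attn_qk_clip_ call reflects the new API in this release:

import torch
import torch.nn.functional as F
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 2)
)

optim = torch.optim.Adam(model.parameters(), lr = 3e-4)

seq = torch.randint(0, 256, (1, 1024))

# forward pass, keeping the attention intermediates around for the clip
logits, intermediates = model(seq[:, :-1], return_intermediates = True)

loss = F.cross_entropy(logits.transpose(1, 2), seq[:, 1:])
loss.backward()

optim.step()
optim.zero_grad()

# after the weight update, rescale the q/k projections of any head whose
# max attention logit exceeded tau during the forward pass
model.attn_qk_clip_(intermediates, tau = 100.)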