x-transformers 2.0.2__tar.gz → 2.0.3__tar.gz

Files changed (55)
  1. {x_transformers-2.0.2 → x_transformers-2.0.3}/PKG-INFO +12 -2
  2. {x_transformers-2.0.2 → x_transformers-2.0.3}/README.md +11 -1
  3. {x_transformers-2.0.2 → x_transformers-2.0.3}/pyproject.toml +1 -1
  4. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/x_transformers.py +2 -1
  5. {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/FUNDING.yml +0 -0
  6. {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/workflows/python-publish.yml +0 -0
  7. {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/workflows/python-test.yaml +0 -0
  8. {x_transformers-2.0.2 → x_transformers-2.0.3}/.gitignore +0 -0
  9. {x_transformers-2.0.2 → x_transformers-2.0.3}/LICENSE +0 -0
  10. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/all-attention.png +0 -0
  11. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/attention-on-attention.png +0 -0
  12. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/cosine-sim-attention.png +0 -0
  13. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/deepnorm.png +0 -0
  14. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-linear.png +0 -0
  15. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-log.png +0 -0
  16. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  17. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias.png +0 -0
  18. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/enhanced-recurrence.png +0 -0
  19. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/fcm.png +0 -0
  20. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/ffglu.png +0 -0
  21. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/flash-attention.png +0 -0
  22. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/gate_values.png +0 -0
  23. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/gating.png +0 -0
  24. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/length-extrapolation-scale.png +0 -0
  25. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/macaron-1.png +0 -0
  26. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/macaron-2.png +0 -0
  27. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/memory-transformer.png +0 -0
  28. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/normformer.png +0 -0
  29. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/pia.png +0 -0
  30. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/qknorm-analysis.png +0 -0
  31. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/resi_dual.png +0 -0
  32. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/residual_attn.png +0 -0
  33. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/rezero.png +0 -0
  34. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/rotary.png +0 -0
  35. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich-2.png +0 -0
  36. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich.png +0 -0
  37. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich_norm.png +0 -0
  38. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/scalenorm.png +0 -0
  39. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/talking-heads.png +0 -0
  40. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/topk-attention.png +0 -0
  41. {x_transformers-2.0.2 → x_transformers-2.0.3}/images/xval.png +0 -0
  42. {x_transformers-2.0.2 → x_transformers-2.0.3}/tests/test_x_transformers.py +0 -0
  43. {x_transformers-2.0.2 → x_transformers-2.0.3}/train_copy.py +0 -0
  44. {x_transformers-2.0.2 → x_transformers-2.0.3}/train_enwik8.py +0 -0
  45. {x_transformers-2.0.2 → x_transformers-2.0.3}/train_parity.py +0 -0
  46. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/__init__.py +0 -0
  47. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/attend.py +0 -0
  48. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/autoregressive_wrapper.py +0 -0
  49. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/continuous.py +0 -0
  50. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/dpo.py +0 -0
  51. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/multi_input.py +0 -0
  52. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/neo_mlp.py +0 -0
  53. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/nonautoregressive_wrapper.py +0 -0
  54. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  55. {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/xval.py +0 -0
{x_transformers-2.0.2 → x_transformers-2.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: x-transformers
- Version: 2.0.2
+ Version: 2.0.3
  Summary: X-Transformers
  Project-URL: Homepage, https://pypi.org/project/x-transformers/
  Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -950,7 +950,8 @@ model_xl = TransformerWrapper(
          dim = 512,
          depth = 6,
          heads = 8,
-         rotary_pos_emb = True
+         rotary_pos_emb = True,
+         rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
      )
  )

@@ -1839,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
  }
  ```

+ ```bibtex
+ @inproceedings{Yang2025RopeTN,
+     title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+     author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+     year = {2025},
+     url = {https://api.semanticscholar.org/CorpusID:276079501}
+ }
+ ```
+
  ```bibtex
  @inproceedings{Chen2023ExtendingCW,
      title = {Extending Context Window of Large Language Models via Positional Interpolation},
{x_transformers-2.0.2 → x_transformers-2.0.3}/README.md

@@ -901,7 +901,8 @@ model_xl = TransformerWrapper(
          dim = 512,
          depth = 6,
          heads = 8,
-         rotary_pos_emb = True
+         rotary_pos_emb = True,
+         rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
      )
  )

@@ -1790,6 +1791,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
  }
  ```

+ ```bibtex
+ @inproceedings{Yang2025RopeTN,
+     title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+     author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+     year = {2025},
+     url = {https://api.semanticscholar.org/CorpusID:276079501}
+ }
+ ```
+
  ```bibtex
  @inproceedings{Chen2023ExtendingCW,
      title = {Extending Context Window of Large Language Models via Positional Interpolation},
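
The README hunk above shows only the tail of the rotary example. For context, here is a minimal, self-contained usage sketch of the new option with the standard `TransformerWrapper` / `Decoder` constructor; the surrounding arguments are assumed from the existing README examples rather than taken from this diff:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# hybrid RoPE / NoPE across heads: only 4 of the 8 attention heads receive
# rotary position embeddings, the remaining heads attend without positions
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4   # new in 2.0.3
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)   # (1, 1024, 20000)
```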
{x_transformers-2.0.2 → x_transformers-2.0.3}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "x-transformers"
- version = "2.0.2"
+ version = "2.0.3"
  description = "X-Transformers"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
{x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/x_transformers.py

@@ -1845,6 +1845,7 @@ class AttentionLayers(Module):
      rotary_interpolation_factor = 1.,
      rotary_xpos_scale_base = 512,
      rotary_base_rescale_factor = 1.,
+     rotate_num_heads = None,
      weight_tie_layers = False,
      custom_layers: tuple[str, ...] | None = None,
      layers_execute_order: tuple[int, ...] | None = None,
@@ -2147,7 +2148,7 @@ class AttentionLayers(Module):

      if layer_type == 'a':
          self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-         layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+         layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
          is_first_self_attn = False
      elif layer_type == 'c':
          layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
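
The hunk above only threads the new `rotate_num_heads` argument from `AttentionLayers` into `Attention`; the attention-side handling itself is not part of this diff. As an illustration only, not the library's actual implementation, rotating a subset of heads could look like the following standalone sketch, where `rotate_half`, `apply_rotary`, and `partial_head_rotary` are hypothetical helper names:

```python
import torch

def rotate_half(x):
    # split the last dimension in two and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_rotary(t, freqs):
    # standard rotary embedding on a (batch, heads, seq, dim_head) tensor
    return t * freqs.cos() + rotate_half(t) * freqs.sin()

def partial_head_rotary(q, k, freqs, rotate_num_heads = None):
    # rotate only the first `rotate_num_heads` heads; the remaining heads
    # receive no positional information (hybrid RoPE / NoPE across heads)
    if rotate_num_heads is None:
        return apply_rotary(q, freqs), apply_rotary(k, freqs)

    q_rot, q_pass = q[:, :rotate_num_heads], q[:, rotate_num_heads:]
    k_rot, k_pass = k[:, :rotate_num_heads], k[:, rotate_num_heads:]

    q_rot, k_rot = apply_rotary(q_rot, freqs), apply_rotary(k_rot, freqs)

    return torch.cat((q_rot, q_pass), dim = 1), torch.cat((k_rot, k_pass), dim = 1)

# toy shapes: batch 2, 8 heads, 1024 tokens, head dimension 64
q = torch.randn(2, 8, 1024, 64)
k = torch.randn(2, 8, 1024, 64)

# sinusoidal frequencies per position, broadcast over batch and heads
pos = torch.arange(1024).float()
inv_freq = 1. / (10000 ** (torch.arange(0, 32).float() / 32))
freqs = torch.einsum('i,j->ij', pos, inv_freq)
freqs = torch.cat((freqs, freqs), dim = -1)  # (1024, 64)

q, k = partial_head_rotary(q, k, freqs, rotate_num_heads = 4)
```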