x-transformers 2.0.2 → 2.0.4 (py3-none-any.whl)

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1282,7 +1282,7 @@ class Attention(Module):
             dim_kv_input = dim_latent_kv
 
         if exists(latent_rope_subheads):
-            assert not exists(rotate_num_heads)
+            assert not exists(rotate_num_heads), '`rotate_num_heads` cannot be set when multi-latent attention is being used'
             rotate_num_heads = latent_rope_subheads
 
         k_dim = dim_head * (kv_heads - latent_rope_subheads)
@@ -1845,6 +1845,7 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
         weight_tie_layers = False,
         custom_layers: tuple[str, ...] | None = None,
         layers_execute_order: tuple[int, ...] | None = None,
@@ -2147,7 +2148,7 @@ class AttentionLayers(Module):
 
             if layer_type == 'a':
                 self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
                 is_first_self_attn = False
             elif layer_type == 'c':
                 layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
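Taken together, the three hunks above thread a new `rotate_num_heads` argument from `AttentionLayers` down into each self-attention `Attention` layer, so rotary position embeddings can be applied to only a subset of the heads (the hybrid RoPE/NoPE strategy cited further below), while remaining incompatible with the multi-latent-attention rope subheads. The following is a minimal sketch of the idea only, not the library's actual implementation; the tensor layout and the `apply_rotary` / `rotate_half` helpers are assumptions for illustration.

```python
import torch

def rotate_half(x):
    # standard RoPE helper: rotate the two halves of the last dimension
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_rotary(t, freqs):
    # t: (batch, heads, seq, dim_head), freqs: (seq, dim_head)
    return t * freqs.cos() + rotate_half(t) * freqs.sin()

def rotate_subset_of_heads(q, k, freqs, rotate_num_heads = None):
    # apply rotary embeddings to only the first `rotate_num_heads` heads,
    # leaving the remaining heads without positional information (NoPE)
    if rotate_num_heads is None:
        rotate_num_heads = q.shape[1]

    q_rot, q_pass = q[:, :rotate_num_heads], q[:, rotate_num_heads:]
    k_rot, k_pass = k[:, :rotate_num_heads], k[:, rotate_num_heads:]

    q = torch.cat((apply_rotary(q_rot, freqs), q_pass), dim = 1)
    k = torch.cat((apply_rotary(k_rot, freqs), k_pass), dim = 1)
    return q, k
```

In the library itself, which heads get rotated is driven by the `rotate_num_heads` value that the hunks above pass from `AttentionLayers` into `Attention`.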
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.0.2
+Version: 2.0.4
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -950,7 +950,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
 
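For context, the README snippet being edited above might look roughly like the following once assembled; the `TransformerWrapper` / `Decoder` wiring and the token and sequence sizes are assumptions based on the surrounding README, and only the changed keyword arguments come from this diff.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,          # assumed vocabulary size for illustration
    max_seq_len = 1024,          # assumed context length for illustration
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4     # only rotate 4 out of the 8 attention heads
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)                # (1, 1024, 20000)
```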
@@ -1839,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
@@ -6,10 +6,10 @@ x_transformers/dpo.py,sha256=xt4OuOWhU8pN3OKN2LZAaC2NC8iiEnchqqcrPWVqf0o,3521
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/neo_mlp.py,sha256=XCNnnop9WLarcxap1kGuYc1x8GHvwkZiDRnXOxSl3Po,3452
 x_transformers/nonautoregressive_wrapper.py,sha256=2NU58hYMgn-4Jzg3mie-mXb0XH_dCN7fjlzd3K1rLUY,10510
-x_transformers/x_transformers.py,sha256=1s8KCSfHXMN9TKLFdS-RzzCskBDkh4CuBk2_XRb6IXk,107537
+x_transformers/x_transformers.py,sha256=iE4m38BUwCB1aENGLV5dMsIuu1t3CElEBKuXfkJfPA4,107685
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=7S00kCuab4tWQa-vf-z-XfzADjVj48MoFIr7VSIvttg,8575
-x_transformers-2.0.2.dist-info/METADATA,sha256=tNdI3H2S4HnnGK1hPY3l94FoXH3SB9vGAb55pcah6Yw,86506
-x_transformers-2.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-x_transformers-2.0.2.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-2.0.2.dist-info/RECORD,,
+x_transformers-2.0.4.dist-info/METADATA,sha256=UbaywSq7GvNJLub5VFrsooDeUgohEzWWBtA9ZnNOxkI,86938
+x_transformers-2.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.0.4.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.0.4.dist-info/RECORD,,