x-transformers 2.0.2__py3-none-any.whl → 2.0.4__py3-none-any.whl

x_transformers/x_transformers.py

@@ -1282,7 +1282,7 @@ class Attention(Module):
             dim_kv_input = dim_latent_kv
 
             if exists(latent_rope_subheads):
-                assert not exists(rotate_num_heads)
+                assert not exists(rotate_num_heads), '`rotate_num_heads` cannot be set when multi-latent attention is being used'
                 rotate_num_heads = latent_rope_subheads
 
                 k_dim = dim_head * (kv_heads - latent_rope_subheads)
@@ -1845,6 +1845,7 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
         weight_tie_layers = False,
         custom_layers: tuple[str, ...] | None = None,
         layers_execute_order: tuple[int, ...] | None = None,
@@ -2147,7 +2148,7 @@ class AttentionLayers(Module):
 
             if layer_type == 'a':
                 self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
                 is_first_self_attn = False
            elif layer_type == 'c':
                 layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
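The hunks above thread the new `rotate_num_heads` argument from `AttentionLayers` down into every self-attention `Attention` layer, so that rotary embeddings can be applied to only a subset of the heads. A minimal sketch of the idea, assuming standard rotary helpers (illustrative only, not the library's internal code; `partial_head_rotary`, `apply_rotary` and `rotate_half` are hypothetical names):

```python
import torch

def rotate_half(x):
    # split the feature dimension in two and rotate the halves: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_rotary(t, freqs):
    # t: (batch, heads, seq, dim_head), freqs: (seq, dim_head)
    return t * freqs.cos() + rotate_half(t) * freqs.sin()

def partial_head_rotary(q, k, freqs, rotate_num_heads):
    # rotary positions are applied only to the first `rotate_num_heads` heads;
    # the remaining heads attend without positional encoding (NoPE)
    rq, nq = q[:, :rotate_num_heads], q[:, rotate_num_heads:]
    rk, nk = k[:, :rotate_num_heads], k[:, rotate_num_heads:]
    rq, rk = apply_rotary(rq, freqs), apply_rotary(rk, freqs)
    return torch.cat((rq, nq), dim = 1), torch.cat((rk, nk), dim = 1)

# toy shapes: 8 heads of dimension 64, only 4 of them rotated
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 8, 16, 64)
freqs = torch.randn(16, 64)
q, k = partial_head_rotary(q, k, freqs, rotate_num_heads = 4)
```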

x_transformers-2.0.2.dist-info/METADATA → x_transformers-2.0.4.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.0.2
+Version: 2.0.4
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -950,7 +950,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
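The README hunk above shows only the changed keyword arguments. A fuller usage sketch, assuming the existing `TransformerWrapper` / `Decoder` API with `rotate_num_heads` as the only new argument:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4   # rotary applied to 4 of the 8 heads, the rest stay unrotated
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)   # (1, 1024, 20000)
```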
@@ -1839,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title   = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author  = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year    = {2025},
+    url     = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title   = {Extending Context Window of Large Language Models via Positional Interpolation},

x_transformers-2.0.2.dist-info/RECORD → x_transformers-2.0.4.dist-info/RECORD

@@ -6,10 +6,10 @@ x_transformers/dpo.py,sha256=xt4OuOWhU8pN3OKN2LZAaC2NC8iiEnchqqcrPWVqf0o,3521
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/neo_mlp.py,sha256=XCNnnop9WLarcxap1kGuYc1x8GHvwkZiDRnXOxSl3Po,3452
 x_transformers/nonautoregressive_wrapper.py,sha256=2NU58hYMgn-4Jzg3mie-mXb0XH_dCN7fjlzd3K1rLUY,10510
-x_transformers/x_transformers.py,sha256=1s8KCSfHXMN9TKLFdS-RzzCskBDkh4CuBk2_XRb6IXk,107537
+x_transformers/x_transformers.py,sha256=iE4m38BUwCB1aENGLV5dMsIuu1t3CElEBKuXfkJfPA4,107685
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=7S00kCuab4tWQa-vf-z-XfzADjVj48MoFIr7VSIvttg,8575
-x_transformers-2.0.2.dist-info/METADATA,sha256=tNdI3H2S4HnnGK1hPY3l94FoXH3SB9vGAb55pcah6Yw,86506
-x_transformers-2.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-x_transformers-2.0.2.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-2.0.2.dist-info/RECORD,,
+x_transformers-2.0.4.dist-info/METADATA,sha256=UbaywSq7GvNJLub5VFrsooDeUgohEzWWBtA9ZnNOxkI,86938
+x_transformers-2.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.0.4.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.0.4.dist-info/RECORD,,