x_transformers-2.0.2-py3-none-any.whl → x_transformers-2.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- x_transformers/x_transformers.py +3 -2
- {x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/METADATA +12 -2
- {x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/RECORD +5 -5
- {x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/WHEEL +0 -0
- {x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/licenses/LICENSE +0 -0
x_transformers/x_transformers.py
CHANGED
@@ -1282,7 +1282,7 @@ class Attention(Module):
             dim_kv_input = dim_latent_kv

         if exists(latent_rope_subheads):
-            assert not exists(rotate_num_heads)
+            assert not exists(rotate_num_heads), '`rotate_num_heads` cannot be set when multi-latent attention is being used'
             rotate_num_heads = latent_rope_subheads

             k_dim = dim_head * (kv_heads - latent_rope_subheads)
@@ -1845,6 +1845,7 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
         weight_tie_layers = False,
         custom_layers: tuple[str, ...] | None = None,
         layers_execute_order: tuple[int, ...] | None = None,
@@ -2147,7 +2148,7 @@ class AttentionLayers(Module):

             if layer_type == 'a':
                 self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
                 is_first_self_attn = False
             elif layer_type == 'c':
                 layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
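Taken together, these hunks thread a new `rotate_num_heads` argument from `AttentionLayers` down into each self-attention block, so rotary embeddings are applied to only a subset of heads (the hybrid RoPE/NoPE scheme cited in the new bibtex entry below). A minimal usage sketch, assuming the public `TransformerWrapper`/`Decoder` entry points and mirroring the README example updated later in this diff; the `num_tokens` and `max_seq_len` values are arbitrary placeholders:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# Decoder subclasses AttentionLayers, so the new kwarg is accepted here
# and forwarded to each self-attention layer's `rotate_num_heads`
model = TransformerWrapper(
    num_tokens = 20000,       # placeholder vocab size
    max_seq_len = 1024,       # placeholder context length
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4  # only rotate 4 out of the 8 attention heads
    )
)

x = torch.randint(0, 20000, (1, 256))
logits = model(x)             # (1, 256, 20000)
```

This sketch requires a release that contains the change above (2.0.4 at the latest).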
{x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.0.2
+Version: 2.0.4
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -950,7 +950,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )

@@ -1839,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```

+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
{x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/RECORD
CHANGED
@@ -6,10 +6,10 @@ x_transformers/dpo.py,sha256=xt4OuOWhU8pN3OKN2LZAaC2NC8iiEnchqqcrPWVqf0o,3521
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/neo_mlp.py,sha256=XCNnnop9WLarcxap1kGuYc1x8GHvwkZiDRnXOxSl3Po,3452
 x_transformers/nonautoregressive_wrapper.py,sha256=2NU58hYMgn-4Jzg3mie-mXb0XH_dCN7fjlzd3K1rLUY,10510
-x_transformers/x_transformers.py,sha256=
+x_transformers/x_transformers.py,sha256=iE4m38BUwCB1aENGLV5dMsIuu1t3CElEBKuXfkJfPA4,107685
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=7S00kCuab4tWQa-vf-z-XfzADjVj48MoFIr7VSIvttg,8575
-x_transformers-2.0.2.dist-info/METADATA,sha256=
-x_transformers-2.0.2.dist-info/WHEEL,sha256=
-x_transformers-2.0.2.dist-info/licenses/LICENSE,sha256=
-x_transformers-2.0.2.dist-info/RECORD,,
+x_transformers-2.0.4.dist-info/METADATA,sha256=UbaywSq7GvNJLub5VFrsooDeUgohEzWWBtA9ZnNOxkI,86938
+x_transformers-2.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.0.4.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.0.4.dist-info/RECORD,,
{x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/WHEEL
File without changes
{x_transformers-2.0.2.dist-info → x_transformers-2.0.4.dist-info}/licenses/LICENSE
File without changes