x-transformers 2.0.2.tar.gz → 2.0.3.tar.gz
- {x_transformers-2.0.2 → x_transformers-2.0.3}/PKG-INFO +12 -2
- {x_transformers-2.0.2 → x_transformers-2.0.3}/README.md +11 -1
- {x_transformers-2.0.2 → x_transformers-2.0.3}/pyproject.toml +1 -1
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/x_transformers.py +2 -1
- {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/FUNDING.yml +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/.gitignore +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/LICENSE +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/all-attention.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/attention-on-attention.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/deepnorm.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/fcm.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/ffglu.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/flash-attention.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/gate_values.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/gating.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/macaron-1.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/macaron-2.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/memory-transformer.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/normformer.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/pia.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/resi_dual.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/residual_attn.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/rezero.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/rotary.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich-2.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/sandwich_norm.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/scalenorm.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/talking-heads.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/topk-attention.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/images/xval.png +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/tests/test_x_transformers.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/train_copy.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/train_enwik8.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/train_parity.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/__init__.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/attend.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/continuous.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/dpo.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/xval.py +0 -0
{x_transformers-2.0.2 → x_transformers-2.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.0.2
+Version: 2.0.3
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -950,7 +950,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
 
@@ -1839,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
{x_transformers-2.0.2 → x_transformers-2.0.3}/README.md

@@ -901,7 +901,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
 
@@ -1790,6 +1791,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
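The README change above enables rotary position embeddings but applies them to only 4 of the 8 attention heads, the hybrid RoPE/NoPE strategy of the newly cited Yang et al. (2025). Below is a minimal usage sketch of the new `rotate_num_heads` argument; the `TransformerWrapper` kwargs outside the hunk context (`num_tokens`, `max_seq_len`) are assumed from the library's standard examples, not taken from this diff.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# requires x-transformers >= 2.0.3, where Decoder / AttentionLayers accept rotate_num_heads
model = TransformerWrapper(
    num_tokens = 20000,      # assumed vocab size, not from this diff
    max_seq_len = 1024,      # assumed context length, not from this diff
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4   # only rotate 4 out of the 8 attention heads
    )
)

tokens = torch.randint(0, 20000, (1, 1024))
logits = model(tokens)   # (1, 1024, 20000)
```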
{x_transformers-2.0.2 → x_transformers-2.0.3}/x_transformers/x_transformers.py

@@ -1845,6 +1845,7 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
         weight_tie_layers = False,
         custom_layers: tuple[str, ...] | None = None,
         layers_execute_order: tuple[int, ...] | None = None,
@@ -2147,7 +2148,7 @@ class AttentionLayers(Module):
 
             if layer_type == 'a':
                 self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
                 is_first_self_attn = False
             elif layer_type == 'c':
                 layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
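These hunks plumb the new `rotate_num_heads` argument from `AttentionLayers` into each self-attention `Attention` layer. The sketch below illustrates the underlying idea only: apply rotary position embeddings to the first k heads and leave the remaining heads position-free. It is a hypothetical, self-contained illustration, not the library's internal implementation, and `apply_partial_rotary` / `rotate_half` are names introduced here for the example.

```python
import torch

def rotate_half(x):
    # standard rotary helper: negate-and-swap the two halves of the last dimension
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_partial_rotary(q, freqs, rotate_num_heads):
    # q: (batch, heads, seq, dim_head), freqs: (seq, dim_head)
    # rotate only the first `rotate_num_heads` heads, pass the rest through unchanged
    q_rot, q_pass = q[:, :rotate_num_heads], q[:, rotate_num_heads:]
    q_rot = q_rot * freqs.cos() + rotate_half(q_rot) * freqs.sin()
    return torch.cat((q_rot, q_pass), dim = 1)

# dummy tensors: 8 heads, rotate only 4 of them
q = torch.randn(2, 8, 16, 64)
inv_freq = 1. / (10000 ** (torch.arange(0, 64, 2).float() / 64))
freqs = torch.einsum('i,j->ij', torch.arange(16).float(), inv_freq)
freqs = torch.cat((freqs, freqs), dim = -1)   # (16, 64)
q = apply_partial_rotary(q, freqs, rotate_num_heads = 4)   # (2, 8, 16, 64)
```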