x-transformers 2.11.23__tar.gz → 2.11.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of x-transformers might be problematic.
- {x_transformers-2.11.23 → x_transformers-2.11.24}/PKG-INFO +13 -1
- {x_transformers-2.11.23 → x_transformers-2.11.24}/README.md +12 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/pyproject.toml +1 -1
- {x_transformers-2.11.23 → x_transformers-2.11.24}/tests/test_x_transformers.py +20 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py +29 -1
- {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/FUNDING.yml +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/.gitignore +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/LICENSE +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/data/README.md +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/data/enwik8.gz +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/all-attention.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/attention-on-attention.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/deepnorm.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/fcm.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/ffglu.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/flash-attention.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/gate_values.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/gating.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/macaron-1.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/macaron-2.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/memory-transformer.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/normformer.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/pia.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/resi_dual.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/residual_attn.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/rezero.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/rotary.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich-2.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich_norm.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/scalenorm.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/talking-heads.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/topk-attention.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/images/xval.png +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_belief_state.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_copy.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_enwik8.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_free.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_gpt_vae.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_length_extrapolate.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_parity.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/train_with_muon.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/__init__.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/attend.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/continuous.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/dpo.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/free_transformer.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/gpt_vae.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/xval.py +0 -0
{x_transformers-2.11.23 → x_transformers-2.11.24}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.11.23
+Version: 2.11.24
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers

@@ -2618,4 +2618,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.11.23 → x_transformers-2.11.24}/README.md

@@ -2569,4 +2569,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.11.23 → x_transformers-2.11.24}/tests/test_x_transformers.py

@@ -1488,3 +1488,23 @@ def test_belief_attn(
     x = torch.randint(0, 256, (1, 10))
 
     logits = model(x)
+
+def test_derf():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 6,
+            heads = 8,
+            attn_kv_heads = 4,
+            rotary_pos_emb = True,
+            use_derf = True
+        )
+    )
+
+    x = torch.randint(0, 256, (1, 10))
+
+    logits = model(x)
{x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py

@@ -941,6 +941,31 @@ class DynamicTanh(Module):
         gamma = self.gamma + self.gamma_offset
         return (x * pre_tanh_scale).tanh() * gamma + self.beta
 
+class Derf(Module):
+    """ https://arxiv.org/abs/2512.10938 """
+    def __init__(
+        self,
+        dim,
+        init_alpha = 0.5,
+        init_bias = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        scale_offset = 1. if unit_offset else 0.
+
+        self.alpha = nn.Parameter(tensor(init_alpha) - scale_offset)
+        self.s = nn.Parameter(tensor(init_bias))
+
+        self.gamma = nn.Parameter(torch.ones(dim) - scale_offset)
+        self.beta = nn.Parameter(torch.zeros(dim))
+
+        self.scale_offset = scale_offset
+
+    def forward(self, x):
+        x = x * (self.alpha + self.scale_offset) + self.s
+        activated = torch.erf(x)
+        return activated * (self.gamma + self.scale_offset) + self.beta
+
 # residual and residual gates
 
 class Residual(Module):
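For reference, the Derf module added above follows the same pattern as the DynamicTanh block directly before it, but squashes with the error function instead of tanh: roughly `y = erf(alpha * x + s) * gamma + beta`, a scalar learned scale and shift before `erf`, then a per-channel affine after it. A minimal sketch of that forward computation in plain torch (parameter names mirror the diff; this is illustrative only, not part of the released API):

```python
import torch

def derf_forward(x, alpha, s, gamma, beta):
    # same math as Derf.forward in the hunk above, for the unit_offset = False case:
    # scalar scale/shift, erf squashing, then a per-channel affine
    return torch.erf(x * alpha + s) * gamma + beta

dim = 512
x = torch.randn(2, 10, dim)

alpha = torch.tensor(0.5)    # init_alpha default from the diff
s     = torch.tensor(0.)     # init_bias default from the diff
gamma = torch.ones(dim)
beta  = torch.zeros(dim)

out = derf_forward(x, alpha, s, gamma, beta)
assert out.shape == x.shape  # shape-preserving, like the norm layers it replaces
```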
@@ -2123,6 +2148,7 @@ class AttentionLayers(Module):
         use_scalenorm = False,
         use_rmsnorm = False,
         use_dynamic_tanh = False,
+        use_derf = False,
         dynamic_tanh_init_alpha = 1.,
         use_simple_rmsnorm = False,
         use_adaptive_layernorm = False,

@@ -2277,7 +2303,7 @@ class AttentionLayers(Module):
 
         # determine norm
 
-        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
+        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_derf, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
 
         norm_need_condition = False
         dim_condition = default(dim_condition, dim)

@@ -2295,6 +2321,8 @@ class AttentionLayers(Module):
         elif use_dynamic_tanh:
             assert pre_norm, 'dynamic tanh norm only tested for pre-norm'
             norm_class = partial(DynamicTanh, init_alpha = dynamic_tanh_init_alpha)
+        elif use_derf:
+            norm_class = Derf
         elif use_adaptive_layernorm:
             norm_need_condition = True
             norm_class = partial(AdaptiveLayerNorm, dim_condition = dim_condition * dim_condition_mult)
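Taken together, these hunks expose the new normalization replacement through a single `use_derf` flag on `AttentionLayers`, which the added `test_derf` exercises via `Decoder`. As a hedged usage sketch, the same flag should also work on `Encoder`, since the kwarg lives on `AttentionLayers.__init__` itself (an assumption based on the @@ -2123 hunk above, not on anything stated in this release):

```python
import torch
from x_transformers import TransformerWrapper, Encoder

# assumption: use_derf is accepted by any AttentionLayers subclass, Encoder included,
# because the kwarg is added at the AttentionLayers level in this release
model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Encoder(
        dim = 512,
        depth = 4,
        heads = 8,
        use_derf = True   # every pre-block norm becomes a Derf module
    )
)

x = torch.randint(0, 256, (1, 10))
logits = model(x)  # shape (1, 10, 256)
```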