x-transformers 2.11.22.tar.gz → 2.11.24.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.11.22 → x_transformers-2.11.24}/PKG-INFO +13 -1
- {x_transformers-2.11.22 → x_transformers-2.11.24}/README.md +12 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/pyproject.toml +1 -1
- {x_transformers-2.11.22 → x_transformers-2.11.24}/tests/test_x_transformers.py +21 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/x_transformers.py +31 -2
- {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/FUNDING.yml +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/.gitignore +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/LICENSE +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/data/README.md +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/data/enwik8.gz +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/all-attention.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/attention-on-attention.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/deepnorm.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/fcm.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/ffglu.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/flash-attention.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/gate_values.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/gating.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/macaron-1.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/macaron-2.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/memory-transformer.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/normformer.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/pia.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/resi_dual.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/residual_attn.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/rezero.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/rotary.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich-2.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich_norm.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/scalenorm.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/talking-heads.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/topk-attention.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/images/xval.png +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_belief_state.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_copy.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_enwik8.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_free.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_gpt_vae.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_length_extrapolate.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_parity.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/train_with_muon.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/__init__.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/attend.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/continuous.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/dpo.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/free_transformer.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/gpt_vae.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/xval.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.11.22
+Version: 2.11.24
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
```
````diff
@@ -2618,4 +2618,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
````
README.md

````diff
@@ -2569,4 +2569,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
````
tests/test_x_transformers.py

```diff
@@ -1478,6 +1478,7 @@ def test_belief_attn(
             dim = 512,
             depth = 6,
             heads = 8,
+            attn_kv_heads = 4,
             rotary_pos_emb = True,
             attn_orthog_projected_values = orthog_project,
             attn_orthog_projected_values_per_head = orthog_project_per_head
```
```diff
@@ -1487,3 +1488,23 @@ def test_belief_attn(
     x = torch.randint(0, 256, (1, 10))
 
     logits = model(x)
+
+def test_derf():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 6,
+            heads = 8,
+            attn_kv_heads = 4,
+            rotary_pos_emb = True,
+            use_derf = True
+        )
+    )
+
+    x = torch.randint(0, 256, (1, 10))
+
+    logits = model(x)
```
x_transformers/x_transformers.py

```diff
@@ -941,6 +941,31 @@ class DynamicTanh(Module):
         gamma = self.gamma + self.gamma_offset
         return (x * pre_tanh_scale).tanh() * gamma + self.beta
 
+class Derf(Module):
+    """ https://arxiv.org/abs/2512.10938 """
+    def __init__(
+        self,
+        dim,
+        init_alpha = 0.5,
+        init_bias = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        scale_offset = 1. if unit_offset else 0.
+
+        self.alpha = nn.Parameter(tensor(init_alpha) - scale_offset)
+        self.s = nn.Parameter(tensor(init_bias))
+
+        self.gamma = nn.Parameter(torch.ones(dim) - scale_offset)
+        self.beta = nn.Parameter(torch.zeros(dim))
+
+        self.scale_offset = scale_offset
+
+    def forward(self, x):
+        x = x * (self.alpha + self.scale_offset) + self.s
+        activated = torch.erf(x)
+        return activated * (self.gamma + self.scale_offset) + self.beta
+
 # residual and residual gates
 
 class Residual(Module):
```
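For reference, the new `Derf` module is an element-wise, normalization-free replacement for the usual pre-norm: its forward pass computes `erf(alpha * x + s) * gamma + beta`, with `unit_offset` re-parameterizing `alpha` and `gamma` as offsets around 1. A minimal sketch of running it standalone; the import path and tensor sizes are assumptions for illustration, not part of this diff:

```python
# Minimal sketch: exercising the new Derf module on its own.
# Assumption: Derf is importable from x_transformers.x_transformers
# (it is defined at top level in the diffed file).
import torch
from x_transformers.x_transformers import Derf

norm = Derf(dim = 512)         # scalar alpha / s, per-dimension gamma / beta
x = torch.randn(1, 10, 512)    # (batch, seq, dim)

out = norm(x)                  # erf(alpha * x + s) * gamma + beta, element-wise
assert out.shape == x.shape    # shape-preserving, like the norms it stands in for
```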
```diff
@@ -1431,6 +1456,7 @@ class Attention(Module):
         assert divisible_by(heads, kv_heads)
 
         self.kv_heads = kv_heads
+        self.groups = heads // kv_heads
 
         q_dim = dim_head * heads
         k_dim = dim_head * kv_heads
```
```diff
@@ -2077,7 +2103,7 @@ class Attention(Module):
 
         if self.orthog_projected_values or self.orthog_projected_values_per_head:
             orthog_projected = []
-            v_for_proj =
+            v_for_proj = repeat(orig_values, 'b h n d -> b n (g h d)', g = self.groups)
 
             if self.orthog_projected_values:
                 projected = orthog_project(out, v_for_proj)
```
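The `v_for_proj` change accounts for grouped-query attention: when `attn_kv_heads` is smaller than `heads`, `orig_values` carries only `kv_heads` heads while the attention output has `heads`, so the values are repeated across the query groups (the new `self.groups` above) before the orthogonal projection. A small, self-contained shape check of that einops pattern; the sizes here are illustrative, not from the library:

```python
# Illustrative shape check for the repeat pattern used in the fix above.
import torch
from einops import repeat

batch, seq, dim_head = 2, 10, 64
heads, kv_heads = 8, 4
groups = heads // kv_heads                 # mirrors self.groups = heads // kv_heads

orig_values = torch.randn(batch, kv_heads, seq, dim_head)

# 'b h n d -> b n (g h d)': repeat the kv heads across the query groups and
# flatten heads into the feature dimension handed to the projection.
v_for_proj = repeat(orig_values, 'b h n d -> b n (g h d)', g = groups)

assert v_for_proj.shape == (batch, seq, groups * kv_heads * dim_head)   # (2, 10, 512)
```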
```diff
@@ -2122,6 +2148,7 @@ class AttentionLayers(Module):
         use_scalenorm = False,
         use_rmsnorm = False,
         use_dynamic_tanh = False,
+        use_derf = False,
         dynamic_tanh_init_alpha = 1.,
         use_simple_rmsnorm = False,
         use_adaptive_layernorm = False,
```
```diff
@@ -2276,7 +2303,7 @@ class AttentionLayers(Module):
 
         # determine norm
 
-        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
+        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_derf, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
 
         norm_need_condition = False
         dim_condition = default(dim_condition, dim)
```
```diff
@@ -2294,6 +2321,8 @@ class AttentionLayers(Module):
         elif use_dynamic_tanh:
             assert pre_norm, 'dynamic tanh norm only tested for pre-norm'
             norm_class = partial(DynamicTanh, init_alpha = dynamic_tanh_init_alpha)
+        elif use_derf:
+            norm_class = Derf
         elif use_adaptive_layernorm:
             norm_need_condition = True
             norm_class = partial(AdaptiveLayerNorm, dim_condition = dim_condition * dim_condition_mult)
```
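Taken together, setting `use_derf = True` selects `Derf` as the per-layer norm replacement, and the updated assertion keeps it mutually exclusive with the other norm flags. A minimal usage sketch mirroring the new `test_derf` above:

```python
# Minimal usage sketch, mirroring the new test_derf.
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        use_derf = True    # cannot be combined with use_rmsnorm etc., per the assertion above
    )
)

x = torch.randint(0, 256, (1, 10))
logits = model(x)          # (1, 10, 256)
```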