x-transformers 2.11.22__tar.gz → 2.11.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (68)
  1. {x_transformers-2.11.22 → x_transformers-2.11.24}/PKG-INFO +13 -1
  2. {x_transformers-2.11.22 → x_transformers-2.11.24}/README.md +12 -0
  3. {x_transformers-2.11.22 → x_transformers-2.11.24}/pyproject.toml +1 -1
  4. {x_transformers-2.11.22 → x_transformers-2.11.24}/tests/test_x_transformers.py +21 -0
  5. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/x_transformers.py +31 -2
  6. {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/FUNDING.yml +0 -0
  7. {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/workflows/python-publish.yml +0 -0
  8. {x_transformers-2.11.22 → x_transformers-2.11.24}/.github/workflows/python-test.yaml +0 -0
  9. {x_transformers-2.11.22 → x_transformers-2.11.24}/.gitignore +0 -0
  10. {x_transformers-2.11.22 → x_transformers-2.11.24}/LICENSE +0 -0
  11. {x_transformers-2.11.22 → x_transformers-2.11.24}/data/README.md +0 -0
  12. {x_transformers-2.11.22 → x_transformers-2.11.24}/data/enwik8.gz +0 -0
  13. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/all-attention.png +0 -0
  14. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/deepnorm.png +0 -0
  17. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/fcm.png +0 -0
  23. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/ffglu.png +0 -0
  24. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/flash-attention.png +0 -0
  25. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/gate_values.png +0 -0
  26. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/gating.png +0 -0
  27. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/macaron-1.png +0 -0
  29. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/macaron-2.png +0 -0
  30. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/normformer.png +0 -0
  32. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/pia.png +0 -0
  33. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/resi_dual.png +0 -0
  35. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/residual_attn.png +0 -0
  36. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/rezero.png +0 -0
  37. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/rotary.png +0 -0
  38. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich.png +0 -0
  40. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/scalenorm.png +0 -0
  42. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/talking-heads.png +0 -0
  43. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/topk-attention.png +0 -0
  44. {x_transformers-2.11.22 → x_transformers-2.11.24}/images/xval.png +0 -0
  45. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_belief_state.py +0 -0
  46. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_copy.py +0 -0
  47. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_entropy_tokenizer.py +0 -0
  48. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_enwik8.py +0 -0
  49. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_free.py +0 -0
  50. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_gpt_vae.py +0 -0
  51. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_length_extrapolate.py +0 -0
  52. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_parity.py +0 -0
  53. {x_transformers-2.11.22 → x_transformers-2.11.24}/train_with_muon.py +0 -0
  54. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/__init__.py +0 -0
  55. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/attend.py +0 -0
  56. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/autoregressive_wrapper.py +0 -0
  57. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/belief_state_wrapper.py +0 -0
  58. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/continuous.py +0 -0
  59. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/dpo.py +0 -0
  60. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/entropy_based_tokenizer.py +0 -0
  61. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/free_transformer.py +0 -0
  62. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/gpt_vae.py +0 -0
  63. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/multi_input.py +0 -0
  64. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/neo_mlp.py +0 -0
  65. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/nonautoregressive_wrapper.py +0 -0
  66. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/up_wrapper.py +0 -0
  67. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  68. {x_transformers-2.11.22 → x_transformers-2.11.24}/x_transformers/xval.py +0 -0
--- x_transformers-2.11.22/PKG-INFO
+++ x_transformers-2.11.24/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.11.22
+Version: 2.11.24
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2618,4 +2618,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
--- x_transformers-2.11.22/README.md
+++ x_transformers-2.11.24/README.md
@@ -2569,4 +2569,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
--- x_transformers-2.11.22/pyproject.toml
+++ x_transformers-2.11.24/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.11.22"
+version = "2.11.24"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
--- x_transformers-2.11.22/tests/test_x_transformers.py
+++ x_transformers-2.11.24/tests/test_x_transformers.py
@@ -1478,6 +1478,7 @@ def test_belief_attn(
             dim = 512,
             depth = 6,
             heads = 8,
+            attn_kv_heads = 4,
             rotary_pos_emb = True,
             attn_orthog_projected_values = orthog_project,
             attn_orthog_projected_values_per_head = orthog_project_per_head
@@ -1487,3 +1488,23 @@ def test_belief_attn(
     x = torch.randint(0, 256, (1, 10))
 
     logits = model(x)
+
+def test_derf():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 6,
+            heads = 8,
+            attn_kv_heads = 4,
+            rotary_pos_emb = True,
+            use_derf = True
+        )
+    )
+
+    x = torch.randint(0, 256, (1, 10))
+
+    logits = model(x)
--- x_transformers-2.11.22/x_transformers/x_transformers.py
+++ x_transformers-2.11.24/x_transformers/x_transformers.py
@@ -941,6 +941,31 @@ class DynamicTanh(Module):
         gamma = self.gamma + self.gamma_offset
         return (x * pre_tanh_scale).tanh() * gamma + self.beta
 
+class Derf(Module):
+    """ https://arxiv.org/abs/2512.10938 """
+    def __init__(
+        self,
+        dim,
+        init_alpha = 0.5,
+        init_bias = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        scale_offset = 1. if unit_offset else 0.
+
+        self.alpha = nn.Parameter(tensor(init_alpha) - scale_offset)
+        self.s = nn.Parameter(tensor(init_bias))
+
+        self.gamma = nn.Parameter(torch.ones(dim) - scale_offset)
+        self.beta = nn.Parameter(torch.zeros(dim))
+
+        self.scale_offset = scale_offset
+
+    def forward(self, x):
+        x = x * (self.alpha + self.scale_offset) + self.s
+        activated = torch.erf(x)
+        return activated * (self.gamma + self.scale_offset) + self.beta
+
 # residual and residual gates
 
 class Residual(Module):
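The `Derf` module added above is an erf-based counterpart to `DynamicTanh`: instead of normalizing activations, it squashes them with a bounded elementwise function and then applies a learned affine transform. Below is a minimal standalone sketch of that forward computation (omitting the `unit_offset` reparameterization), using only torch; `DerfSketch` is a hypothetical name for illustration, not part of the library.

```python
# Minimal standalone sketch of the Derf forward pass shown in the hunk above.
# Like DynamicTanh, it replaces LayerNorm with a bounded elementwise activation
# (erf instead of tanh) followed by a learned per-channel affine transform.
import torch
from torch import nn, tensor

class DerfSketch(nn.Module):
    def __init__(self, dim, init_alpha = 0.5, init_bias = 0.):
        super().__init__()
        self.alpha = nn.Parameter(tensor(init_alpha))   # scalar input scale
        self.s = nn.Parameter(tensor(init_bias))        # scalar input shift
        self.gamma = nn.Parameter(torch.ones(dim))      # per-channel gain
        self.beta = nn.Parameter(torch.zeros(dim))      # per-channel bias

    def forward(self, x):
        # erf squashes the scaled, shifted input into (-1, 1), then the
        # affine transform restores per-channel scale and offset
        return torch.erf(x * self.alpha + self.s) * self.gamma + self.beta

x = torch.randn(2, 10, 512)
out = DerfSketch(512)(x)
assert out.shape == x.shape
```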
@@ -1431,6 +1456,7 @@ class Attention(Module):
         assert divisible_by(heads, kv_heads)
 
         self.kv_heads = kv_heads
+        self.groups = heads // kv_heads
 
         q_dim = dim_head * heads
         k_dim = dim_head * kv_heads
@@ -2077,7 +2103,7 @@ class Attention(Module):
 
         if self.orthog_projected_values or self.orthog_projected_values_per_head:
             orthog_projected = []
-            v_for_proj = self.merge_heads(orig_values)
+            v_for_proj = repeat(orig_values, 'b h n d -> b n (g h d)', g = self.groups)
 
             if self.orthog_projected_values:
                 projected = orthog_project(out, v_for_proj)
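The one-line change above fixes the orthogonally-projected-values path for grouped key/value heads: the attention output is computed over all `heads` query heads, while `orig_values` only has `kv_heads` heads, so flattening the values directly with `self.merge_heads(orig_values)` yields a narrower feature dimension whenever `kv_heads < heads`. Repeating each KV head across its `groups = heads // kv_heads` query-head group restores matching widths. A shape-only sketch with hypothetical sizes:

```python
# Shape sketch for the grouped-query fix above: with heads = 8 and
# attn_kv_heads = 4 there are groups = 2 query heads per KV head, so the
# values must be repeated across groups before being flattened to match
# the merged attention output.
import torch
from einops import rearrange, repeat

b, n, d = 1, 10, 64
heads, kv_heads = 8, 4
groups = heads // kv_heads                      # 2 query heads share each KV head

out = torch.randn(b, heads, n, d)               # attention output, full head count
orig_values = torch.randn(b, kv_heads, n, d)    # values only have kv_heads heads

merged_out = rearrange(out, 'b h n d -> b n (h d)')                      # (1, 10, 512)
v_for_proj = repeat(orig_values, 'b h n d -> b n (g h d)', g = groups)   # (1, 10, 512)

assert merged_out.shape == v_for_proj.shape     # widths now line up for the projection
```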
@@ -2122,6 +2148,7 @@ class AttentionLayers(Module):
         use_scalenorm = False,
         use_rmsnorm = False,
         use_dynamic_tanh = False,
+        use_derf = False,
         dynamic_tanh_init_alpha = 1.,
         use_simple_rmsnorm = False,
         use_adaptive_layernorm = False,
@@ -2276,7 +2303,7 @@ class AttentionLayers(Module):
 
         # determine norm
 
-        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
+        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_derf, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
 
         norm_need_condition = False
         dim_condition = default(dim_condition, dim)
@@ -2294,6 +2321,8 @@ class AttentionLayers(Module):
         elif use_dynamic_tanh:
             assert pre_norm, 'dynamic tanh norm only tested for pre-norm'
             norm_class = partial(DynamicTanh, init_alpha = dynamic_tanh_init_alpha)
+        elif use_derf:
+            norm_class = Derf
         elif use_adaptive_layernorm:
             norm_need_condition = True
             norm_class = partial(AdaptiveLayerNorm, dim_condition = dim_condition * dim_condition_mult)
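With the plumbing above, `use_derf = True` selects `Derf` as the per-layer norm class, much as `use_dynamic_tanh` selects `DynamicTanh`. A usage sketch mirroring the new `test_derf` test; the output-shape comment assumes the usual `TransformerWrapper` behavior of returning per-token logits:

```python
# Usage sketch: use_derf = True on the Decoder selects Derf in place of
# LayerNorm. Combining it with another norm flag (e.g. use_rmsnorm = True)
# would trip the at_most_one_of assert shown above.
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 2,          # shallow depth, just for a quick smoke test
        heads = 8,
        use_derf = True     # erf-based normalization-free layers
    )
)

x = torch.randint(0, 256, (1, 10))
logits = model(x)           # (1, 10, 256): per-token logits over the vocab
```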