x-transformers 2.11.23.tar.gz → 2.11.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (68)
  1. {x_transformers-2.11.23 → x_transformers-2.11.24}/PKG-INFO +13 -1
  2. {x_transformers-2.11.23 → x_transformers-2.11.24}/README.md +12 -0
  3. {x_transformers-2.11.23 → x_transformers-2.11.24}/pyproject.toml +1 -1
  4. {x_transformers-2.11.23 → x_transformers-2.11.24}/tests/test_x_transformers.py +20 -0
  5. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py +29 -1
  6. {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/FUNDING.yml +0 -0
  7. {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/workflows/python-publish.yml +0 -0
  8. {x_transformers-2.11.23 → x_transformers-2.11.24}/.github/workflows/python-test.yaml +0 -0
  9. {x_transformers-2.11.23 → x_transformers-2.11.24}/.gitignore +0 -0
  10. {x_transformers-2.11.23 → x_transformers-2.11.24}/LICENSE +0 -0
  11. {x_transformers-2.11.23 → x_transformers-2.11.24}/data/README.md +0 -0
  12. {x_transformers-2.11.23 → x_transformers-2.11.24}/data/enwik8.gz +0 -0
  13. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/all-attention.png +0 -0
  14. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/deepnorm.png +0 -0
  17. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/fcm.png +0 -0
  23. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/ffglu.png +0 -0
  24. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/flash-attention.png +0 -0
  25. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/gate_values.png +0 -0
  26. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/gating.png +0 -0
  27. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/macaron-1.png +0 -0
  29. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/macaron-2.png +0 -0
  30. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/normformer.png +0 -0
  32. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/pia.png +0 -0
  33. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/resi_dual.png +0 -0
  35. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/residual_attn.png +0 -0
  36. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/rezero.png +0 -0
  37. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/rotary.png +0 -0
  38. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich.png +0 -0
  40. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/scalenorm.png +0 -0
  42. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/talking-heads.png +0 -0
  43. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/topk-attention.png +0 -0
  44. {x_transformers-2.11.23 → x_transformers-2.11.24}/images/xval.png +0 -0
  45. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_belief_state.py +0 -0
  46. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_copy.py +0 -0
  47. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_entropy_tokenizer.py +0 -0
  48. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_enwik8.py +0 -0
  49. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_free.py +0 -0
  50. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_gpt_vae.py +0 -0
  51. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_length_extrapolate.py +0 -0
  52. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_parity.py +0 -0
  53. {x_transformers-2.11.23 → x_transformers-2.11.24}/train_with_muon.py +0 -0
  54. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/__init__.py +0 -0
  55. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/attend.py +0 -0
  56. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/autoregressive_wrapper.py +0 -0
  57. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/belief_state_wrapper.py +0 -0
  58. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/continuous.py +0 -0
  59. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/dpo.py +0 -0
  60. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/entropy_based_tokenizer.py +0 -0
  61. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/free_transformer.py +0 -0
  62. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/gpt_vae.py +0 -0
  63. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/multi_input.py +0 -0
  64. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/neo_mlp.py +0 -0
  65. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/nonautoregressive_wrapper.py +0 -0
  66. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/up_wrapper.py +0 -0
  67. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  68. {x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/xval.py +0 -0
{x_transformers-2.11.23 → x_transformers-2.11.24}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.11.23
+Version: 2.11.24
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2618,4 +2618,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.11.23 → x_transformers-2.11.24}/README.md

@@ -2569,4 +2569,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{chen2025strongernormalizationfreetransformers,
+    title = {Stronger Normalization-Free Transformers},
+    author = {Mingzhi Chen and Taiming Lu and Jiachen Zhu and Mingjie Sun and Zhuang Liu},
+    year = {2025},
+    eprint = {2512.10938},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2512.10938},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.11.23 → x_transformers-2.11.24}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.11.23"
+version = "2.11.24"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
{x_transformers-2.11.23 → x_transformers-2.11.24}/tests/test_x_transformers.py

@@ -1488,3 +1488,23 @@ def test_belief_attn(
     x = torch.randint(0, 256, (1, 10))
 
     logits = model(x)
+
+def test_derf():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 6,
+            heads = 8,
+            attn_kv_heads = 4,
+            rotary_pos_emb = True,
+            use_derf = True
+        )
+    )
+
+    x = torch.randint(0, 256, (1, 10))
+
+    logits = model(x)
{x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py

@@ -941,6 +941,31 @@ class DynamicTanh(Module):
         gamma = self.gamma + self.gamma_offset
         return (x * pre_tanh_scale).tanh() * gamma + self.beta
 
+class Derf(Module):
+    """ https://arxiv.org/abs/2512.10938 """
+    def __init__(
+        self,
+        dim,
+        init_alpha = 0.5,
+        init_bias = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        scale_offset = 1. if unit_offset else 0.
+
+        self.alpha = nn.Parameter(tensor(init_alpha) - scale_offset)
+        self.s = nn.Parameter(tensor(init_bias))
+
+        self.gamma = nn.Parameter(torch.ones(dim) - scale_offset)
+        self.beta = nn.Parameter(torch.zeros(dim))
+
+        self.scale_offset = scale_offset
+
+    def forward(self, x):
+        x = x * (self.alpha + self.scale_offset) + self.s
+        activated = torch.erf(x)
+        return activated * (self.gamma + self.scale_offset) + self.beta
+
 # residual and residual gates
 
 class Residual(Module):
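The new `Derf` module follows the same normalization-free recipe as `DynamicTanh`, but squashes with the error function instead of tanh: the forward pass computes `erf(alpha * x + s) * gamma + beta` elementwise, where `alpha` and `s` are learned scalars and `gamma`, `beta` are per-feature vectors. A minimal standalone sketch of that computation, using a hypothetical `derf_forward` helper rather than the class itself:

```python
import torch
from torch import tensor

# sketch of what the Derf norm replacement computes:
#   y = erf(alpha * x + s) * gamma + beta
def derf_forward(x, alpha, s, gamma, beta):
    return torch.erf(x * alpha + s) * gamma + beta

x = torch.randn(2, 10, 512)

# default initialization from the diff: alpha = 0.5, s = 0, gamma = ones, beta = zeros
out = derf_forward(x, tensor(0.5), tensor(0.), torch.ones(512), torch.zeros(512))
print(out.shape)  # torch.Size([2, 10, 512])
```

The `unit_offset` flag only changes the parameterization: the stored `alpha` and `gamma` are shifted so they start at zero, and the constant offset is added back in `forward`, so the function computed at initialization is the same either way.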
{x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py (continued)

@@ -2123,6 +2148,7 @@ class AttentionLayers(Module):
         use_scalenorm = False,
         use_rmsnorm = False,
         use_dynamic_tanh = False,
+        use_derf = False,
         dynamic_tanh_init_alpha = 1.,
         use_simple_rmsnorm = False,
         use_adaptive_layernorm = False,
@@ -2277,7 +2303,7 @@
 
         # determine norm
 
-        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
+        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_derf, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
 
         norm_need_condition = False
         dim_condition = default(dim_condition, dim)
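With `use_derf` added to the `at_most_one_of` guard, selecting more than one norm variant now fails at construction time rather than silently picking one. A hedged sketch of the behavior, assuming the usual `Decoder` constructor:

```python
from x_transformers import Decoder

# combining use_derf with another norm selection trips the at_most_one_of assert above
try:
    Decoder(dim = 512, depth = 2, heads = 8, use_derf = True, use_rmsnorm = True)
except AssertionError as err:
    print(err)  # 'you can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
```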
{x_transformers-2.11.23 → x_transformers-2.11.24}/x_transformers/x_transformers.py (continued)

@@ -2295,6 +2321,8 @@
         elif use_dynamic_tanh:
             assert pre_norm, 'dynamic tanh norm only tested for pre-norm'
             norm_class = partial(DynamicTanh, init_alpha = dynamic_tanh_init_alpha)
+        elif use_derf:
+            norm_class = Derf
         elif use_adaptive_layernorm:
             norm_need_condition = True
             norm_class = partial(AdaptiveLayerNorm, dim_condition = dim_condition * dim_condition_mult)
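Wired through as above, the feature is opt-in: passing `use_derf = True` to the attention layers swaps the per-layer norm class for `Derf`, the same way `use_dynamic_tanh` selects `DynamicTanh`. A usage sketch mirroring the new `test_derf` test, trimmed to the parts relevant to the flag:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# decoder-only model with the erf-based norm replacement enabled
model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        use_derf = True
    )
)

x = torch.randint(0, 256, (1, 10))
logits = model(x)  # shape (1, 10, 256)
```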