x-transformers 2.4.11__tar.gz → 2.4.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {x_transformers-2.4.11 → x_transformers-2.4.14}/PKG-INFO +1 -1
  2. {x_transformers-2.4.11 → x_transformers-2.4.14}/pyproject.toml +1 -1
  3. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/autoregressive_wrapper.py +1 -1
  4. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/x_transformers.py +9 -0
  5. {x_transformers-2.4.11 → x_transformers-2.4.14}/.github/FUNDING.yml +0 -0
  6. {x_transformers-2.4.11 → x_transformers-2.4.14}/.github/workflows/python-publish.yml +0 -0
  7. {x_transformers-2.4.11 → x_transformers-2.4.14}/.github/workflows/python-test.yaml +0 -0
  8. {x_transformers-2.4.11 → x_transformers-2.4.14}/.gitignore +0 -0
  9. {x_transformers-2.4.11 → x_transformers-2.4.14}/LICENSE +0 -0
  10. {x_transformers-2.4.11 → x_transformers-2.4.14}/README.md +0 -0
  11. {x_transformers-2.4.11 → x_transformers-2.4.14}/data/README.md +0 -0
  12. {x_transformers-2.4.11 → x_transformers-2.4.14}/data/enwik8.gz +0 -0
  13. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/all-attention.png +0 -0
  14. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/deepnorm.png +0 -0
  17. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/fcm.png +0 -0
  23. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/ffglu.png +0 -0
  24. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/flash-attention.png +0 -0
  25. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/gate_values.png +0 -0
  26. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/gating.png +0 -0
  27. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/macaron-1.png +0 -0
  29. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/macaron-2.png +0 -0
  30. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/normformer.png +0 -0
  32. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/pia.png +0 -0
  33. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/resi_dual.png +0 -0
  35. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/residual_attn.png +0 -0
  36. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/rezero.png +0 -0
  37. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/rotary.png +0 -0
  38. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/sandwich.png +0 -0
  40. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/scalenorm.png +0 -0
  42. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/talking-heads.png +0 -0
  43. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/topk-attention.png +0 -0
  44. {x_transformers-2.4.11 → x_transformers-2.4.14}/images/xval.png +0 -0
  45. {x_transformers-2.4.11 → x_transformers-2.4.14}/tests/test_x_transformers.py +0 -0
  46. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_belief_state.py +0 -0
  47. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_copy.py +0 -0
  48. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_entropy_tokenizer.py +0 -0
  49. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_enwik8.py +0 -0
  50. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_length_extrapolate.py +0 -0
  51. {x_transformers-2.4.11 → x_transformers-2.4.14}/train_parity.py +0 -0
  52. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/__init__.py +0 -0
  53. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/attend.py +0 -0
  54. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/belief_state_wrapper.py +0 -0
  55. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/continuous.py +0 -0
  56. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/dpo.py +0 -0
  57. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/entropy_based_tokenizer.py +0 -0
  58. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/multi_input.py +0 -0
  59. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/neo_mlp.py +0 -0
  60. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/nonautoregressive_wrapper.py +0 -0
  61. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/up_wrapper.py +0 -0
  62. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  63. {x_transformers-2.4.11 → x_transformers-2.4.14}/x_transformers/xval.py +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: x-transformers
- Version: 2.4.11
+ Version: 2.4.14
  Summary: X-Transformers
  Project-URL: Homepage, https://pypi.org/project/x-transformers/
  Project-URL: Repository, https://github.com/lucidrains/x-transformers

pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "x-transformers"
- version = "2.4.11"
+ version = "2.4.14"
  description = "X-Transformers"
  authors = [
  { name = "Phil Wang", email = "lucidrains@gmail.com" }

x_transformers/autoregressive_wrapper.py

@@ -188,7 +188,7 @@ class AutoregressiveWrapper(Module):
  temperature = 1.,
  stochastic = False,
  prompt_lens: Tensor | None = None,
- filter_logits_fn: str | Callable = top_k,
+ filter_logits_fn: str | Callable = identity,
  restrict_to_max_seq_len = True,
  filter_kwargs: dict = dict(),
  cache_kv = True,
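
The only change in x_transformers/autoregressive_wrapper.py is the default of filter_logits_fn in the generate signature, which moves from top_k to identity, so logits are sampled unfiltered at the given temperature unless a filter is passed explicitly. A minimal sketch of how a caller could keep the previous top-k behavior; the model construction is illustrative and not taken from this diff:

```python
# Sketch only: restore the old default by passing top_k explicitly,
# since filter_logits_fn now defaults to identity (no logit filtering).
import torch
from x_transformers import TransformerWrapper, Decoder, AutoregressiveWrapper
from x_transformers.autoregressive_wrapper import top_k  # the previous default

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 6, heads = 8)
)

wrapped = AutoregressiveWrapper(model)

prompt = torch.randint(0, 256, (1, 4))

# filter_kwargs (also visible in the signature above) is forwarded to the filter function
sampled = wrapped.generate(prompt, 64, filter_logits_fn = top_k)
```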

x_transformers/x_transformers.py

@@ -1304,6 +1304,7 @@ class Attention(Module):
  qk_norm_groups = 1,
  qk_norm_scale = 10,
  qk_norm_dim_scale = False,
+ value_rmsnorm = False, # used in alphagenome and bytedance's GR3 for further stability
  l2_distance = False,
  sigmoid = False,
  selective = False,

@@ -1458,6 +1459,10 @@ class Attention(Module):
  assert (not qk_norm) or divisible_by(dim_head, qk_norm_groups), 'dimension per attention head must be divisible by the qk norm groups'
  assert not (qk_norm and (dim_head // qk_norm_groups) <= 2), 'the group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly)'

+ # value rms norm
+
+ self.value_rmsnorm = MultiheadRMSNorm(dim_head, heads = heads) if value_rmsnorm else None
+
  # contextual positional encoding
  # https://arxiv.org/html/2405.18719v2

@@ -1697,6 +1702,10 @@ class Attention(Module):
  q = q * self.qk_norm_q_scale
  k = k * self.qk_norm_k_scale

+ # maybe value rmsnorm
+
+ v = maybe(self.value_rmsnorm)(v)
+
  # take care of caching

  if not is_multi_latent_attn:
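
The substantive change in x_transformers/x_transformers.py is the new value_rmsnorm flag on Attention: when enabled, a per-head RMSNorm (MultiheadRMSNorm(dim_head, heads = heads)) is instantiated and applied to the value heads before attention, with maybe(...) acting as a no-op when the flag is off. The inline comment credits AlphaGenome and ByteDance's GR3 with using this for extra stability. Below is a rough sketch of what per-head RMS normalization of the values amounts to; the class is an assumption written for illustration, not the library's MultiheadRMSNorm:

```python
# Illustrative sketch (assumption, not library code): RMS-normalize each value
# head over its feature dimension, with a learned per-head gain.
import torch
from torch import nn
import torch.nn.functional as F

class PerHeadRMSNorm(nn.Module):
    def __init__(self, dim_head, heads):
        super().__init__()
        self.scale = dim_head ** 0.5
        # one learned gain vector per head, broadcast over batch and sequence
        self.gamma = nn.Parameter(torch.ones(heads, 1, dim_head))

    def forward(self, v):
        # v: (batch, heads, seq_len, dim_head)
        # unit-RMS normalize along dim_head, then rescale
        return F.normalize(v, dim = -1) * self.scale * self.gamma

v = torch.randn(2, 8, 16, 64)             # (batch, heads, seq, dim_head)
norm = PerHeadRMSNorm(dim_head = 64, heads = 8)
assert norm(v).shape == v.shape
```

If models are built through the usual Decoder kwargs, this would presumably be switched on with attn_value_rmsnorm = True, following the library's attn_ prefix convention for routing arguments into Attention.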