x-transformers 2.2.11.tar.gz → 2.2.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {x_transformers-2.2.11 → x_transformers-2.2.12}/PKG-INFO +1 -1
  2. {x_transformers-2.2.11 → x_transformers-2.2.12}/pyproject.toml +1 -1
  3. {x_transformers-2.2.11 → x_transformers-2.2.12}/tests/test_x_transformers.py +1 -0
  4. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/x_transformers.py +19 -2
  5. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/FUNDING.yml +0 -0
  6. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/workflows/python-publish.yml +0 -0
  7. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/workflows/python-test.yaml +0 -0
  8. {x_transformers-2.2.11 → x_transformers-2.2.12}/.gitignore +0 -0
  9. {x_transformers-2.2.11 → x_transformers-2.2.12}/LICENSE +0 -0
  10. {x_transformers-2.2.11 → x_transformers-2.2.12}/README.md +0 -0
  11. {x_transformers-2.2.11 → x_transformers-2.2.12}/data/README.md +0 -0
  12. {x_transformers-2.2.11 → x_transformers-2.2.12}/data/enwik8.gz +0 -0
  13. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/all-attention.png +0 -0
  14. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/deepnorm.png +0 -0
  17. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/fcm.png +0 -0
  23. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/ffglu.png +0 -0
  24. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/flash-attention.png +0 -0
  25. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/gate_values.png +0 -0
  26. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/gating.png +0 -0
  27. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/macaron-1.png +0 -0
  29. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/macaron-2.png +0 -0
  30. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/normformer.png +0 -0
  32. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/pia.png +0 -0
  33. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/resi_dual.png +0 -0
  35. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/residual_attn.png +0 -0
  36. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/rezero.png +0 -0
  37. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/rotary.png +0 -0
  38. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich.png +0 -0
  40. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/scalenorm.png +0 -0
  42. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/talking-heads.png +0 -0
  43. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/topk-attention.png +0 -0
  44. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/xval.png +0 -0
  45. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_belief_state.py +0 -0
  46. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_copy.py +0 -0
  47. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_entropy_tokenizer.py +0 -0
  48. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_enwik8.py +0 -0
  49. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_length_extrapolate.py +0 -0
  50. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_parity.py +0 -0
  51. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/__init__.py +0 -0
  52. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/attend.py +0 -0
  53. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/autoregressive_wrapper.py +0 -0
  54. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/belief_state_wrapper.py +0 -0
  55. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/continuous.py +0 -0
  56. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/dpo.py +0 -0
  57. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/entropy_based_tokenizer.py +0 -0
  58. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/multi_input.py +0 -0
  59. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/neo_mlp.py +0 -0
  60. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/nonautoregressive_wrapper.py +0 -0
  61. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  62. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/xval.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.2.11
+Version: 2.2.12
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.2.11"
+version = "2.2.12"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
tests/test_x_transformers.py
@@ -826,6 +826,7 @@ def test_entropy_based_tokenizer_max_token_len():
     token_lengths = tokenizer(seq, lens = lens)

     assert token_lengths.amax().item() <= 4
+    assert token_lengths.sum().item() == 14

 def test_custom_ff_activation():

x_transformers/x_transformers.py
@@ -62,6 +62,9 @@ def default(val, d):
         return val
     return d() if callable(d) else d

+def identity(t, *args, **kwargs):
+    return t
+
 def first(it, default = None):
     return it[0] if len(it) > 0 else default

@@ -74,7 +77,10 @@ def cast_tuple(val, depth = 1):
 def divisible_by(num, den):
     return (num % den) == 0

-def maybe(fn):
+def maybe(fn = None):
+    if not exists(fn):
+        fn = identity
+
     @wraps(fn)
     def inner(x, *args, **kwargs):
         if not exists(x):
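For clarity, the helper change can be read in isolation: `maybe` now accepts `fn = None` and falls back to the new `identity` function, so a caller can wrap an optional module (such as the sublayer dropout added further down) without an explicit None check. The sketch below is self-contained; the helpers mirror the diff, the tail of `inner` is filled in from the library's existing definition, and the toy usage at the end is illustrative only.

from functools import wraps

def exists(val):
    return val is not None

def identity(t, *args, **kwargs):
    # new helper: returns its first argument untouched
    return t

def maybe(fn = None):
    # fn = None now degrades gracefully to a pass-through
    if not exists(fn):
        fn = identity

    @wraps(fn)
    def inner(x, *args, **kwargs):
        if not exists(x):
            return x
        return fn(x, *args, **kwargs)
    return inner

# toy usage: an optional transform that may be absent
optional_dropout = None
out = maybe(optional_dropout)('unchanged')   # -> 'unchanged'
assert out == 'unchanged'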
@@ -1199,6 +1205,7 @@ class FeedForward(Module):
        custom_activation = None,
        post_act_ln = False,
        dropout = 0.,
+       sublayer_dropout = 0.,
        no_bias = False,
        zero_init_output = False
    ):
@@ -1227,7 +1234,8 @@ class FeedForward(Module):
            project_in,
            LayerNorm(inner_dim) if post_act_ln else None,
            nn.Dropout(dropout),
-           nn.Linear(inner_dim, dim_out, bias = not no_bias)
+           nn.Linear(inner_dim, dim_out, bias = not no_bias),
+           nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
        )

        # init last linear layer to 0
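The new `sublayer_dropout` argument adds a second Dropout after the feedforward's output projection, on top of the existing `dropout` that sits before it. A minimal sketch of how it could be used, assuming `FeedForward` is imported directly from the module touched in this diff:

import torch
from x_transformers.x_transformers import FeedForward

# dropout          -> existing, applied to the hidden activations before the output projection
# sublayer_dropout -> new in 2.2.12, applied to the sublayer's final output
ff = FeedForward(dim = 512, mult = 4, dropout = 0.1, sublayer_dropout = 0.1)

x = torch.randn(2, 1024, 512)
out = ff(x)   # shape preserved: (2, 1024, 512)

Leaving `sublayer_dropout` at its default of 0. produces a `None` entry that is handled the same way as the optional post-activation LayerNorm in the same container, so existing configurations behave as before.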
@@ -1256,6 +1264,7 @@ class Attention(Module):
        sparse_topk_straight_through = False,
        num_mem_kv = 0,
        dropout = 0.,
+       sublayer_dropout = 0.,
        on_attn = False,
        gate_value_heads = False,
        swiglu_values = False,
@@ -1534,6 +1543,10 @@ class Attention(Module):
        dim_out = default(dim_out, dim)
        self.to_out = nn.Sequential(LinearNoBias(out_dim, dim_out * 2), nn.GLU()) if on_attn else LinearNoBias(out_dim, dim_out)

+       # sublayer dropout
+
+       self.sublayer_dropout = nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
+
        # the number of attention heads to rotate, for decoupled rope in multi-latent attention

        rotate_num_heads = default(rotate_num_heads, heads)
@@ -1871,6 +1884,10 @@ class Attention(Module):

        out = self.to_out(out)

+       # maybe sublayer dropout
+
+       out = maybe(self.sublayer_dropout)(out)
+
        if exists(mask):
            out = einx.where('b n, b n d, -> b n d', mask, out, 0.)

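The same knob is wired through `Attention`: the module builds `self.sublayer_dropout` only when the rate is positive, and the forward pass applies it via `maybe(...)`, so a `None` module is a no-op. Below is a sketch of how the option could be reached from the high-level wrappers; the `attn_`/`ff_`-prefixed kwarg routing of `Decoder` is assumed from the library's usual convention (as with `attn_dropout` / `ff_dropout`) and is not itself part of this diff.

import torch
from x_transformers import TransformerWrapper, Decoder

# assumption: attn_* / ff_* prefixes route to Attention / FeedForward kwargs,
# so the new sublayer_dropout would be reachable as below
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_sublayer_dropout = 0.1,   # dropout on each attention block's output
        ff_sublayer_dropout = 0.1      # dropout on each feedforward block's output
    )
)

tokens = torch.randint(0, 20000, (1, 1024))
logits = model(tokens)   # (1, 1024, 20000)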
All remaining files listed above are unchanged between the two versions.