x-transformers 2.2.11.tar.gz → 2.2.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {x_transformers-2.2.11 → x_transformers-2.2.12}/PKG-INFO +1 -1
  2. {x_transformers-2.2.11 → x_transformers-2.2.12}/pyproject.toml +1 -1
  3. {x_transformers-2.2.11 → x_transformers-2.2.12}/tests/test_x_transformers.py +1 -0
  4. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/x_transformers.py +19 -2
  5. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/FUNDING.yml +0 -0
  6. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/workflows/python-publish.yml +0 -0
  7. {x_transformers-2.2.11 → x_transformers-2.2.12}/.github/workflows/python-test.yaml +0 -0
  8. {x_transformers-2.2.11 → x_transformers-2.2.12}/.gitignore +0 -0
  9. {x_transformers-2.2.11 → x_transformers-2.2.12}/LICENSE +0 -0
  10. {x_transformers-2.2.11 → x_transformers-2.2.12}/README.md +0 -0
  11. {x_transformers-2.2.11 → x_transformers-2.2.12}/data/README.md +0 -0
  12. {x_transformers-2.2.11 → x_transformers-2.2.12}/data/enwik8.gz +0 -0
  13. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/all-attention.png +0 -0
  14. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/attention-on-attention.png +0 -0
  15. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/cosine-sim-attention.png +0 -0
  16. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/deepnorm.png +0 -0
  17. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-linear.png +0 -0
  18. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-log.png +0 -0
  19. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  20. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/dynamic-pos-bias.png +0 -0
  21. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/enhanced-recurrence.png +0 -0
  22. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/fcm.png +0 -0
  23. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/ffglu.png +0 -0
  24. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/flash-attention.png +0 -0
  25. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/gate_values.png +0 -0
  26. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/gating.png +0 -0
  27. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/length-extrapolation-scale.png +0 -0
  28. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/macaron-1.png +0 -0
  29. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/macaron-2.png +0 -0
  30. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/memory-transformer.png +0 -0
  31. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/normformer.png +0 -0
  32. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/pia.png +0 -0
  33. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/qknorm-analysis.png +0 -0
  34. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/resi_dual.png +0 -0
  35. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/residual_attn.png +0 -0
  36. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/rezero.png +0 -0
  37. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/rotary.png +0 -0
  38. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich-2.png +0 -0
  39. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich.png +0 -0
  40. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/sandwich_norm.png +0 -0
  41. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/scalenorm.png +0 -0
  42. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/talking-heads.png +0 -0
  43. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/topk-attention.png +0 -0
  44. {x_transformers-2.2.11 → x_transformers-2.2.12}/images/xval.png +0 -0
  45. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_belief_state.py +0 -0
  46. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_copy.py +0 -0
  47. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_entropy_tokenizer.py +0 -0
  48. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_enwik8.py +0 -0
  49. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_length_extrapolate.py +0 -0
  50. {x_transformers-2.2.11 → x_transformers-2.2.12}/train_parity.py +0 -0
  51. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/__init__.py +0 -0
  52. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/attend.py +0 -0
  53. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/autoregressive_wrapper.py +0 -0
  54. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/belief_state_wrapper.py +0 -0
  55. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/continuous.py +0 -0
  56. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/dpo.py +0 -0
  57. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/entropy_based_tokenizer.py +0 -0
  58. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/multi_input.py +0 -0
  59. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/neo_mlp.py +0 -0
  60. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/nonautoregressive_wrapper.py +0 -0
  61. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  62. {x_transformers-2.2.11 → x_transformers-2.2.12}/x_transformers/xval.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.2.11
+Version: 2.2.12
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.2.11"
+version = "2.2.12"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
tests/test_x_transformers.py
@@ -826,6 +826,7 @@ def test_entropy_based_tokenizer_max_token_len():
     token_lengths = tokenizer(seq, lens = lens)

     assert token_lengths.amax().item() <= 4
+    assert token_lengths.sum().item() == 14

 def test_custom_ff_activation():

x_transformers/x_transformers.py
@@ -62,6 +62,9 @@ def default(val, d):
         return val
     return d() if callable(d) else d

+def identity(t, *args, **kwargs):
+    return t
+
 def first(it, default = None):
     return it[0] if len(it) > 0 else default

@@ -74,7 +77,10 @@ def cast_tuple(val, depth = 1):
 def divisible_by(num, den):
     return (num % den) == 0

-def maybe(fn):
+def maybe(fn = None):
+    if not exists(fn):
+        fn = identity
+
     @wraps(fn)
     def inner(x, *args, **kwargs):
         if not exists(x):
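For clarity, the helper change can be read in isolation: `maybe` now accepts `fn = None` and falls back to the new `identity` function, so a caller can wrap an optional module (such as the sublayer dropout added further down) without an explicit None check. The sketch below is self-contained; the helpers mirror the diff, the tail of `inner` is filled in from the library's existing definition, and the toy usage at the end is illustrative only.

from functools import wraps

def exists(val):
    return val is not None

def identity(t, *args, **kwargs):
    # new helper: returns its first argument untouched
    return t

def maybe(fn = None):
    # fn = None now degrades gracefully to a pass-through
    if not exists(fn):
        fn = identity

    @wraps(fn)
    def inner(x, *args, **kwargs):
        if not exists(x):
            return x
        return fn(x, *args, **kwargs)
    return inner

# toy usage: an optional transform that may be absent
optional_dropout = None
out = maybe(optional_dropout)('unchanged')   # -> 'unchanged'
assert out == 'unchanged'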
@@ -1199,6 +1205,7 @@ class FeedForward(Module):
        custom_activation = None,
        post_act_ln = False,
        dropout = 0.,
+       sublayer_dropout = 0.,
        no_bias = False,
        zero_init_output = False
    ):
@@ -1227,7 +1234,8 @@ class FeedForward(Module):
            project_in,
            LayerNorm(inner_dim) if post_act_ln else None,
            nn.Dropout(dropout),
-           nn.Linear(inner_dim, dim_out, bias = not no_bias)
+           nn.Linear(inner_dim, dim_out, bias = not no_bias),
+           nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
        )

        # init last linear layer to 0
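The new `sublayer_dropout` argument adds a second Dropout after the feedforward's output projection, on top of the existing `dropout` that sits before it. A minimal sketch of how it could be used, assuming `FeedForward` is imported directly from the module touched in this diff:

import torch
from x_transformers.x_transformers import FeedForward

# dropout          -> existing, applied to the hidden activations before the output projection
# sublayer_dropout -> new in 2.2.12, applied to the sublayer's final output
ff = FeedForward(dim = 512, mult = 4, dropout = 0.1, sublayer_dropout = 0.1)

x = torch.randn(2, 1024, 512)
out = ff(x)   # shape preserved: (2, 1024, 512)

Leaving `sublayer_dropout` at its default of 0. produces a `None` entry that is handled the same way as the optional post-activation LayerNorm in the same container, so existing configurations behave as before.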
@@ -1256,6 +1264,7 @@ class Attention(Module):
        sparse_topk_straight_through = False,
        num_mem_kv = 0,
        dropout = 0.,
+       sublayer_dropout = 0.,
        on_attn = False,
        gate_value_heads = False,
        swiglu_values = False,
@@ -1534,6 +1543,10 @@ class Attention(Module):
        dim_out = default(dim_out, dim)
        self.to_out = nn.Sequential(LinearNoBias(out_dim, dim_out * 2), nn.GLU()) if on_attn else LinearNoBias(out_dim, dim_out)

+       # sublayer dropout
+
+       self.sublayer_dropout = nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
+
        # the number of attention heads to rotate, for decoupled rope in multi-latent attention

        rotate_num_heads = default(rotate_num_heads, heads)
@@ -1871,6 +1884,10 @@ class Attention(Module):

        out = self.to_out(out)

+       # maybe sublayer dropout
+
+       out = maybe(self.sublayer_dropout)(out)
+
        if exists(mask):
            out = einx.where('b n, b n d, -> b n d', mask, out, 0.)

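The same knob is wired through `Attention`: the module builds `self.sublayer_dropout` only when the rate is positive, and the forward pass applies it via `maybe(...)`, so a `None` module is a no-op. Below is a sketch of how the option could be reached from the high-level wrappers; the `attn_`/`ff_`-prefixed kwarg routing of `Decoder` is assumed from the library's usual convention (as with `attn_dropout` / `ff_dropout`) and is not itself part of this diff.

import torch
from x_transformers import TransformerWrapper, Decoder

# assumption: attn_* / ff_* prefixes route to Attention / FeedForward kwargs,
# so the new sublayer_dropout would be reachable as below
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_sublayer_dropout = 0.1,   # dropout on each attention block's output
        ff_sublayer_dropout = 0.1      # dropout on each feedforward block's output
    )
)

tokens = torch.randint(0, 20000, (1, 1024))
logits = model(tokens)   # (1, 1024, 20000)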
All remaining files listed above are unchanged between the two versions.