x-transformers 1.37.1__py3-none-any.whl → 1.37.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- x_transformers/attend.py +14 -5
- x_transformers/x_transformers.py +13 -1
- {x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/METADATA +1 -1
- {x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/RECORD +7 -7
- {x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/LICENSE +0 -0
- {x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/WHEEL +0 -0
- {x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/top_level.txt +0 -0
x_transformers/attend.py
CHANGED
@@ -84,6 +84,7 @@ class Attend(Module):
         softclamp_logits = False,
         logit_softclamp_value = 50.,
         add_zero_kv = False,
+        sigsoftmax = False,
         cope = None,
         onnxable = False,
         sdp_kwargs: dict = dict(
@@ -117,6 +118,11 @@ class Attend(Module):
         assert not (flash and sparse_topk), 'sparse topk not compatible with flash attention'
         self.sparse_topk = sparse_topk
 
+        # sig softmax
+
+        assert not (flash and sigsoftmax), 'sigsoftmax not available for flash attention'
+        self.sigsoftmax = sigsoftmax
+
         # add a key / value token composed of zeros
         # in case this helps controlling outliers, proposed by https://www.evanmiller.org/attention-is-off-by-one.html
 
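The new assert reflects that sigsoftmax is only usable on the non-flash path: the flash branch hands q/k/v to PyTorch's fused kernel, which computes the softmax internally, so the attention logits are never exposed for modification. A rough sketch of the two paths (not the library's exact code; shapes and scaling are illustrative assumptions):

    import torch
    import torch.nn.functional as F

    q = torch.randn(1, 8, 16, 64)   # (batch, heads, seq, dim_head)
    k = torch.randn(1, 8, 16, 64)
    v = torch.randn(1, 8, 16, 64)

    # fused / flash path: the softmax happens inside the kernel, so there is
    # no hook for adjusting the similarity matrix beforehand
    out_flash = F.scaled_dot_product_attention(q, k, v)

    # explicit path: the similarity matrix is a real tensor, so the
    # sigsoftmax term can be added before the softmax
    scale = q.shape[-1] ** -0.5
    sim = torch.einsum('bhid,bhjd->bhij', q, k) * scale
    sim = sim + sim.sigmoid().log()                    # sigsoftmax adjustment
    attn = sim.softmax(dim = -1)
    out_manual = torch.einsum('bhij,bhjd->bhid', attn, v)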
@@ -298,14 +304,14 @@ class Attend(Module):
         # handle grouped multi-query attention
 
         if kv_heads == 1:
-            k, v =
+            k, v = tuple(rearrange(t, 'b 1 n d -> b n d') for t in (k, v))
         elif kv_heads < heads:
-            k, v =
+            k, v = tuple(repeat(t, 'b kvh n d -> b (r kvh) n d', r = heads // kv_heads) for t in (k, v))
 
         # handle zero kv, as means for allowing network to attend to nothing
 
         if self.add_zero_kv:
-            k, v =
+            k, v = tuple(F.pad(t, (0, 0, 1, 0), value = 0.) for t in (k, v))
 
         if exists(mask):
             mask = F.pad(mask, (1, 0), value = True)
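For context, the rewritten lines cover grouped/multi-query key-value expansion and the optional all-zero key/value token. A minimal standalone sketch of the same transformations, with illustrative shapes (the head counts and dimensions are assumptions):

    import torch
    import torch.nn.functional as F
    from einops import repeat

    heads, kv_heads = 8, 2                    # grouped-query attention: 8 query heads share 2 kv heads
    k = torch.randn(1, kv_heads, 16, 64)      # (batch, kv_heads, seq, dim_head)
    v = torch.randn(1, kv_heads, 16, 64)

    # repeat the kv heads so every query head has a matching key/value head
    k, v = tuple(repeat(t, 'b kvh n d -> b (r kvh) n d', r = heads // kv_heads) for t in (k, v))

    # prepend an all-zero key/value token, letting the network attend to "nothing"
    k, v = tuple(F.pad(t, (0, 0, 1, 0), value = 0.) for t in (k, v))

    print(k.shape)   # torch.Size([1, 8, 17, 64])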
@@ -359,12 +365,15 @@ class Attend(Module):
         if exists(self.cope):
             sim = sim + self.cope(q, sim)
 
-        pre_softmax_attn = sim
+        pre_softmax_attn = sim
+
+        if self.sigsoftmax:
+            sim = sim + sim.sigmoid().log()
 
         attn = self.attn_fn(sim, dim = -1)
         attn = attn.type(dtype)
 
-        post_softmax_attn = attn
+        post_softmax_attn = attn
 
         attn = self.attn_dropout(attn)
 
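The added `sim = sim + sim.sigmoid().log()` is the sigsoftmax adjustment: since exp(x + log σ(x)) = exp(x) · σ(x), softmaxing the shifted scores is equivalent to a softmax whose numerators are additionally weighted by the sigmoid of each score. A small numerical check of that equivalence (shapes are illustrative):

    import torch

    sim = torch.randn(2, 4, 8, 8)   # (batch, heads, q_len, k_len)

    # sigsoftmax as implemented: shift the logits, then softmax
    out_a = (sim + sim.sigmoid().log()).softmax(dim = -1)

    # equivalent form: sigmoid-weighted softmax
    weighted = sim.exp() * sim.sigmoid()
    out_b = weighted / weighted.sum(dim = -1, keepdim = True)

    assert torch.allclose(out_a, out_b, atol = 1e-5)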
x_transformers/x_transformers.py
CHANGED
@@ -917,6 +917,7 @@ class Attention(Module):
         swiglu_values = False,
         gate_values = False,
         zero_init_output = False,
+        sigsoftmax = False,
         max_attend_past = None,
         qk_norm = False,
         qk_norm_groups = 1,
@@ -1039,6 +1040,7 @@ class Attention(Module):
             add_zero_kv = add_zero_kv,
             flash = flash,
             softclamp_logits = softclamp_logits,
+            sigsoftmax = sigsoftmax,
             logit_softclamp_value = logit_softclamp_value,
             cope = cope,
             onnxable = onnxable
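With the flag threaded through Attention into Attend, it should be reachable through the usual keyword plumbing. A hypothetical usage sketch, assuming the library's convention of routing attn_-prefixed Decoder kwargs into Attention (model sizes are arbitrary):

    import torch
    from x_transformers import TransformerWrapper, Decoder

    model = TransformerWrapper(
        num_tokens = 256,
        max_seq_len = 1024,
        attn_layers = Decoder(
            dim = 512,
            depth = 6,
            heads = 8,
            attn_sigsoftmax = True   # assumed to route to Attention(sigsoftmax = True), then Attend
        )
    )

    logits = model(torch.randint(0, 256, (1, 1024)))   # (1, 1024, 256)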
@@ -2003,6 +2005,7 @@ class TransformerWrapper(Module):
         token_emb: TokenEmbedding | None = None,
         mixture_of_softmax = False,
         mixture_of_softmax_k = 4,
+        sigsoftmax_logits = False
     ):
         super().__init__()
 
@@ -2090,6 +2093,10 @@ class TransformerWrapper(Module):
 
             self.combine_mixture = LinearNoBias(dim, mixture_of_softmax_k)
 
+        # sig softmax
+
+        self.sigsoftmax_logits = sigsoftmax_logits
+
         # output head, usually to logits of num_tokens
 
         logits_dim = default(logits_dim, num_tokens)
@@ -2258,7 +2265,7 @@ class TransformerWrapper(Module):
         # attention layers
 
         if not self.recycling:
-            assert recycle_steps == 1, 'you did not train with recycling'
+            assert not exists(recycle_steps) or recycle_steps == 1, 'you did not train with recycling'
 
         # regular
 
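The relaxed assertion lets a non-recycling model run when recycle_steps is simply left unset, instead of insisting it literally equal 1. A small sketch of the new predicate, assuming the library's usual exists helper (value is not None):

    def exists(v):
        return v is not None

    def check(recycling, recycle_steps):
        if not recycling:
            # new form: unset (None) or 1 is fine, anything else still trips the assert
            return not exists(recycle_steps) or recycle_steps == 1
        return True

    print(check(False, None))   # True  (previously this case failed the assert)
    print(check(False, 1))      # True
    print(check(False, 4))      # False ('you did not train with recycling')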
@@ -2322,6 +2329,11 @@ class TransformerWrapper(Module):
         else:
             logits = self.to_logits(x)
 
+        # maybe sig softmax
+
+        if self.sigsoftmax_logits:
+            logits = logits + logits.sigmoid().log()
+
         # handle maybe combine mixture
 
         if exists(combine_mixture):
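At the output head, the same sigsoftmax adjustment is optionally applied to the vocabulary logits before any downstream softmax. A hypothetical usage sketch (all constructor arguments other than sigsoftmax_logits are arbitrary):

    import torch
    from x_transformers import TransformerWrapper, Decoder

    model = TransformerWrapper(
        num_tokens = 20000,
        max_seq_len = 512,
        sigsoftmax_logits = True,    # new flag in this release
        attn_layers = Decoder(dim = 512, depth = 4, heads = 8)
    )

    logits = model(torch.randint(0, 20000, (2, 512)))
    probs = logits.softmax(dim = -1)   # behaves like a sigmoid-weighted softmax over the vocabulary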
{x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 x_transformers/__init__.py,sha256=-MkQrSc37cTVDX7AOykxunYnqVtFlQ7lb0Cse5dsGWU,793
-x_transformers/attend.py,sha256=
+x_transformers/attend.py,sha256=mV7duZ7ON2puS3-k4ctBifb2rq-jTJqrMbof7tI5jR4,12326
 x_transformers/autoregressive_wrapper.py,sha256=2FN4ZobFcdDGDGWEnUof_geb16dRGSJycZGwG899Pa4,10493
 x_transformers/continuous.py,sha256=cIVEdhfei258__ziV7kQBrJMxCel54bExBTDrO9rfCI,6450
 x_transformers/dpo.py,sha256=LjvWgCkqTl-UuehrzQ8nkX5guLr4whYwsmm7SKSwdls,3450
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/nonautoregressive_wrapper.py,sha256=ys_p8obc7lTeeodCqvkRKxOXQ1C9T3j5Jwr-JbVgnXk,10432
-x_transformers/x_transformers.py,sha256=
+x_transformers/x_transformers.py,sha256=gOJBZzOJMu5RkIsxw9TZtde4Sx--D18yX8LjrYIsPbE,83677
 x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
 x_transformers/xval.py,sha256=QE1ltYZTR_eGgIHPP2BrMWVWVLqMW-OpDZh87BSmQEg,8563
-x_transformers-1.37.
-x_transformers-1.37.
-x_transformers-1.37.
-x_transformers-1.37.
-x_transformers-1.37.
+x_transformers-1.37.3.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-1.37.3.dist-info/METADATA,sha256=SIGTCQMrLkyq_aksJAst0iXw9VfFT6QWlGvtUElbTMg,661
+x_transformers-1.37.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+x_transformers-1.37.3.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
+x_transformers-1.37.3.dist-info/RECORD,,
{x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/LICENSE
File without changes
{x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/WHEEL
File without changes
{x_transformers-1.37.1.dist-info → x_transformers-1.37.3.dist-info}/top_level.txt
File without changes