x-transformers 1.23.0__py3-none-any.whl → 1.23.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
x_transformers/attend.py CHANGED
@@ -346,79 +346,3 @@ class Attend(nn.Module):
         )
 
         return out, intermediates
-
-# cascading heads logic
-
-def to_single_heads(t, dim = 1):
-    heads = t.unbind(dim = dim)
-    return tuple(head.unsqueeze(dim) for head in heads)
-
-class CascadingHeads(nn.Module):
-    def __init__(self, attend: Attend):
-        super().__init__()
-        self.attend = attend
-
-    def forward(
-        self,
-        q, k, v,
-        mask = None,
-        attn_bias = None,
-        prev_attn = None
-    ):
-        assert q.shape[-1] == v.shape[-1], 'cascading heads can only be done if query / key and value head dimensions are the same'
-
-        # split inputs into per-head inputs
-
-        heads = q.shape[1]
-
-        queries = to_single_heads(q)
-        keys = to_single_heads(k) if k.ndim == 4 else ((k,) * heads)
-        values = to_single_heads(v) if v.ndim == 4 else ((v,) * heads)
-
-        mask = (mask,) * heads
-
-        attn_bias = to_single_heads(attn_bias, dim = 0) if exists(attn_bias) else ((None,) * heads)
-        prev_attn = to_single_heads(prev_attn) if exists(prev_attn) else ((None,) * heads)
-
-        # now loop through each head, without output of previous head summed with the next head
-        # thus cascading
-
-        all_outs = []
-        all_intermediates = []
-
-        prev_head_out = None
-
-        for h_q, h_k, h_v, h_mask, h_attn_bias, h_prev_attn in zip(queries, keys, values, mask, attn_bias, prev_attn):
-
-            if exists(prev_head_out):
-                h_q = h_q + prev_head_out
-
-            out, intermediates = self.attend(
-                h_q, h_k, h_v,
-                mask = h_mask,
-                attn_bias = h_attn_bias,
-                prev_attn = h_prev_attn
-            )
-
-            prev_head_out = out
-
-            all_outs.append(out)
-            all_intermediates.append(intermediates)
-
-        # cat all output heads
-
-        all_outs = torch.cat(all_outs, dim = 1)
-
-        # cat all intermediates, if they exist
-
-        qk_similarities, pre_softmax_attn, post_softmax_attn = zip(*map(lambda i: i.to_tuple(), all_intermediates))
-
-        qk_similarities, pre_softmax_attn, post_softmax_attn = map(compact, (qk_similarities, pre_softmax_attn, post_softmax_attn))
-
-        aggregated_intermediates = Intermediates(
-            qk_similarities = torch.cat(qk_similarities, dim = 1) if len(qk_similarities) > 0 else None,
-            pre_softmax_attn = torch.cat(pre_softmax_attn, dim = 1) if len(pre_softmax_attn) > 0 else None,
-            post_softmax_attn = torch.cat(post_softmax_attn, dim = 1) if len(post_softmax_attn) > 0 else None
-        )
-
-        return all_outs, aggregated_intermediates
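The whole cascading-heads code path (the module-level to_single_heads helper and the CascadingHeads wrapper) is gone in 1.23.2, together with the cascading_heads flag on Attention removed further down. A minimal sketch of what that means for existing configs, assuming nothing upstream swallows unknown keyword arguments before Attention.__init__:

from x_transformers.x_transformers import Attention

# accepted in 1.23.0, raises TypeError ("unexpected keyword argument") in 1.23.2
attn = Attention(dim = 512, heads = 8, cascading_heads = True)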
x_transformers/x_transformers.py CHANGED
@@ -14,7 +14,7 @@ from typing import List, Callable, Optional
 from einops import rearrange, repeat, reduce, pack, unpack
 from einops.layers.torch import Rearrange
 
-from x_transformers.attend import Attend, Intermediates, CascadingHeads
+from x_transformers.attend import Attend, Intermediates
 from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
 
 # constants
@@ -650,6 +650,7 @@ class Attention(nn.Module):
         num_mem_kv = 0,
         dropout = 0.,
         on_attn = False,
+        gate_value_heads = False,
         gate_values = False,
         zero_init_output = False,
         max_attend_past = None,
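The new gate_value_heads flag reaches Attention through the usual attn_-prefixed keyword routing, so it can be switched on from the high-level wrappers. A minimal sketch under that assumption (toy sizes; the only 1.23.2-specific bit is attn_gate_value_heads):

import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_gate_value_heads = True  # new per-head output gating
    )
)

logits = model(torch.randint(0, 256, (1, 128)))  # (1, 128, 256)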
@@ -662,7 +663,6 @@ class Attention(nn.Module):
         shared_kv = False,
         value_dim_head = None,
         tensor_product = False, # https://arxiv.org/abs/2208.06061
-        cascading_heads = False,
         add_zero_kv = False, # same as add_zero_attn in pytorch
         rotary_embed_values = False,
         onnxable = False
@@ -674,7 +674,6 @@ class Attention(nn.Module):
         self.causal = causal
         self.max_attend_past = max_attend_past
 
-
         assert not (exists(kv_heads) and one_kv_head), 'either attn_one_kv_head is set to True (in which case kv_heads is set to 1), or attn_kv_heads is set, but not both'
 
         value_dim_head = default(value_dim_head, dim_head)
@@ -705,7 +704,14 @@ class Attention(nn.Module):
         if gate_values:
             self.to_v_gate = nn.Linear(dim, out_dim)
             nn.init.constant_(self.to_v_gate.weight, 0)
-            nn.init.constant_(self.to_v_gate.bias, 1)
+            nn.init.constant_(self.to_v_gate.bias, 10)
+
+        # add per head gating of the output values, from 'Attend to nothing' paper
+        self.to_v_head_gate = None
+        if gate_value_heads:
+            self.to_v_head_gate = nn.Linear(dim, heads)
+            nn.init.constant_(self.to_v_head_gate.weight, 0)
+            nn.init.constant_(self.to_v_head_gate.bias, 10)
 
         # cosine sim attention
         self.qk_norm = qk_norm
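Note the initialisation on both gates: zero weight and a bias of 10 puts the gate at sigmoid(10) ≈ 0.99995, i.e. effectively wide open, so gating is a near no-op at the start of training and only learns to attenuate values (or whole heads) where useful. The existing gate_values gate is bumped from bias 1 to bias 10, which gives it the same nearly-open start.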
@@ -738,10 +744,6 @@ class Attention(nn.Module):
             onnxable = onnxable
         )
 
-        if cascading_heads:
-            # cascading heads - wrap the Attend logic
-            self.attend = CascadingHeads(self.attend)
-
         # head scaling
         self.head_scale = head_scale
         if head_scale:
@@ -911,6 +913,12 @@ class Attention(nn.Module):
         if head_scale:
             out = out * self.head_scale_params
 
+        # per head gating, from https://arxiv.org/abs/2306.12929
+
+        if exists(self.to_v_head_gate):
+            head_gate = self.to_v_head_gate(x)
+            out = out * rearrange(head_gate, 'b n h -> b h n 1').sigmoid()
+
         # merge heads
 
         out = rearrange(out, 'b h n d -> b n (h d)')
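A minimal standalone sketch of the per-head gating added above, with shapes assumed to match the rest of Attention.forward (x is the layer input of shape (b, n, dim), out the per-head attention output of shape (b, heads, n, dim_head)):

import torch
from torch import nn
from einops import rearrange

b, h, n, d, dim = 2, 8, 16, 64, 512           # toy sizes for illustration
x   = torch.randn(b, n, dim)                  # tokens entering the attention layer
out = torch.randn(b, h, n, d)                 # per-head attention output, pre-merge

to_v_head_gate = nn.Linear(dim, h)
nn.init.constant_(to_v_head_gate.weight, 0)
nn.init.constant_(to_v_head_gate.bias, 10)    # sigmoid(10) ≈ 1: gates start open

head_gate = to_v_head_gate(x)                                    # (b, n, h)
out = out * rearrange(head_gate, 'b n h -> b h n 1').sigmoid()   # scale each head per position

Each head thus receives a data-dependent scalar per position, letting the model damp a head's contribution for tokens where attending would only add noise.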
x_transformers-1.23.0.dist-info/METADATA → x_transformers-1.23.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: x-transformers
-Version: 1.23.0
+Version: 1.23.2
 Summary: X-Transformers - Pytorch
 Home-page: https://github.com/lucidrains/x-transformers
 Author: Phil Wang
x_transformers-1.23.0.dist-info/RECORD → x_transformers-1.23.2.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
 x_transformers/__init__.py,sha256=FDb654rUx8FpXRd76B8q0diH8I7q-ZjTWEtEJ4UM21Y,701
-x_transformers/attend.py,sha256=T2EzF_o0qVxIC0WvWoDDO2sY6J3h-aXAK0vN4McDgbc,13819
+x_transformers/attend.py,sha256=hZcz_iijzbEqbXp2_BPEVL-1LoHXmYaHE6e6Oy-7hFE,11263
 x_transformers/autoregressive_wrapper.py,sha256=f2u0usjUfAlXwgTz87O8J8XjGTbsbrx2XEP6K2beSNI,8944
 x_transformers/continuous_autoregressive_wrapper.py,sha256=pTiDqu6JRUlnQJQp_xHATYHy0lgSd6ERLqyiFO3pC-4,1575
 x_transformers/nonautoregressive_wrapper.py,sha256=AQLE4rA_Kh8VNoe9OzpwyeWson34sRkhks4dn4seNjI,10414
-x_transformers/x_transformers.py,sha256=o8PJ0aZatavxyqx80JLh6Lk-8_C8H-HRwlc1dHsIV6g,60760
+x_transformers/x_transformers.py,sha256=KQ9mU_jE27whl6yQI67grF0S8Xhd3GndnM6Yd0-q-lw,61162
 x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
-x_transformers-1.23.0.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-1.23.0.dist-info/METADATA,sha256=cA6JGJ3U7NSpBXjaNvXRkpLTF4YXqLpTQA-8C-RHWk8,661
-x_transformers-1.23.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-x_transformers-1.23.0.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
-x_transformers-1.23.0.dist-info/RECORD,,
+x_transformers-1.23.2.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-1.23.2.dist-info/METADATA,sha256=8h0sbx8-4yNTOJuAZLbe5HQ16hsmZI1M_mT-rMIIMJc,661
+x_transformers-1.23.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+x_transformers-1.23.2.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
+x_transformers-1.23.2.dist-info/RECORD,,