x-transformers 1.23.0__tar.gz → 1.23.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x-transformers-1.23.0/x_transformers.egg-info → x-transformers-1.23.2}/PKG-INFO +1 -1
- {x-transformers-1.23.0 → x-transformers-1.23.2}/README.md +11 -10
- {x-transformers-1.23.0 → x-transformers-1.23.2}/setup.py +1 -1
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/attend.py +0 -76
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/x_transformers.py +16 -8
- {x-transformers-1.23.0 → x-transformers-1.23.2/x_transformers.egg-info}/PKG-INFO +1 -1
- {x-transformers-1.23.0 → x-transformers-1.23.2}/LICENSE +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/setup.cfg +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/__init__.py +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/autoregressive_wrapper.py +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/continuous_autoregressive_wrapper.py +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers.egg-info/SOURCES.txt +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers.egg-info/dependency_links.txt +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers.egg-info/requires.txt +0 -0
- {x-transformers-1.23.0 → x-transformers-1.23.2}/x_transformers.egg-info/top_level.txt +0 -0
README.md
@@ -1932,16 +1932,6 @@ generated = model.generate(start_emb, 17) # (17, 777)
 }
 ```
 
-```bibtex
-@article{Liu2023EfficientViTME,
-    title   = {EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention},
-    author  = {Xinyu Liu and Houwen Peng and Ningxin Zheng and Yuqing Yang and Han Hu and Yixuan Yuan},
-    journal = {ArXiv},
-    year    = {2023},
-    volume  = {abs/2305.07027}
-}
-```
-
 ```bibtex
 @article{Kazemnejad2023TheIO,
     title   = {The Impact of Positional Encoding on Length Generalization in Transformers},
@@ -2007,4 +1997,15 @@ generated = model.generate(start_emb, 17) # (17, 777)
 }
 ```
 
+```bibtex
+@article{Bondarenko2023QuantizableTR,
+    title   = {Quantizable Transformers: Removing Outliers by Helping Attention Heads Do Nothing},
+    author  = {Yelysei Bondarenko and Markus Nagel and Tijmen Blankevoort},
+    journal = {ArXiv},
+    year    = {2023},
+    volume  = {abs/2306.12929},
+    url     = {https://api.semanticscholar.org/CorpusID:259224568}
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
x_transformers/attend.py
@@ -346,79 +346,3 @@ class Attend(nn.Module):
         )
 
         return out, intermediates
-
-# cascading heads logic
-
-def to_single_heads(t, dim = 1):
-    heads = t.unbind(dim = dim)
-    return tuple(head.unsqueeze(dim) for head in heads)
-
-class CascadingHeads(nn.Module):
-    def __init__(self, attend: Attend):
-        super().__init__()
-        self.attend = attend
-
-    def forward(
-        self,
-        q, k, v,
-        mask = None,
-        attn_bias = None,
-        prev_attn = None
-    ):
-        assert q.shape[-1] == v.shape[-1], 'cascading heads can only be done if query / key and value head dimensions are the same'
-
-        # split inputs into per-head inputs
-
-        heads = q.shape[1]
-
-        queries = to_single_heads(q)
-        keys = to_single_heads(k) if k.ndim == 4 else ((k,) * heads)
-        values = to_single_heads(v) if v.ndim == 4 else ((v,) * heads)
-
-        mask = (mask,) * heads
-
-        attn_bias = to_single_heads(attn_bias, dim = 0) if exists(attn_bias) else ((None,) * heads)
-        prev_attn = to_single_heads(prev_attn) if exists(prev_attn) else ((None,) * heads)
-
-        # now loop through each head, without output of previous head summed with the next head
-        # thus cascading
-
-        all_outs = []
-        all_intermediates = []
-
-        prev_head_out = None
-
-        for h_q, h_k, h_v, h_mask, h_attn_bias, h_prev_attn in zip(queries, keys, values, mask, attn_bias, prev_attn):
-
-            if exists(prev_head_out):
-                h_q = h_q + prev_head_out
-
-            out, intermediates = self.attend(
-                h_q, h_k, h_v,
-                mask = h_mask,
-                attn_bias = h_attn_bias,
-                prev_attn = h_prev_attn
-            )
-
-            prev_head_out = out
-
-            all_outs.append(out)
-            all_intermediates.append(intermediates)
-
-        # cat all output heads
-
-        all_outs = torch.cat(all_outs, dim = 1)
-
-        # cat all intermediates, if they exist
-
-        qk_similarities, pre_softmax_attn, post_softmax_attn = zip(*map(lambda i: i.to_tuple(), all_intermediates))
-
-        qk_similarities, pre_softmax_attn, post_softmax_attn = map(compact, (qk_similarities, pre_softmax_attn, post_softmax_attn))
-
-        aggregated_intermediates = Intermediates(
-            qk_similarities = torch.cat(qk_similarities, dim = 1) if len(qk_similarities) > 0 else None,
-            pre_softmax_attn = torch.cat(pre_softmax_attn, dim = 1) if len(pre_softmax_attn) > 0 else None,
-            post_softmax_attn = torch.cat(post_softmax_attn, dim = 1) if len(post_softmax_attn) > 0 else None
-        )
-
-        return all_outs, aggregated_intermediates
x_transformers/x_transformers.py
@@ -14,7 +14,7 @@ from typing import List, Callable, Optional
 from einops import rearrange, repeat, reduce, pack, unpack
 from einops.layers.torch import Rearrange
 
-from x_transformers.attend import Attend, Intermediates
+from x_transformers.attend import Attend, Intermediates
 from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
 
 # constants
@@ -650,6 +650,7 @@ class Attention(nn.Module):
         num_mem_kv = 0,
         dropout = 0.,
         on_attn = False,
+        gate_value_heads = False,
         gate_values = False,
         zero_init_output = False,
         max_attend_past = None,
@@ -662,7 +663,6 @@ class Attention(nn.Module):
         shared_kv = False,
         value_dim_head = None,
         tensor_product = False, # https://arxiv.org/abs/2208.06061
-        cascading_heads = False,
         add_zero_kv = False, # same as add_zero_attn in pytorch
         rotary_embed_values = False,
         onnxable = False
@@ -674,7 +674,6 @@ class Attention(nn.Module):
         self.causal = causal
         self.max_attend_past = max_attend_past
 
-
         assert not (exists(kv_heads) and one_kv_head), 'either attn_one_kv_head is set to True (in which case kv_heads is set to 1), or attn_kv_heads is set, but not both'
 
         value_dim_head = default(value_dim_head, dim_head)
@@ -705,7 +704,14 @@ class Attention(nn.Module):
         if gate_values:
             self.to_v_gate = nn.Linear(dim, out_dim)
             nn.init.constant_(self.to_v_gate.weight, 0)
-            nn.init.constant_(self.to_v_gate.bias,
+            nn.init.constant_(self.to_v_gate.bias, 10)
+
+        # add per head gating of the output values, from 'Attend to nothing' paper
+        self.to_v_head_gate = None
+        if gate_value_heads:
+            self.to_v_head_gate = nn.Linear(dim, heads)
+            nn.init.constant_(self.to_v_head_gate.weight, 0)
+            nn.init.constant_(self.to_v_head_gate.bias, 10)
 
         # cosine sim attention
         self.qk_norm = qk_norm
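A minimal usage sketch for the new `gate_value_heads` flag added above. It assumes the flag reaches `Attention` through the library's usual `attn_`-prefixed keyword routing on the attention layers; that routing is an assumption here, not something shown in this diff.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_gate_value_heads = True   # assumed kwarg routing: enables per-head gating of attention outputs
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)   # (1, 1024, 20000)
```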
@@ -738,10 +744,6 @@ class Attention(nn.Module):
             onnxable = onnxable
         )
 
-        if cascading_heads:
-            # cascading heads - wrap the Attend logic
-            self.attend = CascadingHeads(self.attend)
-
         # head scaling
         self.head_scale = head_scale
         if head_scale:
@@ -911,6 +913,12 @@ class Attention(nn.Module):
         if head_scale:
             out = out * self.head_scale_params
 
+        # per head gating, from https://arxiv.org/abs/2306.12929
+
+        if exists(self.to_v_head_gate):
+            head_gate = self.to_v_head_gate(x)
+            out = out * rearrange(head_gate, 'b n h -> b h n 1').sigmoid()
+
         # merge heads
 
         out = rearrange(out, 'b h n d -> b n (h d)')
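To make the shapes in the added `rearrange` concrete, here is a standalone sketch of the per-head gate with made-up dimensions; all names and sizes below are illustrative and not part of the package.

```python
import torch
from torch import nn
from einops import rearrange

b, h, n, d, dim = 2, 8, 16, 64, 512       # batch, heads, sequence length, head dim, model dim

x   = torch.randn(b, n, dim)              # token representations the gate is computed from
out = torch.randn(b, h, n, d)             # per-head attention outputs before merging

to_v_head_gate = nn.Linear(dim, h)
nn.init.constant_(to_v_head_gate.weight, 0)
nn.init.constant_(to_v_head_gate.bias, 10)    # sigmoid(10) ≈ 1, so every head starts fully open

head_gate = to_v_head_gate(x)                                         # (b, n, h)
gated = out * rearrange(head_gate, 'b n h -> b h n 1').sigmoid()      # one scalar gate per head per position
```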