x-transformers 2.6.2__tar.gz → 2.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.6.2 → x_transformers-2.6.4}/PKG-INFO +11 -1
- {x_transformers-2.6.2 → x_transformers-2.6.4}/README.md +10 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/pyproject.toml +1 -1
- {x_transformers-2.6.2 → x_transformers-2.6.4}/tests/test_x_transformers.py +19 -2
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/attend.py +21 -4
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/x_transformers.py +9 -1
- {x_transformers-2.6.2 → x_transformers-2.6.4}/.github/FUNDING.yml +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/.gitignore +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/LICENSE +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/data/README.md +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/data/enwik8.gz +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/all-attention.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/attention-on-attention.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/deepnorm.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/fcm.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/ffglu.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/flash-attention.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/gate_values.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/gating.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/macaron-1.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/macaron-2.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/memory-transformer.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/normformer.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/pia.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/resi_dual.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/residual_attn.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/rezero.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/rotary.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/sandwich-2.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/sandwich.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/sandwich_norm.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/scalenorm.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/talking-heads.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/topk-attention.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/images/xval.png +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_belief_state.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_copy.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_enwik8.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_length_extrapolate.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/train_parity.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/__init__.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/continuous.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/dpo.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.2 → x_transformers-2.6.4}/x_transformers/xval.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.6.2
+Version: 2.6.4
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2507,4 +2507,14 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```

+```bibtex
+@misc{openai_gpt_oss,
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
README.md

@@ -2459,4 +2459,14 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```

+```bibtex
+@misc{openai_gpt_oss,
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
tests/test_x_transformers.py

@@ -1228,10 +1228,27 @@ def test_external_key_values():
     seq = torch.randint(0, 20000, (3, 1024))

     key_values = [
-        (torch.randn(3,
-        (torch.randn(3,
+        (torch.randn(3, 2, 32, 16), torch.randn(3, 2, 32, 16)),
+        (torch.randn(3, 2, 32, 16), torch.randn(3, 2, 32, 16)),
     ]

     additional_kv_mask = torch.randint(0, 2, (3, 32)).bool()

     logits = model(seq, self_attn_additional_kv = key_values, additional_kv_mask = additional_kv_mask)
+
+def test_learned_head_attn_sink():
+
+    model = TransformerWrapper(
+        num_tokens = 20000,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 12,
+            heads = 8,
+            attn_head_learned_sink = True
+        )
+    )
+
+    seq = torch.randint(0, 20000, (3, 1024))
+
+    logits = model(seq)
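The new test above doubles as the usage pattern for the feature this release adds: a per-head learned attention sink, switched on through the `attn_` prefixed kwarg on the attention layers. A minimal sketch (model sizes here are illustrative, not taken from the package's examples):

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# `attn_` prefixed kwargs on Decoder are routed to each Attention layer,
# so `attn_head_learned_sink = True` enables the learned sink added in this release
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_head_learned_sink = True
    )
)

seq = torch.randint(0, 20000, (2, 256))
logits = model(seq)   # (2, 256, 20000)
```

Per the assert added in attend.py below, the sink is not yet supported together with flash attention, so it cannot be combined with `attn_flash = True` in this release.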
x_transformers/attend.py

@@ -4,8 +4,8 @@ from functools import partial
 from typing import Tuple, Callable

 import torch
-from torch.nn import Module
-from torch import nn, einsum, Tensor
+from torch.nn import Module, Parameter
+from torch import cat, nn, einsum, Tensor
 import torch.nn.functional as F

 from collections import namedtuple
@@ -176,6 +176,7 @@ class Attend(Module):
         softclamp_logits = False,
         logit_softclamp_value = 50.,
         add_zero_kv = False,
+        head_learned_sink = False,
         selective = False,
         hard = False,
         cope = None,
@@ -254,6 +255,13 @@ class Attend(Module):

         self.add_zero_kv = add_zero_kv

+        # learned sink concatted pre-softmax, working solution from gpt-oss
+
+        assert not (head_learned_sink and flash), f'not supported for flash attention yet'
+
+        self.head_learned_sink = head_learned_sink
+        self.head_attn_sink = Parameter(torch.zeros(heads)) if head_learned_sink else None
+
         # soft clamp attention logit value

         if softclamp_logits:
@@ -315,10 +323,10 @@ class Attend(Module):
         if self.l2_distance:
             k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
             k = F.pad(k, (0, 1), value = -1.)
-            k = torch.cat((k, k_norm_sq), dim = -1)
+            k = cat((k, k_norm_sq), dim = -1)

             q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
-            q = torch.cat((2 * q, q_norm_sq), dim = -1)
+            q = cat((2 * q, q_norm_sq), dim = -1)
             q = F.pad(q, (0, 1), value = -1.)

         # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention
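Aside from the `torch.cat` → `cat` cleanup, the lines above are the standard trick of expressing squared L2 distance as a dot product of augmented vectors: with k' = [k, -1, ‖k‖²] and q' = [2q, ‖q‖², -1], the product q'·k' = 2q·k - ‖q‖² - ‖k‖² = -‖q - k‖². A quick standalone check of that identity (not package code):

```python
import torch
import torch.nn.functional as F

q = torch.randn(4, 8)
k = torch.randn(4, 8)

# augment k -> [k, -1, |k|^2], mirroring the lines in attend.py above
k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
k_aug = F.pad(k, (0, 1), value = -1.)
k_aug = torch.cat((k_aug, k_norm_sq), dim = -1)

# augment q -> [2q, |q|^2, -1]
q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
q_aug = torch.cat((2 * q, q_norm_sq), dim = -1)
q_aug = F.pad(q_aug, (0, 1), value = -1.)

sim = q_aug @ k_aug.t()            # 2 q.k - |q|^2 - |k|^2
target = -torch.cdist(q, k) ** 2   # -|q - k|^2

assert torch.allclose(sim, target, atol = 1e-4)
```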
@@ -509,6 +517,11 @@ class Attend(Module):
         if self.selective:
             sim = selective_attn(sim)

+        if self.head_learned_sink:
+            # add learned attention sink
+            attn_sink = repeat(self.head_attn_sink, 'h -> b h i 1', b = sim.shape[0], i = sim.shape[2])
+            sim = cat((attn_sink, sim), dim = -1)
+
         pre_softmax_attn = sim

         attn = self.attn_fn(sim)
@@ -517,6 +530,10 @@ class Attend(Module):

         post_softmax_attn = attn

+        if self.head_learned_sink:
+            # remove attention sink
+            attn = attn[..., 1:]
+
         attn = self.attn_dropout(attn)

         if exists(self.post_softmax_talking_heads):
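Taken together, the three attend.py hunks implement the learned attention sink: each head owns one learned logit that is prepended as an extra "key" column before the softmax and dropped afterwards, so a head can park probability mass on the sink instead of being forced to spread all of it over real positions (the gpt-oss approach referenced in the code comment and the new README citation). A self-contained sketch of the mechanic outside the Attend class:

```python
import torch
from torch import cat
from torch.nn import Parameter
from einops import repeat

heads = 8
head_attn_sink = Parameter(torch.zeros(heads))    # one learned sink logit per head

# raw attention logits: (batch, heads, query length, key length)
sim = torch.randn(2, heads, 16, 16)

# prepend the per-head sink logit as an extra key column before the softmax
attn_sink = repeat(head_attn_sink, 'h -> b h i 1', b = sim.shape[0], i = sim.shape[2])
sim = cat((attn_sink, sim), dim = -1)

attn = sim.softmax(dim = -1)

# drop the sink column after the softmax; whatever mass it absorbed is simply
# discarded, so each row now sums to at most 1 rather than exactly 1
attn = attn[..., 1:]
```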
x_transformers/x_transformers.py

@@ -1319,6 +1319,7 @@ class Attention(Module):
         value_dim_head = None,
         dim_out = None,
         add_zero_kv = False, # same as add_zero_attn in pytorch
+        head_learned_sink = False,
         rotate_num_heads = None,
         data_dependent_alibi = False,
         data_dependent_alibi_per_row = False,
@@ -1515,6 +1516,7 @@ class Attention(Module):
             selective = selective,
             custom_attn_fn = custom_attn_fn,
             add_zero_kv = add_zero_kv,
+            head_learned_sink = head_learned_sink,
             flash = flash,
             softclamp_logits = softclamp_logits,
             logit_softclamp_value = logit_softclamp_value,
@@ -1795,6 +1797,13 @@ class Attention(Module):
             seq_len = k.shape[-2]

             added_k, added_v = additional_key_values
+            added_kv_heads, added_kv_len = added_k.shape[1], added_k.shape[-2]
+
+            # take care of expanding to query heads if mismatch between key / value heads with the ones coming from vlm
+
+            if added_kv_heads != kv_h:
+                assert divisible_by(h, added_kv_heads)
+                k, v, added_k, added_v = tuple(repeat(t, 'b h ... -> b (r h) ...', r = h // t.shape[1]) for t in (k, v, added_k, added_v))

             k = cat((added_k, k), dim = -2)
             v = cat((added_v, v), dim = -2)
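The added lines cover externally supplied key / values (for example from a VLM) whose head count does not match the layer's own key / value heads: as long as the query head count is divisible by the incoming head count, everything is repeated up to the query heads before being concatenated along the sequence dimension. A shape-only sketch of that expansion (dimensions are illustrative):

```python
import torch
from einops import repeat

h = 8                                   # query heads in the layer

k       = torch.randn(3, 2, 1024, 64)   # layer keys / values with 2 kv heads (GQA style)
v       = torch.randn(3, 2, 1024, 64)
added_k = torch.randn(3, 4, 32, 64)     # external keys / values with 4 heads
added_v = torch.randn(3, 4, 32, 64)

# repeat every tensor along the head dim up to the query head count,
# using the same `h // t.shape[1]` factor as the diff above
k, v, added_k, added_v = tuple(
    repeat(t, 'b h ... -> b (r h) ...', r = h // t.shape[1])
    for t in (k, v, added_k, added_v)
)

k = torch.cat((added_k, k), dim = -2)   # external keys are prepended along the sequence
v = torch.cat((added_v, v), dim = -2)

print(k.shape)                          # torch.Size([3, 8, 1056, 64])
```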
@@ -1802,7 +1811,6 @@ class Attention(Module):
            if (exists(input_mask) or exists(additional_key_value_mask)):

                if not exists(additional_key_value_mask):
-                    added_kv_len = added_k.shape[-2]
                    input_mask = pad_at_dim(input_mask, (added_kv_len, 0), dim = -1, value = True)
                elif not exists(input_mask):
                    input_mask = pad_at_dim(additional_key_value_mask, (0, seq_len), dim = -1, value = True)