x-transformers 2.9.1.tar.gz → 2.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.9.1 → x_transformers-2.10.0}/PKG-INFO +12 -1
- {x_transformers-2.9.1 → x_transformers-2.10.0}/README.md +11 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/pyproject.toml +1 -1
- {x_transformers-2.9.1 → x_transformers-2.10.0}/tests/test_x_transformers.py +23 -1
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_with_muon.py +2 -4
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/attend.py +31 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/x_transformers.py +2 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/.github/FUNDING.yml +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/.gitignore +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/LICENSE +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/data/README.md +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/data/enwik8.gz +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/all-attention.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/attention-on-attention.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/deepnorm.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/fcm.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/ffglu.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/flash-attention.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/gate_values.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/gating.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/macaron-1.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/macaron-2.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/memory-transformer.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/normformer.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/pia.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/resi_dual.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/residual_attn.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/rezero.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/rotary.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/sandwich-2.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/sandwich.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/sandwich_norm.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/scalenorm.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/talking-heads.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/topk-attention.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/images/xval.png +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_belief_state.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_copy.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_enwik8.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_gpt_vae.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_length_extrapolate.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/train_parity.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/__init__.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/continuous.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/dpo.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/gpt_vae.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/xval.py +0 -0
{x_transformers-2.9.1 → x_transformers-2.10.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.9.1
+Version: 2.10.0
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2586,4 +2586,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```

+```bibtex
+@inproceedings{anonymous2025more,
+    title = {More Expressive Attention with Negative Weights},
+    author = {Anonymous},
+    booktitle = {Submitted to The Fourteenth International Conference on Learning Representations},
+    year = {2025},
+    url = {https://openreview.net/forum?id=ezRrwwbxd0},
+    note = {under review}
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.9.1 → x_transformers-2.10.0}/README.md

@@ -2537,4 +2537,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```

+```bibtex
+@inproceedings{anonymous2025more,
+    title = {More Expressive Attention with Negative Weights},
+    author = {Anonymous},
+    booktitle = {Submitted to The Fourteenth International Conference on Learning Representations},
+    year = {2025},
+    url = {https://openreview.net/forum?id=ezRrwwbxd0},
+    note = {under review}
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.9.1 → x_transformers-2.10.0}/tests/test_x_transformers.py

@@ -1378,6 +1378,28 @@ def test_stochastic_attn():
     from x_transformers import Attention

     attn = Attention(dim = 512, gumbel_softmax = True)
-    out = attn(torch.randn(1, 1024, 512))
+    out, intermediate = attn(torch.randn(1, 1024, 512), return_intermediates = True)

     assert out.shape == (1, 1024, 512)
+
+    from x_transformers.attend import log_prob_from_hard_attend
+    log_probs = log_prob_from_hard_attend(intermediate)
+    assert log_probs.shape == (1, 8, 1024)
+
+def test_attn_negative_weights():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 12,
+            heads = 8,
+            attn_cog_signed = True
+        ),
+    )
+
+    x = torch.randint(0, 256, (1, 10))
+
+    logits = model(x)
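
Not part of the diff: a minimal sketch of how the hard-attention log probabilities exercised by this test might be used downstream, for example as a REINFORCE-style term. The `Attention` call and `log_prob_from_hard_attend` mirror the test above; the reward here is a made-up stand-in scalar, not anything from the library.

```python
# hypothetical usage sketch, not library code
import torch
from x_transformers import Attention
from x_transformers.attend import log_prob_from_hard_attend

attn = Attention(dim = 512, gumbel_softmax = True)

x = torch.randn(1, 1024, 512)
out, intermediates = attn(x, return_intermediates = True)

# log probability of the key each query selected under the hard (argmax) attention
log_probs = log_prob_from_hard_attend(intermediates)   # (batch, heads, query_len)

reward = out.detach().norm(dim = -1).mean()             # stand-in scalar reward
reinforce_term = -(log_probs.mean() * reward)            # would be added to a training loss
```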
{x_transformers-2.9.1 → x_transformers-2.10.0}/train_with_muon.py

@@ -1,7 +1,7 @@
 # /// script
 # dependencies = [
 # "x-transformers",
-# "adam-atan2-pytorch>=0.2.
+# "adam-atan2-pytorch>=0.2.4",
 # ]
 # ///

@@ -25,7 +25,6 @@ NUM_BATCHES = int(1e5)
 BATCH_SIZE = 4
 GRADIENT_ACCUMULATE_EVERY = 4
 LEARNING_RATE = 1e-4
-MUON_LEARNING_RATE = 1e-3
 VALIDATE_EVERY = 100
 GENERATE_EVERY = 500
 GENERATE_LENGTH = 1024
@@ -92,8 +91,7 @@ optim = MuonAdamAtan2(
     muon_params = model.muon_parameters(),
     params = model.parameters(),
     remove_muon_params_from_params = True,
-    lr = LEARNING_RATE
-    muon_lr = MUON_LEARNING_RATE,
+    lr = LEARNING_RATE
 )

 # training
{x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/attend.py

@@ -67,6 +67,15 @@ def once(fn):

 print_once = once(print)

+# gumbel softmax attention related
+
+def log_prob_from_hard_attend(intermeds: Intermediates):
+    log_probs = intermeds.pre_softmax_attn.log_softmax(dim = -1)
+
+    one_hot = intermeds.post_softmax_attn.argmax(dim = -1, keepdim = True)
+    log_prob = log_probs.gather(-1, one_hot)
+    return rearrange(log_prob, 'b h i 1 -> b h i')
+
 # selective attention
 # https://arxiv.org/abs/2410.02703 - section 3.3
 # it is a technique to allow each token to prevent itself from being attended to by future tokens
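
Not part of the diff: a standalone plain-torch illustration of what the new helper computes, assuming only the tensor shapes implied above — for each query, the log-probability (under a softmax of the pre-softmax logits) of the key that the hard attention selected.

```python
# standalone illustration, not library code
import torch

b, h, i, j = 1, 2, 4, 6                      # batch, heads, queries, keys
pre_softmax_attn = torch.randn(b, h, i, j)   # raw attention logits
hard_attn = torch.nn.functional.one_hot(pre_softmax_attn.argmax(dim = -1), j).float()

log_probs = pre_softmax_attn.log_softmax(dim = -1)
selected = hard_attn.argmax(dim = -1, keepdim = True)   # index of the chosen key per query
log_prob = log_probs.gather(-1, selected).squeeze(-1)   # same as rearrange 'b h i 1 -> b h i'

assert log_prob.shape == (b, h, i)
```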
@@ -174,6 +183,7 @@ class Attend(Module):
         gumbel_softmax = False,
         gumbel_softmax_temp = 1.,
         gumbel_softmax_hard = True,
+        cog_signed = False,
         custom_attn_fn: Callable | None = None,
         flash = False,
         softclamp_logits = False,
@@ -251,6 +261,12 @@
         assert not (selective and not causal), 'selective attention is designed for autoregressive'
         self.selective = selective

+        # cog attention - negative weights for expressiveness
+        # https://openreview.net/forum?id=ezRrwwbxd0
+
+        assert not (flash and cog_signed), 'cog attention not available for flash'
+        self.cog_signed = cog_signed
+
         # l2 distance attention

         self.l2_distance = l2_distance
@@ -500,6 +516,13 @@
         if self.softclamp_logits:
             sim = softclamp(sim, self.logit_softclamp_value)

+        # pre-masking - handle cog by storing sign
+
+        if self.cog_signed:
+            sim_sign = sim.sign()
+
+        # masking
+
         i, j, dtype = *sim.shape[-2:], sim.dtype

         mask_value = -torch.finfo(sim.dtype).max
@@ -529,10 +552,18 @@

         pre_softmax_attn = sim

+        if self.cog_signed:
+            sim = sim.abs()
+
         attn = self.attn_fn(sim)

         attn = attn.type(dtype)

+        # add back the sign
+
+        if self.cog_signed:
+            attn = attn * sim_sign
+
         post_softmax_attn = attn

         if self.head_learned_sink:
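
Not part of the diff: a standalone plain-torch sketch of the signed-softmax step the two hunks above implement. The softmax is taken over the magnitudes of the logits and the original signs are restored afterwards, so attention weights can be negative while their absolute values still sum to 1 per query (masking is omitted in this sketch).

```python
# standalone sketch of signed ("cog") attention weights, not library code
import torch

sim = torch.randn(1, 8, 4, 4)                   # (batch, heads, queries, keys) logits

sim_sign = sim.sign()                           # remember the sign of every logit
attn = sim.abs().softmax(dim = -1) * sim_sign   # signed attention weights

# magnitudes still normalize per query, but individual weights may be negative
assert torch.allclose(attn.abs().sum(dim = -1), torch.ones(1, 8, 4))
```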
{x_transformers-2.9.1 → x_transformers-2.10.0}/x_transformers/x_transformers.py

@@ -1340,6 +1340,7 @@ class Attention(Module):
         gumbel_softmax_temp = 1.,
         gumbel_softmax_hard = True,
         selective = False,
+        cog_signed = False,
         custom_attn_fn: Callable | None = None,
         hybrid_module: Module | None = None,
         hybrid_mask_kwarg: str | None = None,
@@ -1548,6 +1549,7 @@
             gumbel_softmax_temp = gumbel_softmax_temp,
             gumbel_softmax_hard = gumbel_softmax_hard,
             selective = selective,
+            cog_signed = cog_signed,
             custom_attn_fn = custom_attn_fn,
             add_zero_kv = add_zero_kv,
             head_learned_sink = head_learned_sink,
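
Not part of the diff: a minimal usage sketch of the new flag, assuming the public API shown in this release — `cog_signed` on `Attention` directly, or `attn_cog_signed` when configured through `Decoder` (as in the new test). Flash attention must remain off for cog attention.

```python
# hypothetical usage sketch, not library code
import torch
from x_transformers import Attention, TransformerWrapper, Decoder

attn = Attention(dim = 512, cog_signed = True)   # flash stays off for cog attention
out = attn(torch.randn(1, 256, 512))

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 2,
        heads = 8,
        attn_cog_signed = True
    )
)

logits = model(torch.randint(0, 256, (1, 10)))
```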