x-transformers 2.11.19.tar.gz → 2.11.22.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of x-transformers might be problematic.

Files changed (68)
  1. {x_transformers-2.11.19 → x_transformers-2.11.22}/PKG-INFO +8 -7
  2. {x_transformers-2.11.19 → x_transformers-2.11.22}/README.md +7 -6
  3. {x_transformers-2.11.19 → x_transformers-2.11.22}/pyproject.toml +1 -1
  4. {x_transformers-2.11.19 → x_transformers-2.11.22}/tests/test_x_transformers.py +22 -8
  5. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_enwik8.py +22 -1
  6. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/x_transformers.py +45 -1
  7. {x_transformers-2.11.19 → x_transformers-2.11.22}/.github/FUNDING.yml +0 -0
  8. {x_transformers-2.11.19 → x_transformers-2.11.22}/.github/workflows/python-publish.yml +0 -0
  9. {x_transformers-2.11.19 → x_transformers-2.11.22}/.github/workflows/python-test.yaml +0 -0
  10. {x_transformers-2.11.19 → x_transformers-2.11.22}/.gitignore +0 -0
  11. {x_transformers-2.11.19 → x_transformers-2.11.22}/LICENSE +0 -0
  12. {x_transformers-2.11.19 → x_transformers-2.11.22}/data/README.md +0 -0
  13. {x_transformers-2.11.19 → x_transformers-2.11.22}/data/enwik8.gz +0 -0
  14. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/all-attention.png +0 -0
  15. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/attention-on-attention.png +0 -0
  16. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/cosine-sim-attention.png +0 -0
  17. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/deepnorm.png +0 -0
  18. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/dynamic-pos-bias-linear.png +0 -0
  19. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/dynamic-pos-bias-log.png +0 -0
  20. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  21. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/dynamic-pos-bias.png +0 -0
  22. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/enhanced-recurrence.png +0 -0
  23. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/fcm.png +0 -0
  24. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/ffglu.png +0 -0
  25. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/flash-attention.png +0 -0
  26. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/gate_values.png +0 -0
  27. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/gating.png +0 -0
  28. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/length-extrapolation-scale.png +0 -0
  29. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/macaron-1.png +0 -0
  30. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/macaron-2.png +0 -0
  31. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/memory-transformer.png +0 -0
  32. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/normformer.png +0 -0
  33. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/pia.png +0 -0
  34. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/qknorm-analysis.png +0 -0
  35. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/resi_dual.png +0 -0
  36. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/residual_attn.png +0 -0
  37. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/rezero.png +0 -0
  38. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/rotary.png +0 -0
  39. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/sandwich-2.png +0 -0
  40. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/sandwich.png +0 -0
  41. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/sandwich_norm.png +0 -0
  42. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/scalenorm.png +0 -0
  43. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/talking-heads.png +0 -0
  44. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/topk-attention.png +0 -0
  45. {x_transformers-2.11.19 → x_transformers-2.11.22}/images/xval.png +0 -0
  46. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_belief_state.py +0 -0
  47. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_copy.py +0 -0
  48. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_entropy_tokenizer.py +0 -0
  49. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_free.py +0 -0
  50. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_gpt_vae.py +0 -0
  51. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_length_extrapolate.py +0 -0
  52. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_parity.py +0 -0
  53. {x_transformers-2.11.19 → x_transformers-2.11.22}/train_with_muon.py +0 -0
  54. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/__init__.py +0 -0
  55. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/attend.py +0 -0
  56. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/autoregressive_wrapper.py +0 -0
  57. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/belief_state_wrapper.py +0 -0
  58. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/continuous.py +0 -0
  59. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/dpo.py +0 -0
  60. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/entropy_based_tokenizer.py +0 -0
  61. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/free_transformer.py +0 -0
  62. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/gpt_vae.py +0 -0
  63. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/multi_input.py +0 -0
  64. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/neo_mlp.py +0 -0
  65. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/nonautoregressive_wrapper.py +0 -0
  66. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/up_wrapper.py +0 -0
  67. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  68. {x_transformers-2.11.19 → x_transformers-2.11.22}/x_transformers/xval.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: x-transformers
- Version: 2.11.19
+ Version: 2.11.22
  Summary: X-Transformers
  Project-URL: Homepage, https://pypi.org/project/x-transformers/
  Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2608,12 +2608,13 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
  ```

  ```bibtex
- @article{elhage2022solu,
-     title = {Softmax Linear Units},
-     author = {Elhage, Nelson and Hume, Tristan and Olsson, Catherine and Nanda, Neel and Henighan, Tom and Johnston, Scott and ElShowk, Sheer and Joseph, Nicholas and DasSarma, Nova and Mann, Ben and Hernandez, Danny and Askell, Amanda and Ndousse, Kamal and Jones, Andy and Drain, Dawn and Chen, Anna and Bai, Yuntao and Ganguli, Deep and Lovitt, Liane and Hatfield-Dodds, Zac and Kernion, Jackson and Conerly, Tom and Kravec, Shauna and Fort, Stanislav and Kadavath, Saurav and Jacobson, Josh and Tran-Johnson, Eli and Kaplan, Jared and Clark, Jack and Brown, Tom and McCandlish, Sam and Amodei, Dario and Olah, Christopher},
-     year = {2022},
-     journal = {Transformer Circuits Thread},
-     note = {https://transformer-circuits.pub/2022/solu/index.html}
+ @inproceedings{anonymous2025beliefformer,
+     title = {BeliefFormer: Belief Attention in Transformer},
+     author = {Anonymous},
+     booktitle = {Submitted to The Fourteenth International Conference on Learning Representations},
+     year = {2025},
+     url = {https://openreview.net/forum?id=Ard2QzPAUK},
+     note = {under review}
  }
  ```
README.md

@@ -2559,12 +2559,13 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
  ```

  ```bibtex
- @article{elhage2022solu,
-     title = {Softmax Linear Units},
-     author = {Elhage, Nelson and Hume, Tristan and Olsson, Catherine and Nanda, Neel and Henighan, Tom and Johnston, Scott and ElShowk, Sheer and Joseph, Nicholas and DasSarma, Nova and Mann, Ben and Hernandez, Danny and Askell, Amanda and Ndousse, Kamal and Jones, Andy and Drain, Dawn and Chen, Anna and Bai, Yuntao and Ganguli, Deep and Lovitt, Liane and Hatfield-Dodds, Zac and Kernion, Jackson and Conerly, Tom and Kravec, Shauna and Fort, Stanislav and Kadavath, Saurav and Jacobson, Josh and Tran-Johnson, Eli and Kaplan, Jared and Clark, Jack and Brown, Tom and McCandlish, Sam and Amodei, Dario and Olah, Christopher},
-     year = {2022},
-     journal = {Transformer Circuits Thread},
-     note = {https://transformer-circuits.pub/2022/solu/index.html}
+ @inproceedings{anonymous2025beliefformer,
+     title = {BeliefFormer: Belief Attention in Transformer},
+     author = {Anonymous},
+     booktitle = {Submitted to The Fourteenth International Conference on Learning Representations},
+     year = {2025},
+     url = {https://openreview.net/forum?id=Ard2QzPAUK},
+     note = {under review}
  }
  ```
pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "x-transformers"
- version = "2.11.19"
+ version = "2.11.22"
  description = "X-Transformers"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
tests/test_x_transformers.py

@@ -1463,13 +1463,27 @@ def test_kv_input_residual():

      assert tokens.shape == out.shape

- def test_solu():
-     attn = Decoder(
-         dim = 256,
-         depth = 2,
-         heads = 4,
-         ff_solu = True
+ @param('orthog_project', (False, True))
+ @param('orthog_project_per_head', (False, True))
+ def test_belief_attn(
+     orthog_project,
+     orthog_project_per_head
+ ):
+     from x_transformers import TransformerWrapper, Decoder
+
+     model = TransformerWrapper(
+         num_tokens = 256,
+         max_seq_len = 1024,
+         attn_layers = Decoder(
+             dim = 512,
+             depth = 6,
+             heads = 8,
+             rotary_pos_emb = True,
+             attn_orthog_projected_values = orthog_project,
+             attn_orthog_projected_values_per_head = orthog_project_per_head
+         )
      )

-     tokens = torch.randn(3, 32, 256)
-     attn(tokens)
+     x = torch.randint(0, 256, (1, 10))
+
+     logits = model(x)
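For context, the configuration exercised by the new test can also be run end to end. The sketch below mirrors the test with both flags switched on; the shape assertion at the bottom is an added illustration (not part of the diff), relying on `TransformerWrapper` returning per-position logits over the vocabulary by default.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# same configuration as the new test, with both belief-attention flags enabled

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        attn_orthog_projected_values = True,
        attn_orthog_projected_values_per_head = True
    )
)

x = torch.randint(0, 256, (1, 10))
logits = model(x)

# TransformerWrapper returns per-position logits over the vocabulary by default
assert logits.shape == (1, 10, 256)
```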
train_enwik8.py

@@ -1,3 +1,11 @@
+ # /// script
+ # dependencies = [
+ #     "tqdm",
+ #     "x-transformers",
+ #     "wandb"
+ # ]
+ # ///
+
  from x_transformers import TransformerWrapper, Decoder
  from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
@@ -20,6 +28,7 @@ VALIDATE_EVERY = 100
  GENERATE_EVERY = 500
  GENERATE_LENGTH = 1024
  SEQ_LEN = 1024
+ TRACK_EXPERIMENT_ONLINE = False

  # helpers
@@ -43,7 +52,9 @@ model = TransformerWrapper(
          dim = 512,
          depth = 6,
          heads = 8,
-         rotary_pos_emb = True
+         rotary_pos_emb = True,
+         attn_orthog_projected_values = True,
+         attn_orthog_projected_values_per_head = True
      )
  )
@@ -80,6 +91,12 @@ val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE, drop_last

  optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

+ # experiment
+
+ import wandb
+ wandb.init(project = 'enwik8', mode = 'online' if TRACK_EXPERIMENT_ONLINE else 'disabled')
+ wandb.run.name = 'baseline'
+
  # training

  for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
@@ -90,6 +107,8 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
          (loss / GRADIENT_ACCUMULATE_EVERY).backward()

      print(f'training loss: {loss.item()}')
+     wandb.log(dict(loss = loss.item()))
+
      torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
      optim.step()
      optim.zero_grad()
@@ -98,7 +117,9 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
          model.eval()
          with torch.no_grad():
              loss = model(next(val_loader))
+
              print(f'validation loss: {loss.item()}')
+             wandb.log(dict(valid_loss = loss.item()))

      if i % GENERATE_EVERY == 0:
          model.eval()
x_transformers/x_transformers.py

@@ -161,6 +161,21 @@ def or_reduce(masks):
          head = head | rest
      return head

+ def orthog_project(x, y):
+     x, packed_shape = pack([x], 'b *')
+     y, _ = pack([y], 'b *')
+
+     dtype = x.dtype
+     x, y = x.double(), y.double()
+     unit = F.normalize(y, dim = -1)
+
+     parallel = (x * unit).sum(dim = -1, keepdim = True) * unit
+     orthog = x - parallel
+
+     orthog, = unpack(orthog, packed_shape, 'b *')
+
+     return orthog.to(dtype)
+
  # cache helpers

  def get_cached_kvs(
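For intuition, here is a minimal standalone sketch of the math inside the new `orthog_project` helper. The function name `orthog_project_simple` and the 2-D shapes are illustrative only; the version in the diff additionally flattens all non-batch dimensions with einops `pack`/`unpack` before projecting. The idea: subtract from `x` its component along `y`, so what remains is orthogonal to `y`.

```python
import torch
import torch.nn.functional as F

# illustrative helper, 2-D inputs only; the diff's version first flattens
# every non-batch dim with einops pack/unpack, then applies the same math

def orthog_project_simple(x, y):
    x, y = x.double(), y.double()
    unit = F.normalize(y, dim = -1)                              # unit vector in the direction of y
    parallel = (x * unit).sum(dim = -1, keepdim = True) * unit   # component of x along y
    return x - parallel                                          # part of x orthogonal to y

x = torch.randn(2, 8)
y = torch.randn(2, 8)
out = orthog_project_simple(x, y)

# the result is (numerically) orthogonal to y
assert torch.allclose((out * y.double()).sum(dim = -1), torch.zeros(2, dtype = torch.double), atol = 1e-6)
```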
@@ -1381,7 +1396,9 @@ class Attention(Module):
          softclamp_logits = False,
          logit_softclamp_value = 50.,
          learned_value_residual_mix = False,
-         laser = False, # https://arxiv.org/abs/2411.03493v1
+         orthog_projected_values = False, # https://openreview.net/forum?id=Ard2QzPAUK
+         orthog_projected_values_per_head = False,
+         laser = False, # https://arxiv.org/abs/2411.03493v1
          laser_softclamp_value = 15.,
          qkv_receive_diff_residuals = False,
          use_latent_q = False,
@@ -1607,6 +1624,14 @@ class Attention(Module):

          self.attn_on_attn = on_attn

+         # return orthogonal projected weighted values on original values
+         # "belief attention" - iclr 2026
+
+         self.orthog_projected_values = orthog_projected_values
+         self.orthog_projected_values_per_head = orthog_projected_values_per_head
+
+         out_dim *= max(1, int(orthog_projected_values) + int(orthog_projected_values_per_head))
+
          # hybrid module, in same vein as hymba https://www.arxiv.org/abs/2411.13676

          hybrid_mix = None
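A small sketch of the `out_dim` arithmetic above, assuming `out_dim` is the usual merged value width `dim_head * heads` (the helper name and sizes are illustrative): enabling either projection alone leaves the width fed into `to_out` unchanged, while enabling both doubles it, because the forward pass concatenates the two projections.

```python
# quick check of the out_dim scaling in the diff (sizes are illustrative)

def scaled_out_dim(out_dim, orthog_projected_values, orthog_projected_values_per_head):
    return out_dim * max(1, int(orthog_projected_values) + int(orthog_projected_values_per_head))

base = 64 * 8  # dim_head * heads

assert scaled_out_dim(base, False, False) == base      # feature disabled: width unchanged
assert scaled_out_dim(base, True,  False) == base      # whole-tensor projection replaces out: same width
assert scaled_out_dim(base, False, True)  == base      # per-head projection alone: same width
assert scaled_out_dim(base, True,  True)  == base * 2  # both: projections are concatenated before to_out
```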
@@ -2048,6 +2073,25 @@ class Attention(Module):
              gates = self.to_v_gate(x)
              out = out * self.to_v_gate_activation(gates)

+         # maybe orthogonal projected weighted values - "belief" attention
+
+         if self.orthog_projected_values or self.orthog_projected_values_per_head:
+             orthog_projected = []
+             v_for_proj = self.merge_heads(orig_values)
+
+             if self.orthog_projected_values:
+                 projected = orthog_project(out, v_for_proj)
+                 orthog_projected.append(projected)
+
+             if self.orthog_projected_values_per_head:
+                 v_for_proj = rearrange(v_for_proj, 'b n (h d) -> b n h d', h = h)
+                 out = rearrange(out, 'b n (h d) -> b n h d', h = h)
+                 projected = orthog_project(out, v_for_proj)
+                 projected = rearrange(projected, 'b n h d -> b n (h d)')
+                 orthog_projected.append(projected)
+
+             out = cat(orthog_projected, dim = -1)
+
          # combine the heads

          out = self.to_out(out)
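Tying the forward change to the `__init__` change, here is a shape-only sketch with placeholder tensors (the random tensors merely stand in for the two projections; no attention is computed): concatenating the whole-tensor and per-head projections doubles the width that `to_out`, roughly an `nn.Linear(out_dim, dim)`, has to accept, which is exactly the `out_dim` doubling above.

```python
import torch
from torch import nn

# shape-only illustration with made-up sizes: b = batch, n = sequence, heads * dim_head = value width

b, n, heads, dim_head, dim = 2, 16, 8, 64, 512
out_dim = dim_head * heads * 2                       # doubled, matching the __init__ change

whole_proj = torch.randn(b, n, dim_head * heads)     # stands in for orthog_project(out, v_for_proj)
per_head_proj = torch.randn(b, n, dim_head * heads)  # stands in for the per-head projection, heads merged back

out = torch.cat([whole_proj, per_head_proj], dim = -1)
to_out = nn.Linear(out_dim, dim)                     # roughly what self.to_out is in the simple case

assert to_out(out).shape == (b, n, dim)
```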