x-transformers 2.6.7__tar.gz → 2.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.6.7 → x_transformers-2.7.1}/PKG-INFO +17 -6
- {x_transformers-2.6.7 → x_transformers-2.7.1}/README.md +16 -5
- {x_transformers-2.6.7 → x_transformers-2.7.1}/pyproject.toml +1 -1
- {x_transformers-2.6.7 → x_transformers-2.7.1}/tests/test_x_transformers.py +27 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/continuous.py +10 -1
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/nonautoregressive_wrapper.py +49 -12
- {x_transformers-2.6.7 → x_transformers-2.7.1}/.github/FUNDING.yml +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/.gitignore +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/LICENSE +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/data/README.md +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/data/enwik8.gz +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/all-attention.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/attention-on-attention.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/deepnorm.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/fcm.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/ffglu.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/flash-attention.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/gate_values.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/gating.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/macaron-1.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/macaron-2.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/memory-transformer.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/normformer.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/pia.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/resi_dual.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/residual_attn.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/rezero.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/rotary.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/sandwich-2.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/sandwich.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/sandwich_norm.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/scalenorm.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/talking-heads.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/topk-attention.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/images/xval.png +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_belief_state.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_copy.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_enwik8.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_length_extrapolate.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/train_parity.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/__init__.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/attend.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/dpo.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/x_transformers.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/xval.py +0 -0
{x_transformers-2.6.7 → x_transformers-2.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.6.7
+Version: 2.7.1
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers

@@ -2509,11 +2509,22 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 
 ```bibtex
 @misc{openai_gpt_oss,
-
-
-
-
-
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
+```bibtex
+@article{Sahoo2024SimpleAE,
+    title = {Simple and Effective Masked Diffusion Language Models},
+    author = {Subham Sekhar Sahoo and Marianne Arriola and Yair Schiff and Aaron Gokaslan and Edgar Marroquin and Justin T Chiu and Alexander Rush and Volodymyr Kuleshov},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2406.07524},
+    url = {https://api.semanticscholar.org/CorpusID:270380319}
 }
 ```
 
{x_transformers-2.6.7 → x_transformers-2.7.1}/README.md

@@ -2461,11 +2461,22 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 
 ```bibtex
 @misc{openai_gpt_oss,
-
-
-
-
-
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
+```bibtex
+@article{Sahoo2024SimpleAE,
+    title = {Simple and Effective Masked Diffusion Language Models},
+    author = {Subham Sekhar Sahoo and Marianne Arriola and Yair Schiff and Aaron Gokaslan and Edgar Marroquin and Justin T Chiu and Alexander Rush and Volodymyr Kuleshov},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2406.07524},
+    url = {https://api.semanticscholar.org/CorpusID:270380319}
 }
 ```
 
{x_transformers-2.6.7 → x_transformers-2.7.1}/tests/test_x_transformers.py

@@ -1287,3 +1287,30 @@ def test_accept_layer_intermediates():
     )
 
     assert embeds.shape == (3, 32, 512)
+
+@pytest.mark.parametrize('use_loss_weight', (False, True))
+def test_simple_mdlm(
+    use_loss_weight
+):
+    from x_transformers.nonautoregressive_wrapper import NonAutoregressiveWrapper
+
+    model = TransformerWrapper(
+        num_tokens = 256 + 1,
+        max_seq_len = 1024,
+        attn_layers = Encoder(
+            dim = 512,
+            depth = 4,
+            rotary_pos_emb = True
+        )
+    )
+
+    nar = NonAutoregressiveWrapper(
+        model,
+        mask_id = 256,
+        use_simple_mdlm_loss_weight = use_loss_weight
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    loss = nar(seq)
+    loss.loss.backward()
{x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/continuous.py

@@ -241,6 +241,7 @@ class ContinuousAutoregressiveWrapper(Module):
         self,
         net: ContinuousTransformerWrapper,
         loss_fn: Module | None = None,
+        use_l1_loss = False,
         equal_loss_weight_batch = False, # setting this to True, if the mask is passed in and sequences are variable in length, each sequence will be weighted the same (as opposed to each token)
     ):
         super().__init__()

@@ -250,7 +251,15 @@ class ContinuousAutoregressiveWrapper(Module):
         probabilistic = net.probabilistic
         self.probabilistic = probabilistic
 
-
+        # default loss function
+
+        if not exists(loss_fn):
+            if probabilistic:
+                loss_fn = GaussianNLL()
+            elif use_l1_loss:
+                loss_fn = nn.L1Loss(reduction = 'none')
+            else:
+                loss_fn = nn.MSELoss(reduction = 'none')
 
         self.loss_fn = loss_fn
         self.equal_loss_weight_batch = equal_loss_weight_batch
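For orientation, a minimal usage sketch of the new `use_l1_loss` flag on `ContinuousAutoregressiveWrapper`, modeled on the continuous-transformer example in the project README. The model dimensions and mock data below are illustrative placeholders, not values from this diff, and the call pattern assumes the wrapper's forward still returns a scalar training loss as in that README example.

```python
import torch
from x_transformers import Decoder
from x_transformers.continuous import (
    ContinuousTransformerWrapper,
    ContinuousAutoregressiveWrapper
)

# illustrative model sizes, not taken from the diff
model = ContinuousTransformerWrapper(
    dim_in = 32,
    dim_out = 32,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8
    )
)

# with loss_fn left as None and use_l1_loss = True,
# the wrapper now defaults to nn.L1Loss instead of nn.MSELoss
wrapper = ContinuousAutoregressiveWrapper(model, use_l1_loss = True)

seq = torch.randn(1, 1024, 32)   # mock continuous sequence

loss = wrapper(seq)
loss.backward()
```

Per the diff, the flag only matters when no explicit `loss_fn` is supplied and the network is not probabilistic (the probabilistic case still defaults to `GaussianNLL`).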
{x_transformers-2.6.7 → x_transformers-2.7.1}/x_transformers/nonautoregressive_wrapper.py

@@ -1,16 +1,20 @@
+from __future__ import annotations
+
 import math
 from random import random
 from contextlib import nullcontext
 from collections import namedtuple
 
 import torch
+from torch import nn, pi
+from torch.nn import Module
+from torch.func import grad_and_value, vmap
 import torch.nn.functional as F
-from torch import nn
 
+import einx
 from einops import rearrange, repeat, pack, unpack
 
 from x_transformers.x_transformers import TransformerWrapper
-from typing import Optional
 
 # constants
 

@@ -75,12 +79,12 @@ def linear_schedule(t):
 
 def cosine_schedule(t):
     """ https://arxiv.org/abs/2202.04200 """
-    return torch.cos(t *
+    return torch.cos(t * pi / 2)
 
 # self token critic
 # inspired by Nijkamp et al. - https://aclanthology.org/2021.naacl-main.409/
 
-class SelfCritic(nn.Module):
+class SelfCritic(Module):
     def __init__(self, net):
         super().__init__()
         self.net = net

@@ -92,7 +96,7 @@ class SelfCritic(nn.Module):
         embed = self.net(x, return_embeddings = True)
         return self.to_logits(embed)
 
-class NonAutoregressiveWrapper(nn.Module):
+class NonAutoregressiveWrapper(Module):
     """
     https://arxiv.org/abs/1904.09324
     https://arxiv.org/abs/2202.04200

@@ -110,9 +114,10 @@ class NonAutoregressiveWrapper(nn.Module):
         random_token_prob = 0.1,         # which percentage of tokens to be replaced with random token, done in original MLM paper
         schedule = 'linear',
         can_mask_prev_unmasked = False,  # when unmasking, whether it can remask previously unmasked
-        token_critic:
+        token_critic: TransformerWrapper | None = None,
         self_token_critic = False,
-        critic_loss_weight = 1
+        critic_loss_weight = 1.,
+        use_simple_mdlm_loss_weight = True # Sahoo et al. https://arxiv.org/abs/2406.07524
     ):
         super().__init__()
         assert not (self_token_critic and exists(token_critic))

@@ -143,6 +148,23 @@ class NonAutoregressiveWrapper(nn.Module):
         else:
             raise ValueError(f'invalid schedule {schedule}')
 
+        # whether to use the loss weighting proposed in simple diffusion lm paper
+
+        self.loss_weight_fn = None
+
+        if use_simple_mdlm_loss_weight:
+            grad_and_value_schedule_fn = vmap(grad_and_value(self.schedule_fn))
+
+            # eq (10)
+
+            def loss_weight_fn(times):
+                grad, value = grad_and_value_schedule_fn(times)
+                return grad / (1. - value)
+
+            self.loss_weight_fn = loss_weight_fn
+
+        # whether to mask previous - in the simple mdlm paper, they chose not to
+
         self.can_mask_prev_unmasked = can_mask_prev_unmasked
 
         # self conditioning

@@ -311,12 +333,27 @@ class NonAutoregressiveWrapper(nn.Module):
 
         loss_fn = F.cross_entropy if not self.net.output_is_log_prob else F.nll_loss
 
-        #
+        # loss
 
-
-
-
-
+        if exists(self.loss_weight_fn):
+            # using simple mdlm loss weighting
+
+            loss = loss_fn(
+                rearrange(logits, 'b n l -> b l n'),
+                orig_seq,
+                reduction = 'none'
+            )
+
+            loss_weights = self.loss_weight_fn(rand_times) # calculate loss weight
+            loss = einx.multiply('b n, b', loss, loss_weights) # apply loss weights
+
+            loss = loss[mask].mean()
+
+        else:
+            loss = loss_fn(
+                logits[mask],
+                orig_seq[mask],
+            )
 
         if not exists(self.token_critic) or only_train_generator:
             return Losses(loss, loss, None)
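To make the eq (10) weighting above concrete outside the wrapper: `grad_and_value` composed with `vmap` turns the scalar masking schedule into a per-sample weight schedule'(t) / (1 - schedule(t)), evaluated at the batch of sampled times. The sketch below uses a toy increasing schedule purely for illustration, not the wrapper's registered `schedule_fn`.

```python
import torch
from torch.func import grad_and_value, vmap

# toy schedule for illustration only: maps time t in [0, 1] to a mask fraction
def toy_schedule(t):
    return t

# mirrors the construction in the diff:
# weight(t) = d/dt schedule(t) / (1 - schedule(t))
grad_and_value_schedule_fn = vmap(grad_and_value(toy_schedule))

def loss_weight_fn(times):
    grad, value = grad_and_value_schedule_fn(times)
    return grad / (1. - value)

times = torch.tensor([0.25, 0.5, 0.75])   # one sampled time per batch element

weights = loss_weight_fn(times)           # 1 / (1 - t) for this toy schedule
print(weights)                            # ~ [1.33, 2.00, 4.00]
```

In the wrapper itself these per-sample weights are broadcast over tokens with `einx.multiply('b n, b', ...)` and applied to the unreduced cross entropy before the masked-position mean is taken.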