x-transformers 2.6.6__tar.gz → 2.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.6.6 → x_transformers-2.7.0}/PKG-INFO +17 -6
- {x_transformers-2.6.6 → x_transformers-2.7.0}/README.md +16 -5
- {x_transformers-2.6.6 → x_transformers-2.7.0}/pyproject.toml +1 -1
- {x_transformers-2.6.6 → x_transformers-2.7.0}/tests/test_x_transformers.py +62 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/attend.py +1 -1
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/nonautoregressive_wrapper.py +49 -12
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/x_transformers.py +30 -2
- {x_transformers-2.6.6 → x_transformers-2.7.0}/.github/FUNDING.yml +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/.gitignore +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/LICENSE +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/data/README.md +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/data/enwik8.gz +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/all-attention.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/attention-on-attention.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/deepnorm.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/fcm.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/ffglu.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/flash-attention.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/gate_values.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/gating.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/macaron-1.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/macaron-2.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/memory-transformer.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/normformer.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/pia.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/resi_dual.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/residual_attn.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/rezero.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/rotary.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/sandwich-2.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/sandwich.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/sandwich_norm.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/scalenorm.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/talking-heads.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/topk-attention.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/images/xval.png +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_belief_state.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_copy.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_enwik8.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_length_extrapolate.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/train_parity.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/__init__.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/continuous.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/dpo.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/xval.py +0 -0
{x_transformers-2.6.6 → x_transformers-2.7.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.6.6
+Version: 2.7.0
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2509,11 +2509,22 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 
 ```bibtex
 @misc{openai_gpt_oss,
-
-
-
-
-
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
+```bibtex
+@article{Sahoo2024SimpleAE,
+    title = {Simple and Effective Masked Diffusion Language Models},
+    author = {Subham Sekhar Sahoo and Marianne Arriola and Yair Schiff and Aaron Gokaslan and Edgar Marroquin and Justin T Chiu and Alexander Rush and Volodymyr Kuleshov},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2406.07524},
+    url = {https://api.semanticscholar.org/CorpusID:270380319}
 }
 ```
 
{x_transformers-2.6.6 → x_transformers-2.7.0}/README.md

@@ -2461,11 +2461,22 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 
 ```bibtex
 @misc{openai_gpt_oss,
-
-
-
-
-
+    author = {OpenAI},
+    title = {Introducing gpt-oss},
+    howpublished = {https://openai.com/index/introducing-gpt-oss},
+    month = {August},
+    year = {2025}
+}
+```
+
+```bibtex
+@article{Sahoo2024SimpleAE,
+    title = {Simple and Effective Masked Diffusion Language Models},
+    author = {Subham Sekhar Sahoo and Marianne Arriola and Yair Schiff and Aaron Gokaslan and Edgar Marroquin and Justin T Chiu and Alexander Rush and Volodymyr Kuleshov},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2406.07524},
+    url = {https://api.semanticscholar.org/CorpusID:270380319}
 }
 ```
 
{x_transformers-2.6.6 → x_transformers-2.7.0}/tests/test_x_transformers.py

@@ -1252,3 +1252,65 @@ def test_learned_head_attn_sink():
     seq = torch.randint(0, 20000, (3, 1024))
 
     logits = model(seq)
+
+def test_accept_layer_intermediates():
+    from x_transformers import TransformerWrapper, Decoder, AutoregressiveWrapper
+
+    vlm = TransformerWrapper(
+        num_tokens = 20000,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 512,
+            depth = 3,
+            heads = 4,
+        )
+    )
+
+    seq = torch.randint(0, 20000, (3, 1024))
+    mask = torch.randint(0, 2, (3, 1024)).bool()
+
+    _, intermediates = vlm(seq, return_intermediates = True)
+
+    action_model = Decoder(
+        dim = 512,
+        depth = 6,
+        heads = 8,
+    )
+
+    seq = torch.randn(3, 32, 512)
+
+    embeds = action_model(
+        seq,
+        self_attn_additional_kv = intermediates,
+        detach_additional_kv = True,
+        additional_kv_mask = mask
+    )
+
+    assert embeds.shape == (3, 32, 512)
+
+@pytest.mark.parametrize('use_loss_weight', (False, True))
+def test_simple_mdlm(
+    use_loss_weight
+):
+    from x_transformers.nonautoregressive_wrapper import NonAutoregressiveWrapper
+
+    model = TransformerWrapper(
+        num_tokens = 256 + 1,
+        max_seq_len = 1024,
+        attn_layers = Encoder(
+            dim = 512,
+            depth = 4,
+            rotary_pos_emb = True
+        )
+    )
+
+    nar = NonAutoregressiveWrapper(
+        model,
+        mask_id = 256,
+        use_simple_mdlm_loss_weight = use_loss_weight
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    loss = nar(seq)
+    loss.loss.backward()
{x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/attend.py

@@ -23,7 +23,7 @@ class Intermediates:
     pre_softmax_attn: Tensor | None = None
     post_softmax_attn: Tensor | None = None
     values: Tensor | None = None
-    cached_kv:
+    cached_kv: tuple[Tensor, Tensor] | None = None
     layer_type: str | None = None
     hybrid_hidden: Tensor | None = None
 
{x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/nonautoregressive_wrapper.py

@@ -1,16 +1,20 @@
+from __future__ import annotations
+
 import math
 from random import random
 from contextlib import nullcontext
 from collections import namedtuple
 
 import torch
+from torch import nn, pi
+from torch.nn import Module
+from torch.func import grad_and_value, vmap
 import torch.nn.functional as F
-from torch import nn
 
+import einx
 from einops import rearrange, repeat, pack, unpack
 
 from x_transformers.x_transformers import TransformerWrapper
-from typing import Optional
 
 # constants
 
@@ -75,12 +79,12 @@ def linear_schedule(t):
 
 def cosine_schedule(t):
     """ https://arxiv.org/abs/2202.04200 """
-    return torch.cos(t *
+    return torch.cos(t * pi / 2)
 
 # self token critic
 # inspired by Nijkamp et al. - https://aclanthology.org/2021.naacl-main.409/
 
-class SelfCritic(nn.Module):
+class SelfCritic(Module):
     def __init__(self, net):
         super().__init__()
         self.net = net
@@ -92,7 +96,7 @@ class SelfCritic(nn.Module):
         embed = self.net(x, return_embeddings = True)
         return self.to_logits(embed)
 
-class NonAutoregressiveWrapper(nn.Module):
+class NonAutoregressiveWrapper(Module):
     """
     https://arxiv.org/abs/1904.09324
     https://arxiv.org/abs/2202.04200
@@ -110,9 +114,10 @@ class NonAutoregressiveWrapper(nn.Module):
         random_token_prob = 0.1,        # which percentage of tokens to be replaced with random token, done in original MLM paper
         schedule = 'linear',
         can_mask_prev_unmasked = False, # when unmasking, whether it can remask previously unmasked
-        token_critic:
+        token_critic: TransformerWrapper | None = None,
         self_token_critic = False,
-        critic_loss_weight = 1
+        critic_loss_weight = 1.,
+        use_simple_mdlm_loss_weight = True # Sahoo et al. https://arxiv.org/abs/2406.07524
     ):
         super().__init__()
         assert not (self_token_critic and exists(token_critic))
@@ -143,6 +148,23 @@ class NonAutoregressiveWrapper(nn.Module):
         else:
             raise ValueError(f'invalid schedule {schedule}')
 
+        # whether to use the loss weighting proposed in simple diffusion lm paper
+
+        self.loss_weight_fn = None
+
+        if use_simple_mdlm_loss_weight:
+            grad_and_value_schedule_fn = vmap(grad_and_value(self.schedule_fn))
+
+            # eq (10)
+
+            def loss_weight_fn(times):
+                grad, value = grad_and_value_schedule_fn(times)
+                return grad / (1. - value)
+
+            self.loss_weight_fn = loss_weight_fn
+
+        # whether to mask previous - in the simple mdlm paper, they chose not to
+
         self.can_mask_prev_unmasked = can_mask_prev_unmasked
 
         # self conditioning
@@ -311,12 +333,27 @@ class NonAutoregressiveWrapper(nn.Module):
 
         loss_fn = F.cross_entropy if not self.net.output_is_log_prob else F.nll_loss
 
-        #
+        # loss
 
-
-
-
-
+        if exists(self.loss_weight_fn):
+            # using simple mdlm loss weighting
+
+            loss = loss_fn(
+                rearrange(logits, 'b n l -> b l n'),
+                orig_seq,
+                reduction = 'none'
+            )
+
+            loss_weights = self.loss_weight_fn(rand_times)     # calculate loss weight
+            loss = einx.multiply('b n, b', loss, loss_weights) # apply loss weights
+
+            loss = loss[mask].mean()
+
+        else:
+            loss = loss_fn(
+                logits[mask],
+                orig_seq[mask],
+            )
 
         if not exists(self.token_critic) or only_train_generator:
             return Losses(loss, loss, None)
{x_transformers-2.6.6 → x_transformers-2.7.0}/x_transformers/x_transformers.py

@@ -10,7 +10,7 @@ import torch
 from torch.amp import autocast
 import torch.nn.functional as F
 from torch import nn, einsum, tensor, Tensor, cat, stack, arange, is_tensor
-from torch.utils._pytree import tree_flatten, tree_unflatten
+from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
 from torch.nn import Module, ModuleList, ModuleDict
 
 from functools import partial, wraps
@@ -81,6 +81,9 @@ def cast_tuple(val, depth = 1):
 def divisible_by(num, den):
     return (num % den) == 0
 
+def detach_all(obj):
+    return tree_map(lambda t: t.detach() if is_tensor(t) and t.requires_grad else t, obj)
+
 def maybe(fn = None):
     if not exists(fn):
         fn = identity
@@ -157,6 +160,19 @@ def or_reduce(masks):
         head = head | rest
     return head
 
+# cache helpers
+
+def get_cached_kvs(
+    cache: LayerIntermediates
+) -> list[tuple[Tensor, Tensor]]:
+
+    cached_kvs = []
+
+    for attn_intermediate in cache.attn_intermediates:
+        cached_kvs.append(attn_intermediate.cached_kv)
+
+    return cached_kvs
+
 # entropy
 
 def calc_entropy(
@@ -2441,8 +2457,13 @@ class AttentionLayers(Module):
         context_pos = None,
         attn_bias = None,
         deep_embeds_and_ids: tuple[nn.Parameter, Tensor] | None = None,
-        self_attn_additional_kv:
+        self_attn_additional_kv: (
+            LayerIntermediates |
+            list[tuple[Tensor, Tensor]]
+            | None
+        ) = None,
         additional_kv_mask = None,
+        detach_additional_kv = False,
         route_additional_kv_to_top = True,
         condition = None,
         in_attn_cond = None, # https://arxiv.org/abs/2105.04090
@@ -2590,6 +2611,13 @@ class AttentionLayers(Module):
         # additional self attn key / values - say coming from vlm
 
         if exists(self_attn_additional_kv) and route_additional_kv_to_top:
+
+            if isinstance(self_attn_additional_kv, LayerIntermediates):
+                self_attn_additional_kv = get_cached_kvs(self_attn_additional_kv)
+
+            if detach_additional_kv:
+                self_attn_additional_kv = detach_all(self_attn_additional_kv)
+
             num_self_attns = sum([layer_type == 'a' for layer_type in first(layer_variables)])
 
             self_attn_additional_kv = self_attn_additional_kv[-num_self_attns:]