x-transformers 2.0.1__tar.gz → 2.0.3__tar.gz
- {x_transformers-2.0.1 → x_transformers-2.0.3}/PKG-INFO +13 -2
- {x_transformers-2.0.1 → x_transformers-2.0.3}/README.md +11 -1
- {x_transformers-2.0.1 → x_transformers-2.0.3}/pyproject.toml +6 -2
- {x_transformers-2.0.1 → x_transformers-2.0.3}/train_parity.py +16 -8
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/x_transformers.py +10 -3
- {x_transformers-2.0.1 → x_transformers-2.0.3}/.github/FUNDING.yml +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/.gitignore +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/LICENSE +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/all-attention.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/attention-on-attention.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/deepnorm.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/fcm.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/ffglu.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/flash-attention.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/gate_values.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/gating.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/macaron-1.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/macaron-2.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/memory-transformer.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/normformer.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/pia.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/resi_dual.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/residual_attn.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/rezero.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/rotary.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/sandwich-2.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/sandwich.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/sandwich_norm.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/scalenorm.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/talking-heads.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/topk-attention.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/images/xval.png +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/tests/test_x_transformers.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/train_copy.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/train_enwik8.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/__init__.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/attend.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/continuous.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/dpo.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/xval.py +0 -0
{x_transformers-2.0.1 → x_transformers-2.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.0.1
+Version: 2.0.3
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -40,6 +40,7 @@ Requires-Dist: loguru
 Requires-Dist: packaging>=21.0
 Requires-Dist: torch>=2.0
 Provides-Extra: examples
+Requires-Dist: lion-pytorch; extra == 'examples'
 Requires-Dist: torchvision; extra == 'examples'
 Requires-Dist: tqdm; extra == 'examples'
 Provides-Extra: test
@@ -949,7 +950,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
 
@@ -1838,6 +1840,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
{x_transformers-2.0.1 → x_transformers-2.0.3}/README.md

@@ -901,7 +901,8 @@ model_xl = TransformerWrapper(
         dim = 512,
         depth = 6,
         heads = 8,
-        rotary_pos_emb = True
+        rotary_pos_emb = True,
+        rotate_num_heads = 4 # only rotate 4 out of the 8 attention heads
     )
 )
 
@@ -1790,6 +1791,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@inproceedings{Yang2025RopeTN,
+    title = {Rope to Nope and Back Again: A New Hybrid Attention Strategy},
+    author = {Bowen Yang and Bharat Venkitesh and Dwarak Talupuru and Hangyu Lin and David Cairuz and Phil Blunsom and Acyr F. Locatelli},
+    year = {2025},
+    url = {https://api.semanticscholar.org/CorpusID:276079501}
+}
+```
+
 ```bibtex
 @inproceedings{Chen2023ExtendingCW,
     title = {Extending Context Window of Large Language Models via Positional Interpolation},
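For orientation, the README snippet touched by the `@@ -901` hunk is the Transformer-XL style example; a minimal sketch of the updated call with the new `rotate_num_heads` argument follows. The `num_tokens`, `max_seq_len` and `max_mem_len` values are assumed from the surrounding README example and are not part of this diff.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model_xl = TransformerWrapper(
    num_tokens = 20000,          # assumed surrounding values, not shown in this diff
    max_seq_len = 512,
    max_mem_len = 2048,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        rotary_pos_emb = True,
        rotate_num_heads = 4     # new in this release range: rotary applied to only 4 of the 8 heads
    )
)

x = torch.randint(0, 20000, (1, 512))
logits = model_xl(x)             # (1, 512, 20000)
```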
{x_transformers-2.0.1 → x_transformers-2.0.3}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.0.1"
+version = "2.0.3"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -34,7 +34,11 @@ Homepage = "https://pypi.org/project/x-transformers/"
 Repository = "https://github.com/lucidrains/x-transformers"
 
 [project.optional-dependencies]
-examples = [
+examples = [
+    "lion-pytorch",
+    "tqdm",
+    "torchvision"
+]
 
 test = [
     "pytest",
{x_transformers-2.0.1 → x_transformers-2.0.3}/train_parity.py

@@ -7,12 +7,16 @@ from x_transformers import TransformerWrapper, Decoder
 
 # constants
 
-NUM_BATCHES = 100000
 BATCH_SIZE = 256
 LEARNING_RATE = 3e-4
 EVAL_EVERY = 500
-
+
 EVAL_LENGTHS = (16, 32, 64, 128, 256, 512)
+TRAIN_MAX_LENGTH = EVAL_LENGTHS[-2]
+
+LOSS_THRES_INCREASE_LEN = 1e-3
+MEET_CRITERIA_THRES_INCREASE_LEN = 10
+
 HYBRIDIZE_WITH_RNN = True
 
 # rnn for fully resolving state tracking by hybridization
@@ -28,6 +32,7 @@ if HYBRIDIZE_WITH_RNN:
 
     decoder_kwargs = dict(
        attn_hybrid_fold_axial_dim = 4, # even if recurrence is every 4 tokens, can generalize for parity
+       attn_hybrid_learned_mix = True,
        attn_hybrid_module = GRU(dim, dim_head * heads, batch_first = True)
    )
 
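As a rough sketch of how these `attn_`-prefixed kwargs reach the attention layers, the snippet below splats them into a `Decoder` inside a `TransformerWrapper`, as the script does in the `@@ -48` hunk further down. The token count, sequence length and layer sizes here are illustrative assumptions; only the hybrid kwargs come from this diff.

```python
import torch
from torch.nn import GRU
from x_transformers import TransformerWrapper, Decoder

dim, heads, dim_head = 64, 4, 32      # assumed toy sizes, not taken from the diff

decoder_kwargs = dict(
    attn_hybrid_fold_axial_dim = 4,   # recurrence every 4 tokens still generalizes for parity
    attn_hybrid_learned_mix = True,   # new flag: learn a per-head gate between attention and RNN outputs
    attn_hybrid_module = GRU(dim, dim_head * heads, batch_first = True)
)

model = TransformerWrapper(
    num_tokens = 2,                   # assumed: parity over binary tokens
    max_seq_len = 256,                # assumed
    attn_layers = Decoder(
        dim = dim,
        depth = 3,                    # assumed
        heads = heads,
        attn_dim_head = dim_head,
        **decoder_kwargs              # attn_* kwargs are routed to every Attention block
    )
)

seq = torch.randint(0, 2, (2, 64))
logits = model(seq)                   # (2, 64, 2)
```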
@@ -48,7 +53,9 @@ model = TransformerWrapper(
 
 # optimizer
 
-
+from lion_pytorch.cautious_lion import Lion
+
+optimizer = Lion(model.parameters(), lr = LEARNING_RATE, cautious_factor = 0.1)
 
 # data generator
 
@@ -73,7 +80,8 @@ meet_criteria = 0
 train_seq_len = 1
 stop_length = EVAL_LENGTHS[-2]
 
-with tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training') as pbar:
+with tqdm.tqdm(mininterval = 10., desc = 'training') as pbar:
+
     while train_seq_len < stop_length:
         model.train()
 
@@ -90,12 +98,12 @@ with tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training') as pbar
         last_loss = loss[:, -1].mean()
         loss.mean().backward()
 
-        if last_loss.item() < 1e-3:
+        if last_loss.item() < LOSS_THRES_INCREASE_LEN:
             meet_criteria += 1
         else:
             meet_criteria = 0
 
-        if meet_criteria >= 10:
+        if meet_criteria >= MEET_CRITERIA_THRES_INCREASE_LEN:
             meet_criteria = 0
             train_seq_len += 1
             print(f'criteria met, incrementing to {train_seq_len}')
@@ -103,8 +111,8 @@ with tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training') as pbar
             print(f'({train_seq_len})| {i}: {last_loss.item()}')
         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
 
-
-
+        optimizer.step()
+        optimizer.zero_grad()
 
         last_step = train_seq_len == stop_length
 
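Taken together, the training-script changes swap in a cautious Lion optimizer and drive a length curriculum: the training sequence length grows only after the loss at the final position has stayed under `LOSS_THRES_INCREASE_LEN` for `MEET_CRITERIA_THRES_INCREASE_LEN` consecutive steps. Below is a self-contained toy sketch of just that control flow; names and constants mirror the diff, but the real parity transformer and data generator are replaced with a stand-in linear model, and an iteration cap is added so the sketch terminates.

```python
import torch
from torch import nn
from lion_pytorch.cautious_lion import Lion

# constants mirrored from the diff
LEARNING_RATE = 3e-4
LOSS_THRES_INCREASE_LEN = 1e-3
MEET_CRITERIA_THRES_INCREASE_LEN = 10
EVAL_LENGTHS = (16, 32, 64, 128, 256, 512)

model = nn.Linear(8, 2)               # stand-in model so the sketch runs; the script trains the hybrid TransformerWrapper
optimizer = Lion(model.parameters(), lr = LEARNING_RATE, cautious_factor = 0.1)

meet_criteria = 0
train_seq_len = 1
stop_length = EVAL_LENGTHS[-2]

for step in range(100_000):           # iteration cap added for the sketch
    if train_seq_len >= stop_length:
        break

    model.train()

    x = torch.randn(4, train_seq_len, 8)
    loss = model(x).pow(2)            # stand-in per-token loss of shape (batch, seq, out)
    last_loss = loss[:, -1].mean()    # loss at the final position only
    loss.mean().backward()

    # curriculum: lengthen sequences only after the last-token loss has been
    # below the threshold for several consecutive steps
    if last_loss.item() < LOSS_THRES_INCREASE_LEN:
        meet_criteria += 1
    else:
        meet_criteria = 0

    if meet_criteria >= MEET_CRITERIA_THRES_INCREASE_LEN:
        meet_criteria = 0
        train_seq_len += 1

    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    optimizer.zero_grad()
```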
{x_transformers-2.0.1 → x_transformers-2.0.3}/x_transformers/x_transformers.py

@@ -1204,6 +1204,7 @@ class Attention(Module):
         hybrid_module: Module | None = None,
         hybrid_mask_kwarg: str | None = None,
         hybrid_fold_axial_dim: int | None = None,
+        hybrid_learned_mix = False,
         one_kv_head = False,
         kv_heads = None,
         value_dim_head = None,
@@ -1446,7 +1447,7 @@ class Attention(Module):
 
         if exists(hybrid_module) and exists(hybrid_fold_axial_dim):
             hybrid_module = FoldAxially(axial_dim = hybrid_fold_axial_dim, fn = hybrid_module)
-        hybrid_mix = LinearNoBias(dim, heads)
+        hybrid_mix = LinearNoBias(dim, heads) if hybrid_learned_mix else None
 
         hybrid_norms = ModuleList([
             MultiheadRMSNorm(dim_head, heads = heads),
@@ -1779,7 +1780,12 @@ class Attention(Module):
             out = out_norm(out)
             hybrid_out = hybrid_out_norm(hybrid_out)
 
-
+            if exists(self.hybrid_mix):
+                mix = self.hybrid_mix(x)
+                mix = rearrange(mix, 'b n h -> b h n 1')
+                out = out.lerp(hybrid_out, mix.sigmoid())
+            else:
+                out = 0.5 * (out + hybrid_out)
 
         # merge heads
 
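The forward-pass change above is the heart of the feature: when `hybrid_learned_mix` is enabled, a bias-free linear layer produces one gate logit per head per position, and the attention output is blended with the hybrid-branch output through a sigmoid gate; otherwise the two branches are simply averaged. A minimal standalone sketch of that blending step, with tensor shapes assumed to match the surrounding code (`out` and `hybrid_out` as `(batch, heads, seq, dim_head)`, `x` as `(batch, seq, dim)`):

```python
import torch
from torch import nn
from einops import rearrange

batch, heads, seq, dim, dim_head = 2, 8, 16, 512, 64

x          = torch.randn(batch, seq, dim)               # layer input
out        = torch.randn(batch, heads, seq, dim_head)   # attention branch output
hybrid_out = torch.randn(batch, heads, seq, dim_head)   # hybrid (e.g. GRU) branch output

hybrid_mix = nn.Linear(dim, heads, bias = False)        # LinearNoBias(dim, heads) in the diff

mix = hybrid_mix(x)                                     # (batch, seq, heads): one gate logit per head per position
mix = rearrange(mix, 'b n h -> b h n 1')
out = out.lerp(hybrid_out, mix.sigmoid())               # (1 - g) * out + g * hybrid_out

# without the learned mix, the two branches are simply averaged:
# out = 0.5 * (out + hybrid_out)
```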
@@ -1839,6 +1845,7 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
         weight_tie_layers = False,
         custom_layers: tuple[str, ...] | None = None,
         layers_execute_order: tuple[int, ...] | None = None,
@@ -2141,7 +2148,7 @@ class AttentionLayers(Module):
 
             if layer_type == 'a':
                 self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
-                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, **attn_kwargs)
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = qkv_receive_diff_residuals, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
                 is_first_self_attn = False
             elif layer_type == 'c':
                 layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
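The last two hunks plumb `rotate_num_heads` from `AttentionLayers` down to each self-attention block. The idea, following the Yang et al. 2025 reference added to the README, is to apply rotary position embeddings to only a subset of heads and leave the remaining heads position-free (NoPE). The sketch below illustrates that split on raw query/key tensors; it is a simplified illustration, not the library's internal rotary code, and `apply_rotary` plus the choice of which heads get rotated are stand-in assumptions.

```python
import torch

def apply_rotary(t, freqs):
    # simplified rotary embedding: rotate channel pairs by position-dependent angles
    t1, t2 = t.chunk(2, dim = -1)
    cos, sin = freqs.cos(), freqs.sin()
    return torch.cat((t1 * cos - t2 * sin, t2 * cos + t1 * sin), dim = -1)

batch, heads, seq, dim_head = 2, 8, 32, 64
rotate_num_heads = 4                       # rotate only 4 of the 8 heads

q = torch.randn(batch, heads, seq, dim_head)
k = torch.randn(batch, heads, seq, dim_head)

# position-dependent angles, shape (seq, dim_head // 2), broadcast over batch and heads
inv_freq = 1. / (10000 ** (torch.arange(0, dim_head, 2) / dim_head))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)

# split heads: a subset gets RoPE, the rest stay position-free (NoPE)
q_rot, q_pass = q[:, :rotate_num_heads], q[:, rotate_num_heads:]
k_rot, k_pass = k[:, :rotate_num_heads], k[:, rotate_num_heads:]

q = torch.cat((apply_rotary(q_rot, freqs), q_pass), dim = 1)
k = torch.cat((apply_rotary(k_rot, freqs), k_pass), dim = 1)
```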