x-transformers 1.44.0__tar.gz → 1.44.2__tar.gz

Files changed (22)
  1. {x_transformers-1.44.0/x_transformers.egg-info → x_transformers-1.44.2}/PKG-INFO +1 -1
  2. {x_transformers-1.44.0 → x_transformers-1.44.2}/README.md +3 -1
  3. {x_transformers-1.44.0 → x_transformers-1.44.2}/setup.py +1 -1
  4. {x_transformers-1.44.0 → x_transformers-1.44.2}/tests/test_x_transformers.py +17 -2
  5. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/x_transformers.py +20 -2
  6. {x_transformers-1.44.0 → x_transformers-1.44.2/x_transformers.egg-info}/PKG-INFO +1 -1
  7. {x_transformers-1.44.0 → x_transformers-1.44.2}/LICENSE +0 -0
  8. {x_transformers-1.44.0 → x_transformers-1.44.2}/setup.cfg +0 -0
  9. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/__init__.py +0 -0
  10. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/attend.py +0 -0
  11. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/autoregressive_wrapper.py +0 -0
  12. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/continuous.py +0 -0
  13. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/dpo.py +0 -0
  14. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/multi_input.py +0 -0
  15. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/neo_mlp.py +0 -0
  16. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/nonautoregressive_wrapper.py +0 -0
  17. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  18. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers/xval.py +0 -0
  19. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers.egg-info/SOURCES.txt +0 -0
  20. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers.egg-info/dependency_links.txt +0 -0
  21. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers.egg-info/requires.txt +0 -0
  22. {x_transformers-1.44.0 → x_transformers-1.44.2}/x_transformers.egg-info/top_level.txt +0 -0
x_transformers-1.44.0/x_transformers.egg-info/PKG-INFO → x_transformers-1.44.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: x-transformers
-Version: 1.44.0
+Version: 1.44.2
 Summary: X-Transformers - Pytorch
 Home-page: https://github.com/lucidrains/x-transformers
 Author: Phil Wang
x_transformers-1.44.0/README.md → x_transformers-1.44.2/README.md
@@ -317,7 +317,9 @@ model = TransformerWrapper(
 
 Update: MetaAI researchers <a href="https://arxiv.org/abs/2309.16588">have found</a> that adding memory tokens (they call them register tokens), alleviates outliers (which is suspected now to be a pathology of attention networks unable to <a href="https://arxiv.org/abs/2306.12929">attend to nothing</a>).
 
-Update 2: a hybrid architecture out of Nvidia named <a href="https://openreview.net/forum?id=A1ztozypga">Hymba</a> used memory tokens successfully in the autoregressive case, termed meta tokens in their paper
+Update 2: a hybrid architecture out of Nvidia named <a href="https://openreview.net/forum?id=A1ztozypga">Hymba</a> used memory tokens successfully in the autoregressive case, termed meta tokens in their paper.
+
+Update 3: further corroborated by <a href="https://arxiv.org/abs/2501.00663">a paper</a> trying to extend memory in attention networks, termed persistent memory
 
 ### Transformers Without Tears
 
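For context, the memory / register tokens discussed in this README update are exposed through the `num_memory_tokens` argument of `TransformerWrapper`. A minimal sketch, not part of this diff (dimensions are illustrative):

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    num_memory_tokens = 20,   # learned "register" / "meta" tokens prepended to every sequence
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)             # (1, 1024, 20000) - memory tokens are stripped before the logits
```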
x_transformers-1.44.0/setup.py → x_transformers-1.44.2/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'x-transformers',
   packages = find_packages(exclude=['examples']),
-  version = '1.44.0',
+  version = '1.44.2',
   license='MIT',
   description = 'X-Transformers - Pytorch',
   author = 'Phil Wang',
x_transformers-1.44.0/tests/test_x_transformers.py → x_transformers-1.44.2/tests/test_x_transformers.py
@@ -617,7 +617,7 @@ def test_hyper_connections(tanh):
 def test_hybrid():
     from torch.nn import GRU
 
-    model = TransformerWrapper(
+    dec = TransformerWrapper(
         num_tokens = 20000,
         max_seq_len = 1024,
         attn_layers = Decoder(
@@ -631,4 +631,19 @@ def test_hybrid():
 
     x = torch.randint(0, 20000, (2, 1024))
 
-    embed = model(x)
+    embed = dec(x)
+
+    enc = TransformerWrapper(
+        num_tokens = 20000,
+        max_seq_len = 1024,
+        attn_layers = Encoder(
+            dim = 128,
+            depth = 6,
+            heads = 8,
+            attn_dim_head = 64,
+            attn_hybrid_module = GRU(128, 64 * 4, batch_first = True, bidirectional = True)
+        )
+    )
+
+    mask = torch.randint(0, 2, (2, 1024)).bool()
+    embed = enc(x, mask = mask)
x_transformers-1.44.0/x_transformers/x_transformers.py → x_transformers-1.44.2/x_transformers/x_transformers.py
@@ -7,10 +7,11 @@ from random import random, randrange
 from packaging import version
 
 import torch
+from torch.amp import autocast
 import torch.nn.functional as F
 from torch import nn, einsum, Tensor
+from torch.utils._pytree import tree_flatten
 from torch.nn import Module, ModuleList, ModuleDict
-from torch.amp import autocast
 
 from functools import partial, wraps
 from collections import namedtuple
@@ -1138,6 +1139,7 @@ class Attention(Module):
         selective = False,
         custom_attn_fn: Callable | None = None,
         hybrid_module: Module | None = None,
+        hybrid_mask_kwarg: str | None = None,
         one_kv_head = False,
         kv_heads = None,
         shared_kv = False,
@@ -1341,6 +1343,8 @@ class Attention(Module):
 
         self.hybrid_module = deepcopy(hybrid_module) if exists(hybrid_module) else None
 
+        self.hybrid_mask_kwarg = hybrid_mask_kwarg # for bidirectional, can forward `mask` into the hybrid module and let it handle variable lengths
+
         # output dimension by default same as input, but can be overridden
 
         dim_out = default(dim_out, dim)
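The comment above is the whole contract: when the attention layer is non-causal and `hybrid_mask_kwarg` is set, the padding mask is forwarded into the hybrid module under that keyword (see the forward hunk below). A hedged sketch of how this might be wired up; `MaskedGRU` is a hypothetical wrapper, not part of the library, and it assumes `attn_`-prefixed kwargs are routed into `Attention` the same way `attn_hybrid_module` is in the test above:

```python
import torch
from torch import nn
from x_transformers import TransformerWrapper, Encoder

class MaskedGRU(nn.Module):
    # hypothetical module: a bidirectional GRU whose forward accepts a `mask` keyword
    def __init__(self, dim, dim_hidden):
        super().__init__()
        self.gru = nn.GRU(dim, dim_hidden, batch_first = True, bidirectional = True)

    def forward(self, x, mask = None):
        if mask is not None:
            x = x * mask[..., None]   # zero out padded positions before the recurrence
        out, _ = self.gru(x)          # out: (batch, seq, 2 * dim_hidden)
        return out

enc = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Encoder(
        dim = 128,
        depth = 6,
        heads = 8,
        attn_dim_head = 64,
        attn_hybrid_module = MaskedGRU(128, 64 * 4),   # output dim 2 * 256 = 512 = heads * dim_head
        attn_hybrid_mask_kwarg = 'mask'                # assumption: routed to Attention's `hybrid_mask_kwarg`
    )
)

x = torch.randint(0, 20000, (2, 1024))
mask = torch.randint(0, 2, (2, 1024)).bool()
embed = enc(x, mask = mask)   # the same padding mask is now also seen by MaskedGRU
```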
@@ -1592,7 +1596,21 @@ class Attention(Module):
         # hybrid module
 
         if exists(self.hybrid_module):
-            hybrid_out, _ = self.hybrid_module(x)
+
+            # hybrid input
+
+            hybrid_forward_kwargs = dict()
+
+            if not self.causal and exists(self.hybrid_mask_kwarg):
+                hybrid_forward_kwargs = {self.hybrid_mask_kwarg: mask}
+
+            # hybrid forward
+
+            hybrid_outputs = self.hybrid_module(x, **hybrid_forward_kwargs)
+
+            # handle hybrid out
+
+            (hybrid_out, *rest_hybrid_outs), _ = tree_flatten(hybrid_outputs)
             out = 0.5 * (out + hybrid_out)
 
         # alphafold2 styled gating of the values
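The switch to `tree_flatten` is what lets the hybrid module return either a bare tensor or a tuple such as a GRU's `(output, h_n)`: the first flattened tensor becomes the hybrid branch output, and anything else is kept in `rest_hybrid_outs`. A standalone illustration of that unpacking (not from the diff; shapes are illustrative):

```python
import torch
from torch.utils._pytree import tree_flatten

gru = torch.nn.GRU(128, 256, batch_first = True)
x = torch.randn(2, 1024, 128)

outputs = gru(x)                                  # GRU returns a tuple: (output, h_n)
(hybrid_out, *rest), _ = tree_flatten(outputs)    # first leaf is the per-position output
assert hybrid_out.shape == (2, 1024, 256)

single = torch.randn(2, 1024, 256)                # a module returning just a tensor also works
(hybrid_out, *rest), _ = tree_flatten(single)
assert rest == []
```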
x_transformers-1.44.0/PKG-INFO → x_transformers-1.44.2/x_transformers.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: x-transformers
-Version: 1.44.0
+Version: 1.44.2
 Summary: X-Transformers - Pytorch
 Home-page: https://github.com/lucidrains/x-transformers
 Author: Phil Wang