x-transformers 1.42.8__tar.gz → 1.42.9__tar.gz
- {x_transformers-1.42.8/x_transformers.egg-info → x_transformers-1.42.9}/PKG-INFO +1 -1
- {x_transformers-1.42.8 → x_transformers-1.42.9}/setup.py +1 -1
- {x_transformers-1.42.8 → x_transformers-1.42.9}/tests/test_x_transformers.py +21 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/x_transformers.py +19 -8
- {x_transformers-1.42.8 → x_transformers-1.42.9/x_transformers.egg-info}/PKG-INFO +1 -1
- {x_transformers-1.42.8 → x_transformers-1.42.9}/LICENSE +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/README.md +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/setup.cfg +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/__init__.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/attend.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/continuous.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/dpo.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/multi_input.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers/xval.py +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers.egg-info/SOURCES.txt +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers.egg-info/dependency_links.txt +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers.egg-info/requires.txt +0 -0
- {x_transformers-1.42.8 → x_transformers-1.42.9}/x_transformers.egg-info/top_level.txt +0 -0
tests/test_x_transformers.py

@@ -381,6 +381,7 @@ def test_neo_mlp():
     assert out.shape == (3, 7)

 def test_custom_alibi():
+
     model = TransformerWrapper(
         num_tokens = 20_000,
         max_seq_len = 1024,
@@ -398,6 +399,26 @@ def test_custom_alibi():

     logits = model(x, pos = pos)

+def test_custom_alibi_across_heads():
+
+    model = Decoder(
+        dim = 512,
+        depth = 2,
+        heads = 2,
+        alibi_pos_bias = True,
+        rel_pos_kwargs = dict(
+            slopes = [1, 1]
+        ),
+    )
+
+    x = torch.randn(2, 4, 512)
+
+    pos = torch.tensor([
+        [[0, 1, 2, 4], [1, 3, 5, 7]],
+        [[2, 3, 4, 5], [6, 8, 9, 10]]
+    ])
+
+    embed = model(x, pos = pos)

 @pytest.mark.parametrize('embedder_type', ('embedding', 'none', 'custom'))
 def test_embedder(embedder_type):
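Not part of the package diff: a minimal sketch restating the two tests above with the tensor shapes spelled out. It assumes only the public Decoder API the tests already use; pos may now be given per head as (batch, heads, seq) in addition to the shared (batch, seq) form, and rel_pos_kwargs forwards the custom slopes to the ALiBi module.

# hedged sketch, assumes only the public Decoder API shown in the tests above
import torch
from x_transformers import Decoder

model = Decoder(
    dim = 512,
    depth = 2,
    heads = 2,
    alibi_pos_bias = True,
    rel_pos_kwargs = dict(slopes = [1, 1])   # new in 1.42.9: one ALiBi slope per head
)

x = torch.randn(2, 4, 512)                   # (batch, seq, dim) of embeddings

pos = torch.tensor([
    [[0, 1, 2, 4], [1, 3, 5, 7]],            # batch 0: positions for head 0 and head 1
    [[2, 3, 4, 5], [6, 8, 9, 10]]            # batch 1
])                                           # (batch, heads, seq); a shared (batch, seq) pos is still accepted

embed = model(x, pos = pos)                  # (2, 4, 512)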
x_transformers/x_transformers.py

@@ -452,13 +452,20 @@ class DynamicPositionBias(Module):
         return bias

 class AlibiPositionalBias(Module):
-    def __init__(
+    def __init__(
+        self,
+        heads,
+        total_heads = None,
+        slopes: list[int] | None = None,
+        **kwargs
+    ):
         super().__init__()
         self.heads = heads
         self.total_heads = default(total_heads, heads)

-        slopes = Tensor(self._get_slopes(heads))
+        slopes = Tensor(default(slopes, self._get_slopes(heads)))
         slopes = rearrange(slopes, 'h -> h 1 1')
+
         self.register_buffer('slopes', slopes, persistent = False)
         self.register_buffer('bias', None, persistent = False)

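Not part of the diff: a minimal sketch of what the widened constructor allows, assuming AlibiPositionalBias is imported from x_transformers.x_transformers. A user-supplied slopes list now takes precedence over the internally derived geometric slopes.

# hedged sketch: the new `slopes` argument overrides the default slopes
import torch
from x_transformers.x_transformers import AlibiPositionalBias

default_bias = AlibiPositionalBias(heads = 8)                   # slopes derived internally for 8 heads
custom_bias  = AlibiPositionalBias(heads = 2, slopes = [1, 1])  # both heads get a slope of 1

print(custom_bias.slopes.shape)  # torch.Size([2, 1, 1]) after the rearrange in __init__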
@@ -487,7 +494,10 @@ class AlibiPositionalBias(Module):
         h, device = self.total_heads, self.device

         pos_j = default(pos_j, pos_i)
-        bias = -einx.subtract('... j, ... i -> ...
+        bias = -einx.subtract('... j, ... i -> ... i j', pos_j, pos_i).abs()
+
+        if bias.ndim == 3:
+            bias = rearrange(bias, 'b i j -> b 1 i j')

         bias = bias * self.slopes
         num_heads_unalibied = h - bias.shape[-3]
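Not part of the diff: a standalone sketch that replays the two changed lines (the same einx.subtract pattern and the ndim == 3 guard) to show the shapes involved. When pos carries no head dimension, the bias picks up a singleton head axis so it can broadcast against the per-head slopes.

# hedged sketch of what the changed lines compute: a negated |i - j| distance
# matrix, lifted to (batch, 1, i, j) when `pos` has no head dimension
import torch
import einx
from einops import rearrange

pos = torch.tensor([[0, 1, 2, 4]])                                # (batch, seq), shared across heads
bias = -einx.subtract('... j, ... i -> ... i j', pos, pos).abs()
print(bias.shape)                                                 # torch.Size([1, 4, 4])

if bias.ndim == 3:                                                # same guard as the new code
    bias = rearrange(bias, 'b i j -> b 1 i j')
print(bias.shape)                                                 # torch.Size([1, 1, 4, 4])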
@@ -1531,8 +1541,9 @@ class AttentionLayers(Module):
         use_layerscale = False,
         layerscale_init_value = 0.,
         unet_skips = False,
-        reinject_input = False,
-        add_value_residual = False,
+        reinject_input = False,   # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1
+        add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1
+        rel_pos_kwargs: dict = dict(),
         **kwargs
     ):
         super().__init__()
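As an aside on the two flags the new comments document, a hedged sketch of enabling them; the flag names come from the signature above, while the one-line behavior summaries in the comments paraphrase the cited papers rather than this diff.

# hedged sketch, assuming only the keyword arguments visible in the hunk above
import torch
from x_transformers import Decoder

decoder = Decoder(
    dim = 512,
    depth = 4,
    heads = 8,
    reinject_input = True,       # DEQ-style: feed the original input back in at each layer
    add_value_residual = True    # resformer: later layers mix in a residual of the first layer's values
)

x = torch.randn(2, 128, 512)
out = decoder(x)                 # (2, 128, 512)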
@@ -1573,14 +1584,14 @@ class AttentionLayers(Module):

         if rel_pos_bias:
             assert not flash_attn, 'flash attention not compatible with t5 relative positional bias'
-            self.rel_pos = RelativePositionBias(scale = dim_head ** 0.5, causal = causal, heads = heads, num_buckets = rel_pos_num_buckets, max_distance = rel_pos_max_distance)
+            self.rel_pos = RelativePositionBias(scale = dim_head ** 0.5, causal = causal, heads = heads, num_buckets = rel_pos_num_buckets, max_distance = rel_pos_max_distance, **rel_pos_kwargs)
         elif dynamic_pos_bias:
             assert not flash_attn, 'flash attention not compatible with dynamic positional bias'
-            self.rel_pos = DynamicPositionBias(dim = dim // 4, heads = heads, log_distance = dynamic_pos_bias_log_distance, depth = dynamic_pos_bias_mlp_depth, norm = dynamic_pos_bias_norm)
+            self.rel_pos = DynamicPositionBias(dim = dim // 4, heads = heads, log_distance = dynamic_pos_bias_log_distance, depth = dynamic_pos_bias_mlp_depth, norm = dynamic_pos_bias_norm, **rel_pos_kwargs)
         elif alibi_pos_bias:
             alibi_num_heads = default(alibi_num_heads, heads)
             assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads'
-            self.rel_pos = AlibiPositionalBias(heads = alibi_num_heads, total_heads = heads)
+            self.rel_pos = AlibiPositionalBias(heads = alibi_num_heads, total_heads = heads, **rel_pos_kwargs)

         assert at_most_one_of(sandwich_norm, resi_dual), 'either sandwich norm or resiDual is selected, but not both'
         assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm'
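Not in the diff: a minimal end-to-end sketch, using only the public API shown above, of how rel_pos_kwargs is splatted into whichever relative-position module the flags select, here ALiBi with explicit slopes.

# hedged sketch: rel_pos_kwargs passes straight through to the selected rel-pos class
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20_000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 2,
        heads = 4,
        alibi_pos_bias = True,
        rel_pos_kwargs = dict(slopes = [1, 1, 2, 2])  # ends up as AlibiPositionalBias(..., slopes = [1, 1, 2, 2])
    )
)

x = torch.randint(0, 20_000, (2, 1024))
logits = model(x)   # (2, 1024, 20_000)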