x-transformers 1.30.4__py3-none-any.whl → 1.30.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- x_transformers/attend.py +10 -0
- x_transformers/x_transformers.py +54 -0
- {x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/METADATA +1 -1
- {x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/RECORD +7 -7
- {x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/LICENSE +0 -0
- {x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/WHEEL +0 -0
- {x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/top_level.txt +0 -0
x_transformers/attend.py
CHANGED
@@ -4,6 +4,7 @@ from functools import partial
 from typing import Tuple

 import torch
+from torch.nn import Module
 from torch import nn, einsum, Tensor
 import torch.nn.functional as F

@@ -22,6 +23,7 @@ class Intermediates:
     pre_softmax_attn: Tensor | None = None
     post_softmax_attn: Tensor | None = None
     cached_kv: Tuple[Tensor, Tensor] | None = None
+    layer_type: str | None = None

     def to_tuple(self):
         return (self.qk_similarities, self.pre_softmax_attn, self.post_softmax_attn)
@@ -81,6 +83,7 @@ class Attend(nn.Module):
         flash = False,
         logit_softclamp_value = None,
         add_zero_kv = False,
+        cope = None,
         onnxable = False,
         sdp_kwargs: dict = dict(
             enable_flash = True,
@@ -126,6 +129,10 @@ class Attend(nn.Module):

         self.logit_softclamp_value = logit_softclamp_value

+        # contextual positional encoding
+
+        self.cope = cope
+
         # flash attention

         self.flash = flash
@@ -317,6 +324,9 @@ class Attend(nn.Module):
             causal_mask = self.create_causal_mask(i, j, device = device)
             sim = sim.masked_fill(causal_mask, mask_value)

+        if exists(self.cope):
+            sim = sim + self.cope(q, sim)
+
         pre_softmax_attn = sim.clone()

         if exists(self.logit_softclamp_value):
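Taken together, the attend.py change gives Attend an optional cope module: it is stored at construction and, when present, cope(q, sim) is added to the pre-softmax logits right after causal masking. A minimal sketch of that hook contract, assuming logits shaped (batch, heads, query_len, key_len) and queries shaped (batch, heads, query_len, dim_head); ZeroPositionBias is a hypothetical stand-in for anything that could be passed as cope:

import torch
from torch.nn import Module

class ZeroPositionBias(Module):
    # hypothetical stand-in for a cope module: given the queries and the
    # pre-softmax logits, return a bias that gets added onto the logits
    def forward(self, q, sim):
        return torch.zeros_like(sim)

q   = torch.randn(1, 8, 16, 64)   # (batch, heads, seq, dim_head) -- assumed example shapes
sim = torch.randn(1, 8, 16, 16)   # pre-softmax attention logits

sim = sim + ZeroPositionBias()(q, sim)   # mirrors the new line: sim = sim + self.cope(q, sim)

Any module with that (q, sim) -> bias signature could be handed to Attend via the new cope argument; the library's own implementation is the CoPE module added below.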
x_transformers/x_transformers.py
CHANGED
@@ -304,6 +304,33 @@ class RelativePositionBias(Module):
         bias = rearrange(values, 'i j h -> h i j')
         return bias * self.scale

+class CoPE(Module):
+    """
+    Appendix B of https://arxiv.org/abs/2405.18719
+    """
+    def __init__ (self, dim, max_pos):
+        super () . __init__ ()
+        self.max_pos = max_pos
+        self.pos_emb = nn.Parameter(torch.zeros(max_pos, dim))
+
+    def forward(self, query, attn_logits):
+        # compute positions
+
+        gates = attn_logits.sigmoid()
+        pos = gates.flip(-1).cumsum(dim = -1).flip(-1)
+        pos = pos.clamp(max = self.max_pos - 1)
+
+        # interpolate from integer positions
+
+        pos_ceil = pos.ceil().long()
+        pos_floor = pos.floor().long()
+        logits_int = einsum('b h n d, p d -> b h n p', query, self.pos_emb)
+        logits_ceil = logits_int.gather(-1, pos_ceil)
+        logits_floor = logits_int.gather(-1, pos_floor)
+
+        w = pos - pos_floor
+        return logits_ceil * w + logits_floor * (1 - w)
+
 class DynamicPositionBias(Module):
     def __init__(self, dim, *, heads, depth, log_distance = False, norm = False):
         super().__init__()
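The new CoPE module is contextual position encoding per Appendix B of the paper: the attention logits are gated with a sigmoid, positions come from a reversed cumulative sum of those gates (clamped to max_pos - 1), and because those positions are fractional, the positional logits are linearly interpolated between the embeddings at the floor and ceiling integer positions. A quick shape-check sketch, assuming x-transformers 1.30.7 is installed; the sizes below are arbitrary:

import torch
from x_transformers.x_transformers import CoPE

q   = torch.randn(2, 8, 32, 64)   # queries: (batch, heads, seq, dim_head)
sim = torch.randn(2, 8, 32, 32)   # pre-softmax attention logits

cope = CoPE(dim = 64, max_pos = 16)
bias = cope(q, sim)               # interpolated positional logits, same shape as sim
assert bias.shape == sim.shape

Inside Attend this bias is simply added to sim before the softmax, so CoPE costs one extra einsum and two gathers per attention call.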
@@ -722,6 +749,8 @@ class Attention(Module):
         tensor_product = False, # https://arxiv.org/abs/2208.06061
         add_zero_kv = False, # same as add_zero_attn in pytorch
         rotary_embed_values = False,
+        use_cope = False,
+        cope_max_pos = 16,
         logit_softclamp_value = None,
         onnxable = False
     ):
@@ -753,13 +782,16 @@ class Attention(Module):
         self.to_k = nn.Linear(dim_kv, k_dim, bias = False)

         # shared key / values, for further memory savings during inference
+
         assert not (shared_kv and value_dim_head != dim_head), 'key and value head dimensions must be equal for shared key / values'
         self.to_v = nn.Linear(dim_kv, v_dim, bias = False) if not shared_kv else None

         # relations projection from tp-attention
+
         self.to_r = nn.Linear(dim, v_dim, bias = False) if tensor_product else None

         # add GLU gating for aggregated values, from alphafold2
+
         self.to_v_gate = None
         if gate_values:
             self.to_v_gate = nn.Linear(dim, out_dim)
@@ -768,6 +800,7 @@ class Attention(Module):
             nn.init.constant_(self.to_v_gate.bias, 10)

         # add per head gating of the output values, from 'Attend to nothing' paper
+
         self.to_v_head_gate = None
         if gate_value_heads:
             self.to_v_head_gate = nn.Linear(dim, heads)
@@ -775,11 +808,13 @@ class Attention(Module):
             nn.init.constant_(self.to_v_head_gate.bias, 10)

         # cosine sim attention
+
         self.qk_norm = qk_norm
         self.qk_norm_groups = qk_norm_groups
         self.qk_norm_scale = qk_norm_scale

         # whether to use the rmsnorm (equivalent to cosine sim attention when scale is equal to 1) - https://arxiv.org/abs/2302.05442
+
         self.qk_norm_dim_scale = qk_norm_dim_scale

         self.qk_norm_q_scale = self.qk_norm_k_scale = 1
@@ -790,6 +825,17 @@ class Attention(Module):
         assert (not qk_norm) or divisible_by(dim_head, qk_norm_groups), 'dimension per attention head must be divisible by the qk norm groups'
         assert not (qk_norm and (dim_head // qk_norm_groups) <= 2), 'the group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly)'

+        # contextual positional encoding
+        # https://arxiv.org/html/2405.18719v2
+
+        cope = None
+
+        if use_cope:
+            assert causal, 'CoPE was designed for causal attention'
+            assert not flash, 'CoPE is not flash attention compatible'
+
+            cope = CoPE(dim_head, cope_max_pos)
+
         # attend class - includes core attention algorithm + talking heads

         self.attend = Attend(
@@ -803,31 +849,38 @@ class Attention(Module):
             add_zero_kv = add_zero_kv,
             flash = flash,
             logit_softclamp_value = logit_softclamp_value,
+            cope = cope,
             onnxable = onnxable
         )

         # head scaling
+
         self.head_scale = head_scale
         if head_scale:
             self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1))

         # explicit topk sparse attention
+
         self.sparse_topk = sparse_topk

         # add memory key / values
+
         self.num_mem_kv = num_mem_kv
         if num_mem_kv > 0:
             self.mem_k = nn.Parameter(torch.randn(kv_heads, num_mem_kv, dim_head))
             self.mem_v = nn.Parameter(torch.randn(kv_heads, num_mem_kv, dim_head))

         # attention on attention
+
         self.attn_on_attn = on_attn
         self.to_out = nn.Sequential(nn.Linear(out_dim, dim * 2, bias = False), nn.GLU()) if on_attn else nn.Linear(out_dim, dim, bias = False)

         # whether to rotate positions into values, for absolute positions in addition to relative
+
         self.rotary_embed_values = rotary_embed_values

         # init output projection 0
+
         if zero_init_output:
             init_zero_(self.to_out)

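On the Attention side, setting use_cope = True builds a CoPE(dim_head, cope_max_pos) and hands it to Attend as the new cope argument; the asserts above require causal attention and rule out flash attention, since CoPE needs the explicit logits. A hedged construction sketch (dim and heads values are arbitrary, and the remaining constructor defaults are assumed to be compatible):

import torch
from x_transformers.x_transformers import Attention

attn = Attention(
    dim = 512,
    heads = 8,
    causal = True,     # required: CoPE was designed for causal attention
    use_cope = True,   # flash must stay off, per the assert
    cope_max_pos = 16
)

out = attn(torch.randn(1, 128, 512))   # assumed forward signature; may return (output, intermediates)

If the library's usual attn_-prefixed keyword forwarding applies, the same flags should be reachable from a Decoder as attn_use_cope = True and attn_cope_max_pos = 16, though this diff does not show that path.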
@@ -1410,6 +1463,7 @@ class AttentionLayers(Module):
             x = residual_fn(out, inner_residual)

             if layer_type in ('a', 'c') and return_hiddens:
+                inter.layer_type = layer_type
                 intermediates.append(inter)

             if layer_type == 'a' and self.residual_attn:
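Finally, each attention Intermediates record is now tagged with whether it came from a self-attention ('a') or cross-attention ('c') block via the new layer_type field. A hedged sketch of consuming that tag, assuming AttentionLayers-style modules return their intermediates when called with return_hiddens = True and expose the per-layer attention records on an attn_intermediates field (field name assumed, not shown in this diff):

import torch
from x_transformers import Decoder

layers = Decoder(dim = 512, depth = 2, heads = 8)

x = torch.randn(1, 64, 512)
out, intermediates = layers(x, return_hiddens = True)

# keep only the self-attention records, using the new tag
self_attn = [inter for inter in intermediates.attn_intermediates if inter.layer_type == 'a']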
{x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 x_transformers/__init__.py,sha256=8LQl-dNL6vj8VHRx5LMSOlRDTXQvYOuM21PDXz8WdiI,703
-x_transformers/attend.py,sha256=
+x_transformers/attend.py,sha256=ap2QkD-bRadFE9ZFQP84Lo1P2DpLOXPam24Jq9ybpPY,10903
 x_transformers/autoregressive_wrapper.py,sha256=uX8Mb0zLsQrZECt_9UGt35g7tC05Rk3nPqO6xp2FFCc,9619
 x_transformers/continuous.py,sha256=WO52n9lFAXv5-SGadi2cApGF8dkouN8QSTEOuC7erj8,6180
 x_transformers/dpo.py,sha256=LjvWgCkqTl-UuehrzQ8nkX5guLr4whYwsmm7SKSwdls,3450
 x_transformers/nonautoregressive_wrapper.py,sha256=ys_p8obc7lTeeodCqvkRKxOXQ1C9T3j5Jwr-JbVgnXk,10432
-x_transformers/x_transformers.py,sha256=
+x_transformers/x_transformers.py,sha256=r9F_LLp5bQyAlue3bBTRwoRx02noTCh4ICF8oWCw1wE,67657
 x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
 x_transformers/xval.py,sha256=QE1ltYZTR_eGgIHPP2BrMWVWVLqMW-OpDZh87BSmQEg,8563
-x_transformers-1.30.
-x_transformers-1.30.
-x_transformers-1.30.
-x_transformers-1.30.
-x_transformers-1.30.
+x_transformers-1.30.7.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-1.30.7.dist-info/METADATA,sha256=_TniCg2s6tlimpfzpWeMCsCMOjsoYwUObBiXFdY-JhA,661
+x_transformers-1.30.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+x_transformers-1.30.7.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
+x_transformers-1.30.7.dist-info/RECORD,,

{x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/LICENSE
File without changes

{x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/WHEEL
File without changes

{x_transformers-1.30.4.dist-info → x_transformers-1.30.7.dist-info}/top_level.txt
File without changes