x-transformers 2.7.3.tar.gz → 2.7.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.7.3 → x_transformers-2.7.5}/PKG-INFO +1 -1
- {x_transformers-2.7.3 → x_transformers-2.7.5}/pyproject.toml +1 -1
- {x_transformers-2.7.3 → x_transformers-2.7.5}/tests/test_x_transformers.py +26 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/x_transformers.py +26 -3
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/FUNDING.yml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.gitignore +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/LICENSE +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/README.md +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/data/README.md +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/data/enwik8.gz +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/all-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/attention-on-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/deepnorm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/fcm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/ffglu.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/flash-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gate_values.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gating.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-1.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-2.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/memory-transformer.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/normformer.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/pia.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/resi_dual.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/residual_attn.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rezero.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rotary.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich-2.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich_norm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/scalenorm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/talking-heads.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/topk-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/xval.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_belief_state.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_copy.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_enwik8.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_length_extrapolate.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_parity.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/__init__.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/attend.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/continuous.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/dpo.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xval.py +0 -0
tests/test_x_transformers.py

```diff
@@ -1314,3 +1314,29 @@ def test_simple_mdlm(
 
     loss = nar(seq)
     loss.loss.backward()
+
+def test_qk_clip_attn():
+    from x_transformers import Attention
+
+    x = torch.randn(1, 1024, 512)
+
+    attn = Attention(dim = 512, dim_out = 384)
+
+    out, intermediates = attn(x, return_intermediates = True)
+
+    attn.qk_clip_(intermediates, tau = 100)
+
+def test_qk_clip_attn_layers():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(dim = 512, depth = 2)
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    out, intermediates = model(seq, return_intermediates = True)
+
+    model.attn_qk_clip_(intermediates)
```
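The new tests exercise a QK-Clip hook (credited in the source comments to the Kimi team). For orientation only, the sketch below illustrates the commonly cited clipping rule: if a head's maximum pre-softmax logit exceeds a threshold `tau`, its query and key projections are both scaled by the square root of `tau` over that maximum, which caps the q·k products at roughly `tau`. The tensor names and bookkeeping here are illustrative assumptions, not the library's internal code.

```python
import torch

# Illustrative only: per-head QK-Clip scale factors, assuming the standard formulation.
# `logits` stands in for pre-softmax attention scores of shape (batch, heads, i, j).
def qk_clip_scales(logits: torch.Tensor, tau: float = 100.) -> torch.Tensor:
    # maximum logit observed per head across the batch and all positions
    max_logit = logits.amax(dim = (0, -2, -1))              # (heads,)

    # eta <= 1; heads already under the threshold are left untouched
    eta = (tau / max_logit.clamp(min = 1e-6)).clamp(max = 1.)

    # scaling both W_q and W_k by sqrt(eta) rescales each q.k logit by eta
    return eta.sqrt()                                        # (heads,)

logits = torch.randn(2, 8, 128, 128) * 300                   # some heads will exceed tau
scales = qk_clip_scales(logits, tau = 100.)
print(scales.shape)                                           # torch.Size([8])
```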
x_transformers/x_transformers.py

```diff
@@ -1637,10 +1637,12 @@ class Attention(Module):
         q_weight = self.to_q.weight
         k_weight = self.to_k.weight
 
-
+        qk_dim, heads = q_weight.shape[0], qk_weight_scale.numel()
 
-
-
+        qk_weight_scale = repeat(qk_weight_scale, 'h -> (h expand)', expand = qk_dim // heads)
+
+        q_weight.mul_(qk_weight_scale)
+        k_weight.mul_(qk_weight_scale)
 
     def forward(
         self,
```
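The reworked `qk_clip_` broadcasts one scale per head across that head's block of rows in the query and key projection weights before multiplying in place. The self-contained check below demonstrates that `repeat` pattern with hypothetical shapes; it is not the library's internal code, just a verification of the row-block expansion it relies on.

```python
import torch
from einops import repeat

heads, dim_head, dim = 8, 64, 512
qk_dim = heads * dim_head                       # number of rows in the to_q / to_k weight

q_weight = torch.randn(qk_dim, dim)             # rows are grouped head-major
qk_weight_scale = torch.rand(heads)             # one clip factor per head

# 'h -> (h expand)' repeats each head's scale over its dim_head consecutive rows
row_scale = repeat(qk_weight_scale, 'h -> (h expand)', expand = qk_dim // heads)

# scale every row by its head's factor (explicit broadcast for this standalone demo)
scaled = q_weight * row_scale[:, None]

# all rows belonging to head 3 end up scaled by qk_weight_scale[3]
assert torch.allclose(scaled[3 * dim_head], q_weight[3 * dim_head] * qk_weight_scale[3])
```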
```diff
@@ -2460,6 +2462,20 @@ class AttentionLayers(Module):
 
         self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
+
+        for (_, layer, _), layer_type, attn_inter in zip(self.layers, self.layer_types, intermediates.attn_intermediates):
+
+            if layer_type not in ('a', 'c'):
+                continue
+
+            layer.qk_clip_(attn_inter, tau = tau)
+
     def forward(
         self,
         x,
```
```diff
@@ -3190,6 +3206,13 @@ class TransformerWrapper(Module):
         if not isinstance(self.pos_emb, always):
             nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        self.attn_layers.attn_qk_clip_(intermediates, tau = tau)
+
     def forward(
         self,
         x,
```
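Taken together with the new tests, the intended call pattern appears to be: run the forward pass with `return_intermediates = True`, then hand the returned `LayerIntermediates` back to `attn_qk_clip_`. The sketch below shows how this could slot into a training step; placing the clip after the optimizer update follows the Kimi QK-Clip write-up and is an assumption on my part, not something stated in this diff.

```python
import torch
import torch.nn.functional as F
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 2)
)

optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

seq = torch.randint(0, 256, (1, 1024))

# forward pass, keeping the attention intermediates around
logits, intermediates = model(seq, return_intermediates = True)

# next-token prediction loss
loss = F.cross_entropy(logits[:, :-1].transpose(1, 2), seq[:, 1:])
loss.backward()

optimizer.step()
optimizer.zero_grad()

# rescale q / k projection weights on any head whose logits ran too hot
model.attn_qk_clip_(intermediates, tau = 100.)
```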