x-transformers 2.7.4.tar.gz → 2.7.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.7.4 → x_transformers-2.7.6}/PKG-INFO +1 -1
- {x_transformers-2.7.4 → x_transformers-2.7.6}/pyproject.toml +1 -1
- {x_transformers-2.7.4 → x_transformers-2.7.6}/tests/test_x_transformers.py +16 -1
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/x_transformers.py +24 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/.github/FUNDING.yml +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/.gitignore +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/LICENSE +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/README.md +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/data/README.md +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/data/enwik8.gz +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/all-attention.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/attention-on-attention.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/deepnorm.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/fcm.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/ffglu.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/flash-attention.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/gate_values.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/gating.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/macaron-1.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/macaron-2.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/memory-transformer.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/normformer.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/pia.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/resi_dual.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/residual_attn.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/rezero.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/rotary.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/sandwich-2.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/sandwich.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/sandwich_norm.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/scalenorm.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/talking-heads.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/topk-attention.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/images/xval.png +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_belief_state.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_copy.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_enwik8.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_length_extrapolate.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/train_parity.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/__init__.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/attend.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/continuous.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/dpo.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.4 → x_transformers-2.7.6}/x_transformers/xval.py +0 -0
tests/test_x_transformers.py

@@ -1315,7 +1315,7 @@ def test_simple_mdlm(
     loss = nar(seq)
     loss.loss.backward()

-def test_qk_clip():
+def test_qk_clip_attn():
     from x_transformers import Attention

     x = torch.randn(1, 1024, 512)
@@ -1325,3 +1325,18 @@ def test_qk_clip():
     out, intermediates = attn(x, return_intermediates = True)

     attn.qk_clip_(intermediates, tau = 100)
+
+def test_qk_clip_attn_layers():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(dim = 512, depth = 2)
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    out, intermediates = model(seq, return_intermediates = True)
+
+    model.attn_qk_clip_(intermediates)
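The tau = 100 argument in these tests is the logit cap used by QK-Clip, the technique the source comments attribute to the Kimi team (it appears as part of MuonClip in the Kimi K2 technical report). The per-module Attention.qk_clip_ that the tests call already existed and is not touched by this diff, so as a rough, hedged sketch of the rule rather than the library's exact code: whenever the largest pre-softmax attention logit observed in a forward pass exceeds tau, the query and key projection weights are shrunk in place so the logits fall back under the cap.

import torch

# hedged sketch of the QK-Clip update, not x-transformers' actual qk_clip_;
# max_logit is assumed to be the largest pre-softmax attention score recorded for a head
def qk_clip_sketch_(w_q: torch.Tensor, w_k: torch.Tensor, max_logit: float, tau: float = 100.):
    # only act when the observed logit exceeds the cap
    if max_logit <= tau:
        return

    # split the factor as a square root on each side so q . k is scaled by exactly tau / max_logit
    scale = (tau / max_logit) ** 0.5

    with torch.no_grad():
        w_q.mul_(scale)
        w_k.mul_(scale)

Scaling both projections by the square root keeps the product of their scales at tau / max_logit while perturbing each weight matrix as little as possible.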
x_transformers/x_transformers.py

@@ -2462,6 +2462,23 @@ class AttentionLayers(Module):

         self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])

+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
+
+        layer_and_layer_types = (self.layers, self.layer_types)
+
+        attn_layers = [layer for (_, layer, _), layer_type in zip(self.layers, self.layer_types) if layer_type in ('a', 'c')]
+        attn_intermeds = intermediates.attn_intermediates
+
+        assert len(attn_layers) == len(attn_intermeds)
+
+        for attn_layer, attn_inter in zip(attn_layers, attn_intermeds):
+            attn_layer.qk_clip_(attn_inter, tau = tau)
+
     def forward(
         self,
         x,
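The new method walks self.layer_types, keeps only the self-attention ('a') and cross-attention ('c') blocks, and forwards each one's recorded intermediates to its qk_clip_. It can also be exercised on a bare attention stack without the token-embedding wrapper; a minimal sketch, assuming AttentionLayers' existing return_hiddens flag still hands back the LayerIntermediates that the method consumes:

import torch
from x_transformers import Decoder

layers = Decoder(dim = 512, depth = 2)

# AttentionLayers operates on embeddings, so pass a float tensor of shape (batch, seq, dim)
x = torch.randn(1, 1024, 512)

# return_hiddens = True returns the LayerIntermediates holding per-layer attention stats
out, intermediates = layers(x, return_hiddens = True)

# rescale query / key projections of every attention layer whose logits exceeded tau
layers.attn_qk_clip_(intermediates, tau = 100.)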
@@ -3192,6 +3209,13 @@ class TransformerWrapper(Module):
         if not isinstance(self.pos_emb, always):
             nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)

+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        self.attn_layers.attn_qk_clip_(intermediates, tau = tau)
+
     def forward(
         self,
         x,
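Taken together, the TransformerWrapper method simply delegates to its attention layers, so a training loop can apply QK-Clip right after each optimizer step. A sketch built only from what this diff and the new tests show; the optimizer, the random batch, and the cross-entropy loss are illustrative placeholders, not part of the package:

import torch
import torch.nn.functional as F
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 2)
)

optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)  # placeholder optimizer

for _ in range(100):
    seq = torch.randint(0, 256, (1, 1024))  # placeholder batch

    # keep the attention intermediates so their logits can be inspected after the update
    logits, intermediates = model(seq[:, :-1], return_intermediates = True)

    loss = F.cross_entropy(logits.transpose(1, 2), seq[:, 1:])
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

    # rescale query / key projections wherever attention logits exceeded tau
    model.attn_qk_clip_(intermediates, tau = 100.)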