x-transformers 2.7.3.tar.gz → 2.7.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.7.3 → x_transformers-2.7.5}/PKG-INFO +1 -1
- {x_transformers-2.7.3 → x_transformers-2.7.5}/pyproject.toml +1 -1
- {x_transformers-2.7.3 → x_transformers-2.7.5}/tests/test_x_transformers.py +26 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/x_transformers.py +26 -3
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/FUNDING.yml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/.gitignore +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/LICENSE +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/README.md +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/data/README.md +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/data/enwik8.gz +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/all-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/attention-on-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/deepnorm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/fcm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/ffglu.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/flash-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gate_values.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/gating.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-1.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/macaron-2.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/memory-transformer.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/normformer.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/pia.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/resi_dual.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/residual_attn.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rezero.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/rotary.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich-2.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/sandwich_norm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/scalenorm.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/talking-heads.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/topk-attention.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/images/xval.png +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_belief_state.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_copy.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_enwik8.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_length_extrapolate.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/train_parity.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/__init__.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/attend.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/continuous.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/dpo.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.3 → x_transformers-2.7.5}/x_transformers/xval.py +0 -0
tests/test_x_transformers.py

```diff
@@ -1314,3 +1314,29 @@ def test_simple_mdlm(
 
     loss = nar(seq)
     loss.loss.backward()
+
+def test_qk_clip_attn():
+    from x_transformers import Attention
+
+    x = torch.randn(1, 1024, 512)
+
+    attn = Attention(dim = 512, dim_out = 384)
+
+    out, intermediates = attn(x, return_intermediates = True)
+
+    attn.qk_clip_(intermediates, tau = 100)
+
+def test_qk_clip_attn_layers():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(dim = 512, depth = 2)
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    out, intermediates = model(seq, return_intermediates = True)
+
+    model.attn_qk_clip_(intermediates)
```
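The new tests exercise a QK-Clip hook (credited in the source comments to the Kimi team). For orientation only, the sketch below illustrates the commonly cited clipping rule: if a head's maximum pre-softmax logit exceeds a threshold `tau`, its query and key projections are both scaled by the square root of `tau` over that maximum, which caps the q·k products at roughly `tau`. The tensor names and bookkeeping here are illustrative assumptions, not the library's internal code.

```python
import torch

# Illustrative only: per-head QK-Clip scale factors, assuming the standard formulation.
# `logits` stands in for pre-softmax attention scores of shape (batch, heads, i, j).
def qk_clip_scales(logits: torch.Tensor, tau: float = 100.) -> torch.Tensor:
    # maximum logit observed per head across the batch and all positions
    max_logit = logits.amax(dim = (0, -2, -1))              # (heads,)

    # eta <= 1; heads already under the threshold are left untouched
    eta = (tau / max_logit.clamp(min = 1e-6)).clamp(max = 1.)

    # scaling both W_q and W_k by sqrt(eta) rescales each q.k logit by eta
    return eta.sqrt()                                        # (heads,)

logits = torch.randn(2, 8, 128, 128) * 300                   # some heads will exceed tau
scales = qk_clip_scales(logits, tau = 100.)
print(scales.shape)                                           # torch.Size([8])
```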
x_transformers/x_transformers.py

```diff
@@ -1637,10 +1637,12 @@ class Attention(Module):
         q_weight = self.to_q.weight
         k_weight = self.to_k.weight
 
-
+        qk_dim, heads = q_weight.shape[0], qk_weight_scale.numel()
 
-
-
+        qk_weight_scale = repeat(qk_weight_scale, 'h -> (h expand)', expand = qk_dim // heads)
+
+        q_weight.mul_(qk_weight_scale)
+        k_weight.mul_(qk_weight_scale)
 
     def forward(
         self,
```
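The reworked `qk_clip_` broadcasts one scale per head across that head's block of rows in the query and key projection weights before multiplying in place. The self-contained check below demonstrates that `repeat` pattern with hypothetical shapes; it is not the library's internal code, just a verification of the row-block expansion it relies on.

```python
import torch
from einops import repeat

heads, dim_head, dim = 8, 64, 512
qk_dim = heads * dim_head                       # number of rows in the to_q / to_k weight

q_weight = torch.randn(qk_dim, dim)             # rows are grouped head-major
qk_weight_scale = torch.rand(heads)             # one clip factor per head

# 'h -> (h expand)' repeats each head's scale over its dim_head consecutive rows
row_scale = repeat(qk_weight_scale, 'h -> (h expand)', expand = qk_dim // heads)

# scale every row by its head's factor (explicit broadcast for this standalone demo)
scaled = q_weight * row_scale[:, None]

# all rows belonging to head 3 end up scaled by qk_weight_scale[3]
assert torch.allclose(scaled[3 * dim_head], q_weight[3 * dim_head] * qk_weight_scale[3])
```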
```diff
@@ -2460,6 +2462,20 @@ class AttentionLayers(Module):
 
         self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
+
+        for (_, layer, _), layer_type, attn_inter in zip(self.layers, self.layer_types, intermediates.attn_intermediates):
+
+            if layer_type not in ('a', 'c'):
+                continue
+
+            layer.qk_clip_(attn_inter, tau = tau)
+
     def forward(
         self,
         x,
```
```diff
@@ -3190,6 +3206,13 @@ class TransformerWrapper(Module):
         if not isinstance(self.pos_emb, always):
             nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        self.attn_layers.attn_qk_clip_(intermediates, tau = tau)
+
     def forward(
         self,
         x,
```
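Taken together with the new tests, the intended call pattern appears to be: run the forward pass with `return_intermediates = True`, then hand the returned `LayerIntermediates` back to `attn_qk_clip_`. The sketch below shows how this could slot into a training step; placing the clip after the optimizer update follows the Kimi QK-Clip write-up and is an assumption on my part, not something stated in this diff.

```python
import torch
import torch.nn.functional as F
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 2)
)

optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

seq = torch.randint(0, 256, (1, 1024))

# forward pass, keeping the attention intermediates around
logits, intermediates = model(seq, return_intermediates = True)

# next-token prediction loss
loss = F.cross_entropy(logits[:, :-1].transpose(1, 2), seq[:, 1:])
loss.backward()

optimizer.step()
optimizer.zero_grad()

# rescale q / k projection weights on any head whose logits ran too hot
model.attn_qk_clip_(intermediates, tau = 100.)
```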