x-transformers 2.7.5.tar.gz → 2.7.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {x_transformers-2.7.5 → x_transformers-2.7.6}/PKG-INFO +1 -1
- {x_transformers-2.7.5 → x_transformers-2.7.6}/pyproject.toml +1 -1
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/x_transformers.py +7 -4
- {x_transformers-2.7.5 → x_transformers-2.7.6}/.github/FUNDING.yml +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/.github/workflows/python-publish.yml +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/.github/workflows/python-test.yaml +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/.gitignore +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/LICENSE +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/README.md +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/data/README.md +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/data/enwik8.gz +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/all-attention.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/attention-on-attention.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/cosine-sim-attention.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/deepnorm.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/dynamic-pos-bias-linear.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/dynamic-pos-bias-log.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/dynamic-pos-bias-sinusoidal.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/dynamic-pos-bias.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/enhanced-recurrence.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/fcm.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/ffglu.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/flash-attention.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/gate_values.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/gating.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/length-extrapolation-scale.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/macaron-1.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/macaron-2.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/memory-transformer.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/normformer.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/pia.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/qknorm-analysis.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/resi_dual.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/residual_attn.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/rezero.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/rotary.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/sandwich-2.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/sandwich.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/sandwich_norm.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/scalenorm.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/talking-heads.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/topk-attention.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/images/xval.png +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/tests/test_x_transformers.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_belief_state.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_copy.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_entropy_tokenizer.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_enwik8.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_length_extrapolate.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/train_parity.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/__init__.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/attend.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/belief_state_wrapper.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/continuous.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/dpo.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/entropy_based_tokenizer.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/multi_input.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/up_wrapper.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-2.7.5 → x_transformers-2.7.6}/x_transformers/xval.py +0 -0
x_transformers/x_transformers.py

@@ -2469,12 +2469,15 @@ class AttentionLayers(Module):
     ):
         # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
 
+        layer_and_layer_types = (self.layers, self.layer_types)
 
+        attn_layers = [layer for (_, layer, _), layer_type in zip(self.layers, self.layer_types) if layer_type in ('a', 'c')]
+        attn_intermeds = intermediates.attn_intermediates
+
+        assert len(attn_layers) == len(attn_intermeds)
 
+        for attn_layer, attn_inter in zip(attn_layers, attn_intermeds):
+            attn_layer.qk_clip_(attn_inter, tau = tau)
 
     def forward(
         self,

(the four lines removed from 2.7.5 are not preserved in this rendering)
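The reworked method pairs every self-attention ('a') and cross-attention ('c') layer with its recorded intermediates and calls the per-layer qk_clip_ on it, the logit clipping proposed by the Kimi team. A minimal usage sketch follows; the TransformerWrapper/Decoder configuration, the tau value, and the placement in a training step are illustrative assumptions rather than part of this diff, and any additional settings needed for the attention statistics qk clip relies on to be recorded are outside its scope.

# Illustrative sketch only -- model sizes, tau, and training-step placement are
# assumptions, not part of the 2.7.6 diff. It shows where the updated
# AttentionLayers.qk_clip_ would be invoked.
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        cross_attend = True   # the new pairing covers cross-attention ('c') layers as well as self-attention ('a')
    )
)

seq = torch.randint(0, 256, (1, 1024))
context = torch.randn(1, 512, 512)

# return the layer intermediates so qk_clip_ can pair them with the attention layers
logits, intermediates = model(seq, context = context, return_intermediates = True)

# ... loss.backward() and optimizer.step() would normally happen here ...

# apply kimi-style qk clip across the paired attention layers and their intermediates
model.attn_layers.qk_clip_(intermediates, tau = 100.)

In a MuonClip-style loop this call would typically sit right after the optimizer step, so that attention layers whose maximum logits exceeded tau during the forward pass are rescaled before the next update.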