mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/fmha/ck_splitk.py
@@ -0,0 +1,204 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+
+from .attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
+from .common import AttentionFwOpBase, check_lastdim_alignment_stride1, Context, Inputs
+from .utils.op_common import get_operator, register_operator
+
+
+@register_operator
+class FwOp(AttentionFwOpBase):
+    OPERATOR = get_operator("xformers", "efficient_attention_forward_decoder_splitk_ck")
+    SUPPORTED_DEVICES = {"cuda"}
+    SUPPORTED_DTYPES = {
+        torch.half,
+        torch.bfloat16,
+        torch.float,
+    }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
+    SUPPORTED_MAX_K = 256
+    SUPPORTED_ATTN_BIAS_TYPES: Iterable[Any] = (
+        type(None),
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    )
+    SUPPORTS_DROPOUT = False
+    SUPPORTS_CUSTOM_SCALE = True
+    SUPPORTS_BMGHK = True
+    NAME = "ck_splitKF"
+
+    SPLIT_K: Optional[int] = None
+    BLOCK_M = 16
+    BLOCK_N = 64
+
+    NUM_GROUPS = 1  # Default quantization is row-wise
+
+    @classmethod
+    def shape_not_supported_reasons(
+        cls, Mq: int, Mkv: int, K: int, Kv: int
+    ) -> List[str]:
+        reasons = super().shape_not_supported_reasons(Mq, Mkv, K, Kv)
+        # if K not in {16, 32, 64, 128}:
+        #     reasons.append(f"Embed dim {K} not supported")
+        return reasons
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(FwOp, cls).not_supported_reasons(d)
+        check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
+        if d.key.dtype != torch.int32:
+            check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
+            check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
+        if cls.OPERATOR is None:
+            reasons.append("triton is not available")
+        if d.device.type == "cuda":
+            # Has only been tested on 8.0 / 9.0.
+            if torch.cuda.get_device_capability(d.device) < (7, 0):
+                reasons.append(
+                    "requires GPU with sm80 minimum compute capability, e.g., A100/H100/L4"
+                )
+
+        q_len = d.query.shape[1]
+        if isinstance(d.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqinfo = d.attn_bias.q_seqinfo
+            if q_len != seqinfo.seqstart_py[-1]:
+                reasons.append(
+                    f"Expected total {seqinfo.seqstart_py[-1]} queries not {q_len}"
+                )
+            q_len = seqinfo.min_seqlen
+            if q_len != seqinfo.max_seqlen:
+                reasons.append(
+                    "Variable query len is not supported in the presence of causal mask."
+                )
+
+        if d.key.ndim in [4, 5] and d.key.shape[-2] != 1:
+            if d.key.stride(-2) == 0 and d.value.stride(-2) == 0 and q_len > 1:
+                reasons.append("multiquery is only supported with query seqlen=1")
+
+        if d.attn_bias is not None and q_len > 1:
+            reasons.append(
+                "query with seqlen > 1 is not supported in the presence of causal mask"
+            )
+        return reasons
+
+    @classmethod
+    def get_split_k(cls, B: int, H: int, Mk: int) -> int:
+        """Heuristic for the number of splits"""
+        bh = max(B * H, 1)  # NOTE: Handle B*h=0 case
+        split_k = max(Mk, 1024) // bh
+        max_chunk_size = 64 if Mk <= 512 and bh <= 64 else 128
+        while split_k > 0 and Mk / split_k < max_chunk_size:
+            split_k = split_k // 2
+        split_k = min(split_k, 64)
+        split_k = max(split_k, 1)
+        return split_k
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        attn_bias = inp.attn_bias
+        q, k, v = inp.get_qkv_in_bmghk()
+
+        if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            attn_bias.k_seqinfo.to(k.device)
+            attn_bias.q_seqinfo.to(q.device)
+            padding = attn_bias.k_seqinfo.padding
+            seq_positions_gpu = attn_bias.k_seqinfo.seqlen
+        else:
+            padding = k.shape[1]
+            seq_positions_gpu = None
+
+        if attn_bias is not None:
+            # key: (1, B * padding, G, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (1, B * q_seqlen, G, Hq, D)
+            multiquery = k.stride(3) == 0
+            if multiquery:
+                key = k[0, :, :, :1].unflatten(0, (-1, padding))
+                value = v[0, :, :, :1].unflatten(0, (-1, padding))
+            else:
+                key = k[0].unflatten(0, (-1, padding))
+                value = v[0].unflatten(0, (-1, padding))
+            query = q[0].unflatten(0, (key.shape[0], -1))
+        else:
+            # key: (B, padding, G, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (B, q_seqlen, G, Hq, D)
+            key = k
+            query = q
+            value = v
+
+        B, _, _, H, _ = query.shape
+        _, Mk, _, _, _ = key.shape
+
+        if cls.SPLIT_K is not None:
+            split_k = cls.SPLIT_K
+        else:
+            # Use heuristics
+            split_k = cls.get_split_k(B, H, Mk)
+
+        if inp.scale is not None:
+            qk_scale = inp.scale
+        else:
+            qk_scale = torch.rsqrt(
+                torch.tensor(k.shape[-1], dtype=torch.float32)
+            ).item()
+
+        out = cls.OPERATOR(
+            query=query,
+            key=key,
+            value=value,
+            seq_positions=seq_positions_gpu,
+            scale=qk_scale,
+            split_k=split_k,
+        )
+
+        return out, None
+
+
+class FwOp_S1(FwOp):
+    SPLIT_K = 1
+    NAME = "ck_splitK1"
+
+
+class FwOp_S2(FwOp):
+    SPLIT_K = 2
+    NAME = "ck_splitK2"
+
+
+class FwOp_S4(FwOp):
+    SPLIT_K = 4
+    NAME = "ck_splitK4"
+
+
+class FwOp_S8(FwOp):
+    SPLIT_K = 8
+    NAME = "ck_splitK8"
+
+
+class FwOp_S16(FwOp):
+    SPLIT_K = 16
+    NAME = "ck_splitK16"
+
+
+class FwOp_S32(FwOp):
+    SPLIT_K = 32
+    NAME = "ck_splitK32"
+
+
+class FwOp_S64(FwOp):
+    SPLIT_K = 64
+    NAME = "ck_splitK64"
+
+
+class FwOp_S128(FwOp):
+    SPLIT_K = 128
+    NAME = "ck_splitK128"
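For reference, the split-k heuristic in FwOp.get_split_k above can be exercised on its own. The sketch below is a minimal standalone restatement for illustration only; the helper name and the sample shapes are assumptions and are not part of the packaged module.

def split_k_heuristic(B: int, H: int, Mk: int) -> int:
    # Standalone restatement of FwOp.get_split_k, for illustration only.
    bh = max(B * H, 1)             # guard against B * H == 0
    split_k = max(Mk, 1024) // bh  # initial split count: grows with Mk, shrinks with batch*heads
    max_chunk_size = 64 if Mk <= 512 and bh <= 64 else 128
    while split_k > 0 and Mk / split_k < max_chunk_size:
        split_k //= 2              # halve until each split covers at least max_chunk_size KV rows
    return max(min(split_k, 64), 1)  # clamp to [1, 64]

# A long single-sequence decode is split aggressively; a large batch is not.
print(split_k_heuristic(1, 8, 8192))   # 64
print(split_k_heuristic(64, 16, 512))  # 1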