mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
@@ -0,0 +1,533 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+
+ from enum import IntEnum
+ from typing import Any, Optional
+
+ import mslk.attention.cutlass_blackwell_fmha  # noqa: F401
+ import torch
+
+
+ class GenKernelType(IntEnum):
+     UMMA_I = 0
+     UMMA_P = 1
+
+
+ def get_splitk_heuristic(
+     batch: int,
+     seqlen_kv: int,
+     kv_heads: int = 1,
+     tile_n: int = 256,
+     sm_count: int | None = None,
+ ) -> int:
+     """
+     Compute optimal split-K size for Shape<64, 256, 128> tile configuration.
+
+     Targets full GPU utilization by distributing work across all SMs.
+     First calculates SMs per batch, then per kv_head, then divides seqlen_kv by that number.
+     Rounds the split size up to a multiple of tile_n so every CTA processes whole tiles.
+     Returns 0 (no split) when the split would equal seqlen_kv (only 1 split).
+
+     Args:
+         batch: Batch size
+         seqlen_kv: Maximum sequence length for K/V
+         kv_heads: Number of KV heads (default 1 for MQA)
+         tile_n: TileN dimension (default 256 for Shape<64, 256, 128>)
+         sm_count: Number of SMs on the GPU. If None, queries the current device.
+
+     Returns:
+         Optimal split size along the K/V sequence dimension, or 0 to disable split-K
+     """
+     # Get SM count from current device if not provided
+     if sm_count is None:
+         sm_count = torch.cuda.get_device_properties(
+             torch.cuda.current_device()
+         ).multi_processor_count
+
+     # Calculate number of SMs available per batch element
+     sms_per_batch = max(1, sm_count // batch)
+     # Further divide by kv_heads for multi-head KV
+     sms_per_head_batch = max(1, sms_per_batch // kv_heads)
+
+     # Each (batch, kv_head) element should have sms_per_head_batch splits
+     # So split size = seqlen_kv / sms_per_head_batch
+     ideal_split = seqlen_kv // sms_per_head_batch
+
+     # Round up to multiple of tile_n
+     split = ((ideal_split + tile_n - 1) // tile_n) * tile_n
+
+     # Clamp to valid range: [tile_n, seqlen_kv]
+     split = max(split, tile_n)
+     split = min(split, seqlen_kv)
+
+     # If split equals seqlen_kv, there's only 1 split - disable split-K
+     if split == seqlen_kv:
+         split = 0
+
+     return split
+
+
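With the function above in scope, a few worked values of the heuristic (sm_count=148 is passed explicitly here purely for illustration, so the results do not depend on the local device):

    assert get_splitk_heuristic(batch=2, seqlen_kv=8192, sm_count=148) == 256     # 74 SMs per batch -> many small splits
    assert get_splitk_heuristic(batch=64, seqlen_kv=4096, sm_count=148) == 2048   # 2 SMs per batch -> 2 splits each
    assert get_splitk_heuristic(batch=256, seqlen_kv=4096, sm_count=148) == 0     # split would equal seqlen_kv -> disabled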
+ def maybe_contiguous(x: torch.Tensor) -> torch.Tensor:
+     """
+     Only the head dim needs to be contiguous (and the second-to-last stride a
+     multiple of 8); otherwise copy the tensor to a contiguous layout.
+     """
+     return (
+         x.contiguous()
+         if x is not None and (x.stride(-1) != 1 or x.stride(-2) % 8 != 0)
+         else x
+     )
+
+
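A small behavioral sketch of the stride check above (assumes maybe_contiguous is in scope):

    import torch

    x = torch.empty(2, 4, 128, 64)       # stride(-1) == 1, stride(-2) == 64 (a multiple of 8)
    assert maybe_contiguous(x) is x      # returned untouched
    xt = x.transpose(-1, -2)             # stride(-1) == 64, so the head dim is no longer contiguous
    assert maybe_contiguous(xt).is_contiguous()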
+ def _cutlass_blackwell_fmha_forward(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     cu_seqlens_q: torch.Tensor | None = None,
+     cu_seqlens_k: torch.Tensor | None = None,
+     max_seq_len_q: int | None = None,
+     max_seq_len_k: int | None = None,
+     softmax_scale: float | None = None,
+     causal: bool = False,
+     seqlen_kv: torch.Tensor | None = None,
+     page_table: torch.Tensor | None = None,
+     seqlen_k: int | None = None,
+     window_left: int = -1,
+     window_right: int = -1,
+     bottom_right: bool = True,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     q = maybe_contiguous(q)
+     k = maybe_contiguous(k)
+     v = maybe_contiguous(v)
+     return torch.ops.mslk.fmha_fwd(
+         q,
+         k,
+         v,
+         cu_seqlens_q=cu_seqlens_q,
+         cu_seqlens_k=cu_seqlens_k,
+         max_seq_len_q=max_seq_len_q,
+         max_seq_len_k=max_seq_len_k,
+         softmax_scale=softmax_scale,
+         causal=causal,
+         seqlen_kv=seqlen_kv,
+         page_table=page_table,
+         seqlen_k=seqlen_k,
+         window_size_left=window_left,
+         window_size_right=window_right,
+         bottom_right=bottom_right,
+     )
+
+
+ def _cutlass_blackwell_fmha_backward(
+     dout: torch.Tensor,
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     out: torch.Tensor,
+     softmax_lse: torch.Tensor,
+     cu_seqlens_q: torch.Tensor | None = None,
+     cu_seqlens_k: torch.Tensor | None = None,
+     max_seq_len_q: int | None = None,
+     max_seq_len_k: int | None = None,
+     softmax_scale: float | None = None,
+     causal: bool = False,
+     window_left: int = -1,
+     window_right: int = -1,
+     bottom_right: bool = True,
+     deterministic: bool = False,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     deterministic = deterministic or torch.are_deterministic_algorithms_enabled()
+     dout = maybe_contiguous(dout)
+     q = maybe_contiguous(q)
+     k = maybe_contiguous(k)
+     v = maybe_contiguous(v)
+     out = maybe_contiguous(out)
+     return torch.ops.mslk.fmha_bwd(
+         dout,
+         q,
+         k,
+         v,
+         out,
+         softmax_lse,
+         cu_seqlens_q=cu_seqlens_q,
+         cu_seqlens_k=cu_seqlens_k,
+         max_seq_len_q=max_seq_len_q,
+         max_seq_len_k=max_seq_len_k,
+         softmax_scale=softmax_scale,
+         causal=causal,
+         window_size_left=window_left,
+         window_size_right=window_right,
+         bottom_right=bottom_right,
+         deterministic=deterministic,
+     )
+
+
+ def _validate_and_adjust_split_k_size(split_k_size: int) -> int:
+     """
+     Validate and adjust the split_k_size parameter for optimal performance.
+
+     Args:
+         split_k_size: The requested split size along the K/V sequence dimension.
+
+     Returns:
+         Adjusted split_k_size that is valid for the kernel.
+
+     Valid values:
+     - split_k_size <= 0: Disable split-K (no splitting)
+     - split_k_size > 0: Enable split-K with the specified split size
+     """
+     if not isinstance(split_k_size, int):
+         raise TypeError(
+             f"split_k_size must be an integer, got {type(split_k_size).__name__}"
+         )
+
+     # If split-K is disabled, return as-is
+     if split_k_size <= 0:
+         return split_k_size
+
+     # Constants
+     MIN_RECOMMENDED_SPLIT_SIZE = 256
+     TILE_SIZE = 128
+
+     # Adjust if split_k_size is too small
+     if split_k_size < MIN_RECOMMENDED_SPLIT_SIZE:
+         split_k_size = MIN_RECOMMENDED_SPLIT_SIZE
+
+     # Check if split_k_size is a power of 2
+     is_power_of_2 = (split_k_size & (split_k_size - 1)) == 0
+
+     # If not a power of 2, round up to the next multiple of the tile size (128)
+     if not is_power_of_2:
+         split_k_size = ((split_k_size + TILE_SIZE - 1) // TILE_SIZE) * TILE_SIZE
+
+     return split_k_size
+
+
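A few input/output pairs for the adjustment above (pure integer arithmetic, reproducible directly from the code):

    assert _validate_and_adjust_split_k_size(0) == 0      # <= 0: split-K stays disabled
    assert _validate_and_adjust_split_k_size(100) == 256  # below the recommended minimum -> bumped to 256
    assert _validate_and_adjust_split_k_size(512) == 512  # power of two -> kept as-is
    assert _validate_and_adjust_split_k_size(300) == 384  # not a power of two -> rounded up to a multiple of 128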
+ def _validate_decode_inputs(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     seqlen_kv: torch.Tensor | None,
+ ) -> None:
+     assert seqlen_kv is not None, "seqlen_kv must be provided for decode"
+     tensors = {"q": q, "k": k, "v": v, "seqlen_kv": seqlen_kv}
+
+     for name, tensor in tensors.items():
+         # assert tensor.is_contiguous(), f"{name} is not contiguous"
+         assert tensor.is_cuda, f"{name} must be on GPU"
+
+
+ def _prepare_decode_inputs(
+     q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, bool, tuple[int, ...]]:
+     """
+     Prepare inputs for the decode kernel by handling both varlen and batch formats.
+
+     Returns:
+         - Reshaped q, k, v tensors in batch format [B, 1, H, D]
+         - batch_size
+         - needs_reshape_output flag
+         - original_shape of q
+     """
+     original_shape = tuple(q.shape)
+     needs_reshape_output = False
+     batch_size = q.shape[0]
+
+     if q.dim() == 3:
+         # Varlen format: [total_queries, num_heads, head_dim]
+         q = q.view(batch_size, 1, q.shape[1], q.shape[2])
+         needs_reshape_output = True
+
+     if q.dim() != 4:
+         raise ValueError(
+             f"Invalid query shape: {q.shape}. Expected [B, 1, H, D] or [total_queries, H, D]"
+         )
+     assert q.shape[1] == 1, "Kernel requires sq=1"
+
+     k = k.view(batch_size, -1, k.shape[1], k.shape[2]) if k.dim() == 3 else k
+     v = v.view(batch_size, -1, v.shape[1], v.shape[2]) if v.dim() == 3 else v
+
+     return q, k, v, batch_size, needs_reshape_output, original_shape
+
+
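A shape-only illustration of the varlen-to-batch conversion (values are irrelevant; assumes the helper above is in scope):

    import torch

    q = torch.empty(8, 16, 128)        # varlen decode query: [total_queries == B, H, D]
    k = torch.empty(8, 4096, 2, 128)   # already batched:     [B, Sk, H_kv, D]
    v = torch.empty(8, 4096, 2, 128)
    q4, k4, v4, bsz, reshaped, orig = _prepare_decode_inputs(q, k, v)
    assert q4.shape == (8, 1, 16, 128) and reshaped and orig == (8, 16, 128)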
+ def cutlass_blackwell_fmha_decode_forward(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     seqlen_kv: torch.Tensor | None = None,
+     cu_seqlens_q: torch.Tensor | None = None,
+     cu_seqlens_k: torch.Tensor | None = None,
+     max_seq_len_q: int | None = None,
+     max_seq_len_k: int | None = None,
+     softmax_scale: float | None = None,
+     causal: bool = False,
+     window_left: int = -1,
+     window_right: int = -1,
+     bottom_right: bool = True,
+     split_k_size: int = 0,
+     use_heuristic: bool = True,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Decode-optimized forward pass using the gen kernel.
+     This is a wrapper around the gen kernel, which is optimized
+     for decode (query length = 1).
+
+     This function is called externally by xformers ops.
+
+     Accepts inputs in two formats:
+     - Varlen format: [total_queries, num_heads, head_dim] (3D)
+     - Batch format: [batch_size, 1, num_heads, head_dim] (4D)
+
+     Args:
+         q: Query tensor in varlen [total_queries, H, D] or batch [B, 1, H, D] format
+         k: Key tensor [B, Sk, H_kv, D]
+         v: Value tensor [B, Sk, H_kv, D]
+         seqlen_kv: Per-batch sequence lengths [B] (required)
+         split_k_size: Size of each split along the K/V sequence dimension.
+             - split_k_size <= 0 with use_heuristic=True: auto-compute using the heuristic
+             - split_k_size <= 0 with use_heuristic=False: disable split-K
+             - split_k_size > 0: use the provided split size directly
+             Values below 256 are adjusted to 256. Non-power-of-2 values
+             are rounded up to the next multiple of 128.
+         use_heuristic: If True and split_k_size <= 0, automatically compute the optimal
+             split size using the heuristic. Default is True.
+
+     Returns:
+         Kernel output with a Q dimension added:
+         - out: [B, 1, H, num_splits, D] (num_splits=1 when split-K is disabled)
+         - lse: [B, num_splits, H, 1]
+     """
+     _validate_decode_inputs(q, k, v, seqlen_kv)
+
+     # Prepare inputs and handle format conversion
+     q, k, v, batch_size, _, original_shape = _prepare_decode_inputs(q, k, v)
+
+     # Determine effective split_k_size
+     if split_k_size <= 0 and use_heuristic:
+         # Auto-compute using heuristic
+         max_seqlen_kv = k.shape[1]
+         kv_heads = k.shape[2]  # K shape is [B, Sk, H_kv, D]
+         split_k_size = get_splitk_heuristic(batch_size, max_seqlen_kv, kv_heads)
+
+     # Validate and adjust split_k_size
+     split_k_size = _validate_and_adjust_split_k_size(split_k_size)
+
+     # Validate window_right: decode kernel only supports causal attention (window_right <= 0)
+     if window_right > 0:
+         raise ValueError(
+             f"window_right={window_right} is not supported for decode attention. "
+             "The decode kernel only supports causal attention with window_right <= 0. "
+             "Use window_right=0 (causal, current position only)."
+         )
+
+     # Call the gen kernel (optimized for decode)
+     # Note: window_left specifies how many tokens to look back (exclusive)
+     # The kernel will attend to positions [seqlen_kv - window_left, seqlen_kv)
+     out, lse = torch.ops.mslk.fmha_gen_fwd(
+         q,
+         k,
+         v,
+         seqlen_kv,
+         None,
+         kernel_type=GenKernelType.UMMA_I,
+         window_left=window_left,
+         window_right=0,
+         split_k_size=split_k_size,
+     )
+
+     # Kernel returns: out [B, H, num_splits, D], lse [B, num_splits, H]
+     # Reshape to a consistent format with the Q dimension:
+     # out: [B, H, num_splits, D] -> [B, 1, H, num_splits, D]
+     # lse: [B, num_splits, H] -> [B, num_splits, H, 1]
+     out = out.unsqueeze(1)  # [B, 1, H, num_splits, D]
+     lse = lse.unsqueeze(-1)  # [B, num_splits, H, 1]
+     return out, lse
+
+
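Putting the pieces together, a minimal decode call might look like the sketch below. It assumes the mslk extension is loaded on a supported (Blackwell) GPU; the bfloat16/int32 dtypes are illustrative assumptions rather than requirements stated in this file.

    import torch

    B, H, H_kv, Sk, D = 4, 8, 1, 8192, 128
    q = torch.randn(B, 1, H, D, device="cuda", dtype=torch.bfloat16)
    k = torch.randn(B, Sk, H_kv, D, device="cuda", dtype=torch.bfloat16)
    v = torch.randn(B, Sk, H_kv, D, device="cuda", dtype=torch.bfloat16)
    seqlen_kv = torch.full((B,), Sk, device="cuda", dtype=torch.int32)

    out, lse = cutlass_blackwell_fmha_decode_forward(q, k, v, seqlen_kv=seqlen_kv)
    # Per the docstring: out is [B, 1, H, num_splits, D], lse is [B, num_splits, H, 1]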
+ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
+     @staticmethod
+     def forward(  # type: ignore
+         ctx,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         softmax_scale: float | None = None,
+         causal: bool = False,
+         cu_seqlens_q: Optional[torch.Tensor] = None,
+         cu_seqlens_k: Optional[torch.Tensor] = None,
+         max_seq_len_q: Optional[int] = None,
+         max_seq_len_k: Optional[int] = None,
+         seqlen_kv: Optional[torch.Tensor] = None,
+         page_table: Optional[torch.Tensor] = None,
+         seqlen_k: Optional[int] = None,
+         window_size: tuple[int, int] = (-1, -1),
+         bottom_right: bool = True,
+         deterministic: bool = False,
+     ) -> torch.Tensor:
+         window_left, window_right = window_size
+         # Check if this is generation phase (sq = 1)
+         sq = q.shape[1]
+         if q.dim() == 4 and sq == 1:
+             # For gen case, we don't need to save tensors for backward
+             ctx.is_gen = True
+             out, _ = cutlass_blackwell_fmha_decode_forward(
+                 q,
+                 k,
+                 v,
+                 seqlen_kv,
+                 cu_seqlens_q,
+                 cu_seqlens_k,
+                 max_seq_len_q,
+                 max_seq_len_k,
+                 softmax_scale,
+                 causal,
+                 window_left,
+                 window_right,
+                 bottom_right,
+             )
+             return out
+
+         ctx.is_gen = False
+         # Only check dtype if cu_seqlens_q and cu_seqlens_k are provided
+         if cu_seqlens_q is not None and cu_seqlens_k is not None:
+             assert (
+                 cu_seqlens_q.dtype == torch.int32
+                 and cu_seqlens_q.dtype == cu_seqlens_k.dtype
+             ), "cu_seqlens_q and cu_seqlens_k must be int32"
+
+         # handle window_size
+         if causal and window_left >= 0:
+             window_right = 0
+         # Use regular FMHA for non-generation case
+         out, softmax_lse = _cutlass_blackwell_fmha_forward(
+             q,
+             k,
+             v,
+             cu_seqlens_q,
+             cu_seqlens_k,
+             max_seq_len_q,
+             max_seq_len_k,
+             softmax_scale,
+             causal,
+             seqlen_kv,
+             page_table,
+             seqlen_k,
+             window_left,
+             window_right,
+             bottom_right,
+         )
+         ctx.save_for_backward(q, k, v, out, softmax_lse)
+         ctx.softmax_scale = softmax_scale
+         ctx.causal = causal
+         ctx.window_size = window_size
+         ctx.max_seq_len_q = max_seq_len_q
+         ctx.max_seq_len_k = max_seq_len_k
+         ctx.cu_seqlens_q = cu_seqlens_q
+         ctx.cu_seqlens_k = cu_seqlens_k
+         ctx.bottom_right = bottom_right
+         ctx.deterministic = deterministic
+         return out
+
+     @staticmethod
+     def backward(
+         ctx, dout: torch.Tensor, *args: Any
+     ) -> tuple[  # type: ignore
+         torch.Tensor,
+         torch.Tensor,
+         torch.Tensor,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+         None,
+     ]:
+         if ctx.is_gen:
+             # For gen case, no backward pass is needed (generation is inference only)
+             raise RuntimeError(
+                 "Backward pass is not supported for generation phase (sq=1)"
+             )
+
+         q, k, v, out, softmax_lse = ctx.saved_tensors
+         window_left, window_right = ctx.window_size
+         dq, dk, dv = _cutlass_blackwell_fmha_backward(
+             dout,
+             q,
+             k,
+             v,
+             out,
+             softmax_lse,
+             ctx.cu_seqlens_q,
+             ctx.cu_seqlens_k,
+             ctx.max_seq_len_q,
+             ctx.max_seq_len_k,
+             ctx.softmax_scale,
+             ctx.causal,
+             window_left,
+             window_right,
+             bottom_right=ctx.bottom_right,
+             deterministic=ctx.deterministic,
+         )
+         return (
+             dq,
+             dk,
+             dv,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+         )
+
+
+ def cutlass_blackwell_fmha_func(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     softmax_scale: float | None = None,
+     causal: bool = False,
+     cu_seqlens_q: torch.Tensor | None = None,
+     cu_seqlens_k: torch.Tensor | None = None,
+     max_seq_len_q: int | None = None,
+     max_seq_len_k: int | None = None,
+     seqlen_kv: torch.Tensor | None = None,
+     page_table: torch.Tensor | None = None,
+     seqlen_k: int | None = None,
+     window_size: tuple[int, int] | None = (-1, -1),
+     bottom_right: bool = True,
+     deterministic: bool = False,
+ ):
+     return CutlassBlackwellFmhaFunc.apply(
+         q,
+         k,
+         v,
+         softmax_scale,
+         causal,
+         cu_seqlens_q,
+         cu_seqlens_k,
+         max_seq_len_q,
+         max_seq_len_k,
+         seqlen_kv,
+         page_table,
+         seqlen_k,
+         window_size,
+         bottom_right,
+         deterministic,
+     )
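cutlass_blackwell_fmha_func is the user-facing entry point wrapping the autograd.Function above. A hedged end-to-end sketch (the import path follows the file list at the top of this diff; a Blackwell GPU and bfloat16 inputs are assumptions for illustration):

    import torch
    from mslk.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
        cutlass_blackwell_fmha_func,
    )

    B, S, H, D = 2, 1024, 16, 128
    q = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    k = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    v = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    out = cutlass_blackwell_fmha_func(q, k, v, causal=True)  # forward via torch.ops.mslk.fmha_fwd
    out.sum().backward()                                     # gradients via torch.ops.mslk.fmha_bwd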
@@ -0,0 +1,22 @@
+ # @nolint # fbcode
+ """Flash Attention CUTE (CUDA Template Engine) implementation."""
+
+ __version__ = "0.1.0"
+
+ import cutlass.cute as cute
+
+ from .interface import (
+     flash_attn_func,
+     flash_attn_varlen_func,
+ )
+
+ from mslk.attention.flash_attn.cute_dsl_utils import cute_compile_patched
+
+ # Patch cute.compile to optionally dump SASS
+ cute.compile = cute_compile_patched
+
+
+ __all__ = [
+     "flash_attn_func",
+     "flash_attn_varlen_func",
+ ]
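Because the patch is applied at import time, importing the subpackage is enough to swap cute.compile for cute_compile_patched; the two public entry points are then used as plain functions:

    from mslk.attention.flash_attn import flash_attn_func, flash_attn_varlen_func  # applies the cute.compile patch as a side effect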
@@ -0,0 +1,104 @@
+ # @nolint # fbcode
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Type, Callable, Optional
+
+ import cutlass
+ import cutlass.cute as cute
+
+
+ def get_smem_layout_atom(dtype: Type[cutlass.Numeric], k_dim: int) -> cute.ComposedLayout:
+     dtype_byte = cutlass.const_expr(dtype.width // 8)
+     bytes_per_row = cutlass.const_expr(k_dim * dtype_byte)
+     smem_k_block_size = (
+         cutlass.const_expr(
+             128
+             if bytes_per_row % 128 == 0
+             else (64 if bytes_per_row % 64 == 0 else (32 if bytes_per_row % 32 == 0 else 16))
+         )
+         // dtype_byte
+     )
+     swizzle_bits = (
+         4
+         if smem_k_block_size == 128
+         else (3 if smem_k_block_size == 64 else (2 if smem_k_block_size == 32 else 1))
+     )
+     swizzle_base = 2 if dtype_byte == 4 else (3 if dtype_byte == 2 else 4)
+     return cute.make_composed_layout(
+         cute.make_swizzle(swizzle_bits, swizzle_base, swizzle_base),
+         0,
+         cute.make_ordered_layout(
+             (8 if cutlass.const_expr(k_dim % 32 == 0) else 16, smem_k_block_size), order=(1, 0)
+         ),
+     )
+
+
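For concreteness: with bf16 (2-byte) elements and k_dim=128, bytes_per_row is 256, so the 128-byte branch is taken, smem_k_block_size = 128 // 2 = 64 elements, and swizzle_bits = 3. A plain-Python mirror of that selection (illustration only; the real function returns cute layout objects):

    def smem_atom_params(dtype_bytes: int, k_dim: int) -> tuple[int, int]:
        # Same block-size / swizzle-bit selection as get_smem_layout_atom above.
        bytes_per_row = k_dim * dtype_bytes
        block_bytes = (
            128 if bytes_per_row % 128 == 0
            else 64 if bytes_per_row % 64 == 0
            else 32 if bytes_per_row % 32 == 0
            else 16
        )
        smem_k_block_size = block_bytes // dtype_bytes
        swizzle_bits = {128: 4, 64: 3, 32: 2}.get(smem_k_block_size, 1)
        return smem_k_block_size, swizzle_bits

    assert smem_atom_params(2, 128) == (64, 3)    # bf16/fp16, head dim 128
    assert smem_atom_params(1, 128) == (128, 4)   # fp8, head dim 128
    assert smem_atom_params(4, 32) == (32, 2)     # fp32, k_dim 32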
+ @cute.jit
+ def gemm(
+     tiled_mma: cute.TiledMma,
+     acc: cute.Tensor,
+     tCrA: cute.Tensor,
+     tCrB: cute.Tensor,
+     tCsA: cute.Tensor,
+     tCsB: cute.Tensor,
+     smem_thr_copy_A: cute.TiledCopy,
+     smem_thr_copy_B: cute.TiledCopy,
+     hook_fn: Optional[Callable] = None,
+     A_in_regs: cutlass.Constexpr[bool] = False,
+     B_in_regs: cutlass.Constexpr[bool] = False,
+     swap_AB: cutlass.Constexpr[bool] = False,
+ ) -> None:
+     if cutlass.const_expr(swap_AB):
+         gemm(
+             tiled_mma,
+             acc,
+             tCrB,
+             tCrA,
+             tCsB,
+             tCsA,
+             smem_thr_copy_B,
+             smem_thr_copy_A,
+             hook_fn,
+             A_in_regs=B_in_regs,
+             B_in_regs=A_in_regs,
+             swap_AB=False,
+         )
+     else:
+         tCrA_copy_view = smem_thr_copy_A.retile(tCrA)
+         tCrB_copy_view = smem_thr_copy_B.retile(tCrB)
+         if cutlass.const_expr(not A_in_regs):
+             cute.copy(smem_thr_copy_A, tCsA[None, None, 0], tCrA_copy_view[None, None, 0])
+         if cutlass.const_expr(not B_in_regs):
+             cute.copy(smem_thr_copy_B, tCsB[None, None, 0], tCrB_copy_view[None, None, 0])
+         for k in cutlass.range_constexpr(cute.size(tCsA.shape[2])):
+             if k < cute.size(tCsA.shape[2]) - 1:
+                 if cutlass.const_expr(not A_in_regs):
+                     cute.copy(
+                         smem_thr_copy_A, tCsA[None, None, k + 1], tCrA_copy_view[None, None, k + 1]
+                     )
+                 if cutlass.const_expr(not B_in_regs):
+                     cute.copy(
+                         smem_thr_copy_B, tCsB[None, None, k + 1], tCrB_copy_view[None, None, k + 1]
+                     )
+             cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
+             if cutlass.const_expr(k == 0 and hook_fn is not None):
+                 hook_fn()
+
+
+ @cute.jit
+ def gemm_rs(
+     tiled_mma: cute.TiledMma,
+     acc: cute.Tensor,
+     tCrA: cute.Tensor,
+     tCrB: cute.Tensor,
+     tCsB: cute.Tensor,
+     smem_thr_copy_B: cute.TiledCopy,
+     hook_fn: Optional[Callable] = None,
+ ) -> None:
+     tCrB_copy_view = smem_thr_copy_B.retile(tCrB)
+     cute.copy(smem_thr_copy_B, tCsB[None, None, 0], tCrB_copy_view[None, None, 0])
+     for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
+         if cutlass.const_expr(k < cute.size(tCrA.shape[2]) - 1):
+             cute.copy(smem_thr_copy_B, tCsB[None, None, k + 1], tCrB_copy_view[None, None, k + 1])
+         cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
+         if cutlass.const_expr(k == 0 and hook_fn is not None):
+             hook_fn()
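Both loops above follow the same software-pipelining pattern: the shared-memory-to-register copy for slice k+1 is issued before the MMA on slice k, so the copy latency is hidden behind the math. A schematic NumPy analogue of that k-loop (shape-level only; it does not model the real smem/register dataflow or the hook_fn call point):

    import numpy as np

    def gemm_k_loop_reference(sA: np.ndarray, sB: np.ndarray, acc: np.ndarray) -> np.ndarray:
        # sA: (M, Kin, num_k), sB: (Kin, N, num_k), acc: (M, N)
        num_k = sA.shape[-1]
        rA = [None] * num_k
        rB = [None] * num_k
        rA[0], rB[0] = sA[..., 0], sB[..., 0]        # prologue: stage slice 0
        for k in range(num_k):
            if k < num_k - 1:                        # prefetch slice k + 1 ...
                rA[k + 1], rB[k + 1] = sA[..., k + 1], sB[..., k + 1]
            acc += rA[k] @ rB[k]                     # ... while accumulating slice k
        return acc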