mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mslk/__init__.py +56 -0
- mslk/attention/__init__.py +7 -0
- mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
- mslk/attention/flash_attn/__init__.py +22 -0
- mslk/attention/flash_attn/ampere_helpers.py +104 -0
- mslk/attention/flash_attn/barrier.py +72 -0
- mslk/attention/flash_attn/benchmark.py +269 -0
- mslk/attention/flash_attn/blackwell_helpers.py +754 -0
- mslk/attention/flash_attn/block_info.py +109 -0
- mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
- mslk/attention/flash_attn/block_sparsity.py +219 -0
- mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
- mslk/attention/flash_attn/copy_utils.py +341 -0
- mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
- mslk/attention/flash_attn/fast_math.py +22 -0
- mslk/attention/flash_attn/flash_bwd.py +1262 -0
- mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
- mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
- mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
- mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
- mslk/attention/flash_attn/flash_fwd.py +2471 -0
- mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
- mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
- mslk/attention/flash_attn/hopper_helpers.py +102 -0
- mslk/attention/flash_attn/interface.py +1771 -0
- mslk/attention/flash_attn/mask.py +610 -0
- mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
- mslk/attention/flash_attn/named_barrier.py +32 -0
- mslk/attention/flash_attn/pack_gqa.py +165 -0
- mslk/attention/flash_attn/paged_kv.py +176 -0
- mslk/attention/flash_attn/pipeline.py +273 -0
- mslk/attention/flash_attn/seqlen_info.py +139 -0
- mslk/attention/flash_attn/softmax.py +583 -0
- mslk/attention/flash_attn/testing.py +424 -0
- mslk/attention/flash_attn/tile_scheduler.py +720 -0
- mslk/attention/flash_attn/utils.py +860 -0
- mslk/attention/fmha/__init__.py +967 -0
- mslk/attention/fmha/_triton/__init__.py +6 -0
- mslk/attention/fmha/_triton/available.py +50 -0
- mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
- mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
- mslk/attention/fmha/attn_bias.py +2186 -0
- mslk/attention/fmha/attn_bias_utils.py +536 -0
- mslk/attention/fmha/ck.py +508 -0
- mslk/attention/fmha/ck_decoder.py +141 -0
- mslk/attention/fmha/ck_splitk.py +204 -0
- mslk/attention/fmha/common.py +598 -0
- mslk/attention/fmha/cutlass.py +461 -0
- mslk/attention/fmha/cutlass_blackwell.py +560 -0
- mslk/attention/fmha/dispatch.py +224 -0
- mslk/attention/fmha/flash.py +862 -0
- mslk/attention/fmha/flash3.py +858 -0
- mslk/attention/fmha/flash_mtia.py +245 -0
- mslk/attention/fmha/merge_training.py +192 -0
- mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
- mslk/attention/fmha/torch_attention_compat.py +154 -0
- mslk/attention/fmha/tree_attention.py +718 -0
- mslk/attention/fmha/triton_splitk.py +1378 -0
- mslk/attention/fmha/unbind.py +130 -0
- mslk/attention/fmha/utils/__init__.py +6 -0
- mslk/attention/fmha/utils/bench.py +74 -0
- mslk/attention/fmha/utils/cpp_lib.py +148 -0
- mslk/attention/fmha/utils/op_common.py +65 -0
- mslk/attention/gqa_attn_splitk/__init__.py +11 -0
- mslk/bench/comm/__init__.py +7 -0
- mslk/bench/comm/comm_bench.py +255 -0
- mslk/bench/common/__init__.py +5 -0
- mslk/bench/common/utils.py +148 -0
- mslk/bench/conv/__init__.py +7 -0
- mslk/bench/conv/conv_bench.py +551 -0
- mslk/bench/conv/conv_ops.py +213 -0
- mslk/bench/gemm/__init__.py +7 -0
- mslk/bench/gemm/gemm_bench.py +859 -0
- mslk/bench/gemm/gemm_ops.py +3342 -0
- mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
- mslk/bench/moe/__init__.py +7 -0
- mslk/bench/moe/gather_scatter_bench.py +356 -0
- mslk/bench/quantize/quantize_bench.py +345 -0
- mslk/bench/quantize/quantize_ops.py +266 -0
- mslk/comm/__init__.py +11 -0
- mslk/conv/__init__.py +11 -0
- mslk/gemm/__init__.py +18 -0
- mslk/gemm/triton/__init__.py +7 -0
- mslk/gemm/triton/fp8_gemm.py +2702 -0
- mslk/gemm/triton/grouped_gemm.py +1132 -0
- mslk/gemm/triton/matmul_perf_model.py +237 -0
- mslk/gemm/triton/utils.py +128 -0
- mslk/kv_cache/__init__.py +11 -0
- mslk/moe/__init__.py +26 -0
- mslk/moe/activation.py +291 -0
- mslk/moe/gather_scatter.py +739 -0
- mslk/moe/layers.py +1240 -0
- mslk/moe/shuffling.py +421 -0
- mslk/mslk.so +0 -0
- mslk/quantize/__init__.py +11 -0
- mslk/quantize/shuffle.py +306 -0
- mslk/quantize/triton/__init__.py +7 -0
- mslk/quantize/triton/fp4_quantize.py +5942 -0
- mslk/quantize/triton/fp8_quantize.py +1902 -0
- mslk/testing/__init__.py +7 -0
- mslk/testing/attributes.py +60 -0
- mslk/testing/rocm.py +91 -0
- mslk/utils/__init__.py +7 -0
- mslk/utils/torch/__init__.py +7 -0
- mslk/utils/torch/library.py +150 -0
- mslk/utils/triton/__init__.py +7 -0
- mslk/utils/triton/fp8_utils.py +72 -0
- mslk/utils/triton/utils.py +128 -0
- mslk/version.py +11 -0
- mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
- mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
- mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
- mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
- mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/fmha/__init__.py

@@ -0,0 +1,967 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# pyre-unsafe

from typing import Any, cast, List, Optional, Sequence, Tuple, Type, Union

import torch

from . import (
    attn_bias,
    ck,
    ck_decoder,
    ck_splitk,
    cutlass,
    cutlass_blackwell,
    flash,
    flash3,
    flash_mtia,
    triton_splitk,
)
from .attn_bias import (
    AttentionBias,
    BlockDiagonalMask,
    LowerTriangularMask,
    VARLEN_BIASES,
)
from .common import (
    AttentionBwOpBase,
    AttentionFwOpBase,
    AttentionOp,
    AttentionOpBase,
    bmk2bmhk,
    Context,
    Gradients,
    Inputs,
)
from .dispatch import (
    _dispatch_bw,
    _dispatch_fw,
    _ensure_op_supports_or_raise,
    _get_use_fa3,
    _set_use_fa3,
)

MemoryEfficientAttentionCutlassOp = (cutlass.FwOp, cutlass.BwOp)
MemoryEfficientAttentionCutlassBlackwellOp = (
    cutlass_blackwell.FwOp,
    cutlass_blackwell.BwOp,
)
MemoryEfficientAttentionCutlassFwdFlashBwOp = (cutlass.FwOp, flash.BwOp)
MemoryEfficientAttentionFlashAttentionOp = (flash.FwOp, flash.BwOp)
MemoryEfficientAttentionFlashMtiaAttentionOp = (flash_mtia.FwOp, flash_mtia.BwOp)
MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp)
MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
MemoryEfficientAttentionSplitKCkOp = (ck_splitk.FwOp, ck.BwOp)


def _deserialize_bias(attn_bias_ctx, attn_bias_tensor: Optional[torch.Tensor]) -> Any:
    if attn_bias_tensor is None:
        return attn_bias_ctx
    return attn_bias_tensor


# Note: `torch.compile` only allows custom autograd functions
# to accept a subset of types. Therefore we serialize `op` objects
# to `str` before entering the function, and unserialize them inside.
# See also: https://github.com/pytorch/pytorch/issues/118395
_OPS_LOOKUP = {
    flash.FwOp.NAME: flash.FwOp,
    flash.BwOp.NAME: flash.BwOp,
    flash_mtia.FwOp.NAME: flash_mtia.FwOp,
    flash_mtia.BwOp.NAME: flash_mtia.BwOp,
}


def _serialize_op(op):
    if op is not None and op.NAME in _OPS_LOOKUP:
        return op.NAME
    return op


def _unserialize_op(op):
    if isinstance(op, str):
        return _OPS_LOOKUP[op]
    return op


class _fMHA(torch.autograd.Function):
    @staticmethod
    # type: ignore
    def forward(ctx, op_fw, op_bw, *args: Any) -> Any:
        inp = Inputs(*args)

        op_fw = _unserialize_op(op_fw)
        op_bw = _unserialize_op(op_bw)

        out, op_ctx = _memory_efficient_attention_forward_requires_grad(
            inp=inp, op=op_fw
        )

        # Saving attn_bias is a bit complicated, as the
        # torch part should go in `save_for_backward`
        if isinstance(inp.attn_bias, torch.Tensor):
            attn_bias_tensor = inp.attn_bias
            attn_bias_ctx = None
        else:
            attn_bias_tensor = None
            attn_bias_ctx = inp.attn_bias

        ctx.save_for_backward(
            inp.query,
            inp.key,
            inp.value,
            op_ctx.out,
            op_ctx.lse,
        )
        ctx.rng_state = op_ctx.rng_state
        ctx.attn_bias_tensor = attn_bias_tensor
        if op_ctx.op_bw is not None:
            if op_bw is not None and op_bw is not op_ctx.op_bw:
                raise ValueError(
                    f"Specified op_bw={op_bw.NAME}, but forward op "
                    f"can only run with op_bw={op_ctx.op_bw.NAME}. Please set op_bw=None."
                )
            op_bw = op_ctx.op_bw
        if (
            op_fw is not None
            and op_bw is not None
            and isinstance(inp.attn_bias, VARLEN_BIASES)
            and inp.attn_bias.q_seqinfo.seqstart.shape[0] > 2
            and op_bw.VARLEN_LSE_PACKED != op_fw.VARLEN_LSE_PACKED
        ):
            raise ValueError(
                f"Specified op_bw={op_bw.NAME} is not compatible with the "
                f"op_fw={op_fw.NAME}, because they use different format of logsumexp. "
                f"NOTE: This is new with xFormers 0.0.28"
            )
        if op_bw is None and (
            inp.query.requires_grad or inp.key.requires_grad or inp.value.requires_grad
        ):
            varlen_lse_packed = _detect_lse_packed_or_raise(op_ctx.lse, inp)
            if varlen_lse_packed is not None and op_fw is not None:
                assert op_fw.VARLEN_LSE_PACKED == varlen_lse_packed, (
                    f"{op_fw.NAME}: wrong value for `VARLEN_LSE_PACKED` ?"
                )
            # NOTE: We need to check tensor strides to decide which operator we run in the BW pass.
            # Unfortunately, PyTorch only allows to call this function during the FW pass, so
            # we decide the operator to use now.
            op_bw = _dispatch_bw(inp, varlen_lse_packed=varlen_lse_packed)
        ctx.op_fw = op_fw
        ctx.op_bw = op_bw
        ctx.p = inp.p
        # This allows to create gradients from a single storage,
        # to avoid a "cat" in the BW pass.
        # The heuristic is approximative, but:
        # (1) It's not a big issue to create a shared storage
        # (2) The heuristic needs to pass `torch.compile`
        # (this is also why we run it in the FW pass, the BW pass is stricter)
        ctx.qkv_share_storage = (
            inp.query.shape[0] == inp.key.shape[0]
            and inp.query.shape[-1] == inp.value.shape[-1]
            and inp.query.stride(-2)
            == (inp.key.shape[-1] + inp.query.shape[-1] + inp.value.shape[-1])
        )

        ctx.scale = inp.scale
        ctx.attn_bias_ctx = attn_bias_ctx
        ctx.n_args = len(args)
        return out, op_ctx.lse

    @staticmethod
    @torch.autograd.function.once_differentiable
    def backward(ctx, grad, grad_lse):
        # Re-create context
        query, key, value, out, lse = ctx.saved_tensors
        attn_bias_tensor = ctx.attn_bias_tensor
        rng_state = ctx.rng_state
        inp = Inputs(
            query=query,
            key=key,
            value=value,
            attn_bias=_deserialize_bias(ctx.attn_bias_ctx, attn_bias_tensor),
            p=ctx.p,
            scale=ctx.scale,
        )
        op_ctx = Context(
            lse=lse,
            out=out,
            rng_state=rng_state,
            qkv_share_storage=ctx.qkv_share_storage,
        )
        grads = _memory_efficient_attention_backward(
            ctx=op_ctx,
            inp=inp,
            grad=grad,
            op=ctx.op_bw,
            _skip_op_checks=True,
        )
        return (None, None, grads.dq, grads.dk, grads.dv, grads.db) + (None,) * (
            ctx.n_args - 2
        )


def memory_efficient_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
    *,
    op: Optional[AttentionOp] = None,
    output_dtype: Optional[torch.dtype] = None,
) -> torch.Tensor:
    """Implements the memory-efficient attention mechanism following
    `"Self-Attention Does Not Need O(n^2) Memory" <http://arxiv.org/abs/2112.05682>`_.

    :Inputs shape:

    - Input tensors must be in format ``[B, M, H, K]``, where B is the batch size, M \
        the sequence length, H the number of heads, and K the embeding size per head

    - If inputs have dimension 3, it is assumed that the dimensions are ``[B, M, K]`` and ``H=1``

    - Inputs can also be of dimension 5 with GQA - see note below

    - Inputs can be non-contiguous - we only require the last dimension's stride to be 1


    :Equivalent pytorch code:

    .. code-block:: python

        scale = 1.0 / query.shape[-1] ** 0.5
        query = query * scale
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)
        attn = query @ key.transpose(-2, -1)
        if attn_bias is not None:
            attn = attn + attn_bias
        attn = attn.softmax(-1)
        attn = F.dropout(attn, p)
        attn = attn @ value
        return attn.transpose(1, 2).contiguous()

    :Examples:

    .. code-block:: python

        import xformers.ops as xops

        # Compute regular attention
        y = xops.memory_efficient_attention(q, k, v)

        # With a dropout of 0.2
        y = xops.memory_efficient_attention(q, k, v, p=0.2)

        # Causal attention
        y = xops.memory_efficient_attention(
            q, k, v,
            attn_bias=xops.LowerTriangularMask()
        )

    :Supported hardware:

        NVIDIA GPUs with compute capability above 6.0 (P100+), datatype ``f16``, ``bf16`` and ``f32``.

    :EXPERIMENTAL: Using with Multi Query Attention (MQA) and Grouped Query Attention (GQA):

        MQA/GQA is an experimental feature supported only for the forward pass.
        If you have 16 heads in query, and 2 in key/value, you can provide 5-dim tensors
        in the ``[B, M, G, H, K]`` format, where ``G`` is the number of head groups (here 2), and
        ``H`` is the number of heads per group (8 in the example).

        Please note that xFormers will not automatically broadcast the inputs, so you will need
        to broadcast it manually before calling `memory_efficient_attention`.

    :GQA/MQA example:

    .. code-block:: python

        import torch
        import xformers.ops as xops

        B, M, K = 3, 32, 128
        kwargs = dict(device="cuda", dtype=torch.float16)
        q = torch.randn([B, M, 8, K], **kwargs)
        k = torch.randn([B, M, 2, K], **kwargs)
        v = torch.randn([B, M, 2, K], **kwargs)
        out_gqa = xops.memory_efficient_attention(
            q.reshape([B, M, 2, 4, K]),
            k.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
            v.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
        )

    Raises:
        NotImplementedError: if there is no operator available to compute the MHA
        ValueError: if inputs are invalid

    :parameter query: Tensor of shape ``[B, Mq, H, K]``
    :parameter key: Tensor of shape ``[B, Mkv, H, K]``
    :parameter value: Tensor of shape ``[B, Mkv, H, Kv]``
    :parameter attn_bias: Bias to apply to the attention matrix - defaults to no masking. \
        For common biases implemented efficiently in xFormers, see :attr:`xformers.ops.fmha.attn_bias.AttentionBias`. \
        This can also be a :attr:`torch.Tensor` for an arbitrary mask (slower).
    :parameter p: Dropout probability. Disabled if set to ``0.0``
    :parameter scale: Scaling factor for ``Q @ K.transpose()``. If set to ``None``, the default \
        scale (q.shape[-1]**-0.5) will be used.
    :parameter op: The operators to use - see :attr:`xformers.ops.AttentionOpBase`. \
        If set to ``None`` (recommended), xFormers \
        will dispatch to the best available operator, depending on the inputs \
        and options.
    :return: multi-head attention Tensor with shape ``[B, Mq, H, Kv]``
    """
    return _memory_efficient_attention(
        Inputs(
            query=query,
            key=key,
            value=value,
            p=p,
            attn_bias=attn_bias,
            scale=scale,
            output_dtype=output_dtype,
        ),
        op=op,
    )


torch.library.define(
    "mslk::memory_efficient_attention_forward",
    "(Tensor q, Tensor k, Tensor v, Tensor? b = None, float? p = 0.0, float? scale = None) -> Tensor",
)


def _memory_efficient_attention_forward_torch_wrapper_meta(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
):
    return torch.empty_like(query)


torch.library.impl(
    "mslk::memory_efficient_attention_forward",
    "Meta",
    _memory_efficient_attention_forward_torch_wrapper_meta,
)


# torch.compile has issue when tracing through op dispatch and ensure_op_support
# so provide a wrapper to register it as a custom torch library op.
def _memory_efficient_attention_forward_torch_wrapper(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
) -> torch.Tensor:
    """
    This provides a torch-compilable wrapper op to
    memory_efficient_attention_forward in certain special cases.

    Note that the following are not supported
    - `op` input (?)
    - certain attn_bias types (?)
    - output_dtype
    - K != Kv
    """
    return memory_efficient_attention_forward(
        query,
        key,
        value,
        attn_bias,
        p,
        scale,
    )


torch.library.impl(
    "mslk::memory_efficient_attention_forward",
    "CUDA",
    _memory_efficient_attention_forward_torch_wrapper,
)


torch.library.define(
    "mslk::memory_efficient_attention_forward_with_bias",
    "(Tensor q, Tensor k, Tensor v, Tensor b, float? p = 0.0, float? scale = None) -> Tensor",
)


def _memory_efficient_attention_forward_torch_wrapper_with_bias_meta(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Union[torch.Tensor, AttentionBias],
    p: float = 0.0,
    scale: Optional[float] = None,
):
    return torch.empty_like(query)


torch.library.impl(
    "mslk::memory_efficient_attention_forward_with_bias",
    "Meta",
    _memory_efficient_attention_forward_torch_wrapper_with_bias_meta,
)


# torch.compile has issue when tracing through op dispatch and ensure_op_support
# so provide a wrapper to register it as a custom torch library op.


def _memory_efficient_attention_forward_torch_wrapper_with_bias(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Union[torch.Tensor, AttentionBias],
    p: float = 0.0,
    scale: Optional[float] = None,
) -> torch.Tensor:
    """
    This provides a torch-compilable wrapper op to
    memory_efficient_attention_forward in certain special cases.

    Note that the following are not supported
    - `op` input (?)
    - certain attn_bias types (?)
    - output_dtype
    - K != Kv
    """
    return memory_efficient_attention_forward(
        query,
        key,
        value,
        attn_bias,
        p,
        scale,
    )


torch.library.impl(
    "mslk::memory_efficient_attention_forward_with_bias",
    "CUDA",
    _memory_efficient_attention_forward_torch_wrapper_with_bias,
)


def memory_efficient_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
    *,
    op: Optional[Type[AttentionFwOpBase]] = None,
    output_dtype: Optional[torch.dtype] = None,
) -> torch.Tensor:
    """
    Calculates the forward pass of :attr:`xformers.ops.memory_efficient_attention`.
    """
    return _memory_efficient_attention_forward(
        Inputs(
            query=query,
            key=key,
            value=value,
            p=p,
            attn_bias=attn_bias,
            scale=scale,
            output_dtype=output_dtype,
        ),
        op=op,
    )


def memory_efficient_attention_forward_requires_grad(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
    *,
    op: Optional[Type[AttentionFwOpBase]] = None,
    output_dtype: Optional[torch.dtype] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Returns a tuple (output, lse), where `lse` can be used to compute the backward pass later.
    See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments
    See :attr:`xformers.ops.memory_efficient_attention_backward` for running the backward pass
    """
    if p != 0.0:
        raise NotImplementedError(
            "dropout is not supported on the non-autograd API."
            " If you want to use dropout, please call `memory_efficient_attention` directly"
        )
    out, ctx = _memory_efficient_attention_forward_requires_grad(
        Inputs(
            query=query,
            key=key,
            value=value,
            p=p,
            attn_bias=attn_bias,
            scale=scale,
            output_dtype=output_dtype,
        ),
        op=op,
    )
    return out, ctx.lse


def memory_efficient_attention_backward(
    grad: torch.Tensor,
    output: torch.Tensor,
    lse: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
    *,
    op: Optional[Type[AttentionBwOpBase]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Computes the gradient of the attention.
    Returns a tuple (dq, dk, dv)
    See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments.
    `lse` is the tensor returned by
    :attr:`xformers.ops.memory_efficient_attention_forward_requires_grad`
    """
    if p != 0.0:
        raise NotImplementedError(
            "dropout is not supported on the non-autograd API."
            " If you want to use dropout, please call `memory_efficient_attention` directly"
        )
    gradients = _memory_efficient_attention_backward(
        Context(out=output, lse=lse),
        Inputs(
            query=query, key=key, value=value, p=p, attn_bias=attn_bias, scale=scale
        ),
        grad,
        op=op,
    )
    return (gradients.dq, gradients.dk, gradients.dv)


def _memory_efficient_attention(
    inp: Inputs, op: Optional[AttentionOp] = None
) -> torch.Tensor:
    # fast-path that doesn't require computing the logsumexp for backward computation
    if all(x.requires_grad is False for x in [inp.query, inp.key, inp.value]):
        return _memory_efficient_attention_forward(
            inp, op=op[0] if op is not None else None
        )

    output_shape = inp.normalize_bmhk()

    op_fw = _serialize_op(op[0] if op is not None else None)
    op_bw = _serialize_op(op[1] if op is not None else None)
    return _fMHA.apply(
        op_fw, op_bw, inp.query, inp.key, inp.value, inp.attn_bias, inp.p, inp.scale
    )[0].reshape(output_shape)


def _memory_efficient_attention_forward(
    inp: Inputs, op: Optional[Type[AttentionFwOpBase]]
) -> torch.Tensor:
    inp.validate_inputs()
    output_shape = inp.normalize_bmhk()
    if op is None:
        op = _dispatch_fw(inp, False)
    else:
        _ensure_op_supports_or_raise(ValueError, "memory_efficient_attention", op, inp)

    out, *_ = op.apply(inp, needs_gradient=False)
    return out.reshape(output_shape)


def _memory_efficient_attention_forward_requires_grad(
    inp: Inputs, op: Optional[Type[AttentionFwOpBase]]
) -> Tuple[torch.Tensor, Context]:
    inp.validate_inputs()
    output_shape = inp.normalize_bmhk()
    if op is None:
        op = _dispatch_fw(inp, True)
    else:
        _ensure_op_supports_or_raise(ValueError, "memory_efficient_attention", op, inp)
    out, ctx = op.apply(inp, needs_gradient=True)
    assert ctx is not None
    return (out.reshape(output_shape), ctx)


def _detect_lse_packed_or_raise(lse: torch.Tensor, inp: Inputs) -> Optional[bool]:
    """
    Detects the LSE format if we're in a varlen case.
    Returns `None` if the format is not relevant (eg not varlen)
    Raises an exception if the `lse` has the wrong shape
    """
    shape_mismatch_err = (
        "Input tensors have incompatible shapes.\n"
        f"  lse.shape   : {lse.shape}\n"
        f"  query.shape : {inp.query.shape}\n"
        f"  attn_bias   : {type(inp.attn_bias)}"
    )
    # 1. Check ndim & head dimensions
    # In any case, LSE should be [*, *GH]
    if lse.ndim != (inp.query.ndim - 1) or lse.shape[1:-1] != inp.query.shape[2:-1]:
        raise ValueError(shape_mismatch_err)
    lse_bm = [lse.shape[0], lse.shape[-1]]
    lse_packed_shape = [inp.query.shape[0], inp.query.shape[1]]
    lse_packed = lse_bm[0] == lse_packed_shape[0] and lse_bm >= lse_packed_shape
    # 2. Check correctness for varlen biases with query.shape = [1, M, *GH, K]
    # Either [1, *GH, M] (packed)
    # Or [num_seq, *GH, Mq] .. with `Mq >= max_q` (padded)
    if isinstance(inp.attn_bias, VARLEN_BIASES):
        si = inp.attn_bias.q_seqinfo
        lse_padded_shape = [si.seqstart.shape[0] - 1, si.max_seqlen]
        lse_padded = lse_bm[0] == lse_padded_shape[0] and lse_bm >= lse_padded_shape
        if lse_packed and lse_padded:
            return None
        elif lse_packed:
            return True
        elif lse_padded:
            return False
        raise ValueError(shape_mismatch_err)
    # 3. For non-varlen, shape must be [B, *GH] with query.shape=[B, M, *GH, K]
    if not lse_packed:
        raise ValueError(shape_mismatch_err)
    return None


def _memory_efficient_attention_backward(
    ctx: Context,
    inp: Inputs,
    grad: torch.Tensor,
    op: Optional[Type[AttentionBwOpBase]],
    *,
    _skip_op_checks: bool = False,
) -> Gradients:
    """Warning: grad/ctx.out is potentially in BMK format"""
    inp.validate_inputs()
    if grad.ndim != inp.query.ndim or grad.ndim != ctx.out.ndim:
        raise ValueError(
            "All tensors should be either in BMK (ndim=3) or BMHK (ndim=4) format. \n"
            f"grad.shape : {grad.shape} \n"
            f"out.shape  : {ctx.out.shape} \n"
            f"query.shape: {inp.query.shape}"
        )
    shape_dq, shape_dk, shape_dv = tuple(
        x.shape for x in (inp.query, inp.key, inp.value)
    )
    inp.normalize_bmhk()
    varlen_lse_packed = _detect_lse_packed_or_raise(ctx.lse, inp)
    grad = bmk2bmhk(grad, 1)
    ctx.out = bmk2bmhk(ctx.out, 1)

    if op is None:
        op = _dispatch_bw(inp, varlen_lse_packed=varlen_lse_packed)
    elif not _skip_op_checks:
        _ensure_op_supports_or_raise(
            ValueError, "memory_efficient_attention_backward", op, inp
        )
        if varlen_lse_packed is not None and varlen_lse_packed != op.VARLEN_LSE_PACKED:
            raise ValueError(
                f"Wrong LSE format for {op.NAME} in variable seqlen case. "
                f"Double-check that the BW operator {op.NAME} is compatible "
                f"with the operator used in the FW pass."
            )

    grads = op.apply(ctx, inp, grad)
    grads.dq = grads.dq.reshape(shape_dq)
    grads.dk = grads.dk.reshape(shape_dk)
    grads.dv = grads.dv.reshape(shape_dv)
    return grads


def memory_efficient_attention_partial(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
    *,
    op: Optional[Union[AttentionOp, Type[AttentionFwOpBase]]] = None,
    output_dtype: Optional[torch.dtype] = None,
    _allow_backward: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Returns a tuple (output, lse), where `output` is the attention in the style of
    memory_efficient_attention, and `lse` is extra data, a log-sum-exp.
    The outputs of calls to this with the same query and separate keys and values
    can be merged with merge_attentions to obtain the attention of the queries
    against the disjoint union of the keys and values.

    This function doesn't have a backward pass.

    If _allow_backward is set to True, then a backward pass is allowed,
    but it is restricted: only the gradient of the output, not the gradient of
    the LSE, is used.
    Note that this makes it very easy to accidentally get wrong gradients.
    """
    if p != 0.0:
        raise NotImplementedError("dropout is not supported.")
    fwop: Optional[Type[AttentionFwOpBase]] = op[0] if isinstance(op, tuple) else op
    inp = Inputs(
        query=query,
        key=key,
        value=value,
        p=p,
        attn_bias=attn_bias,
        scale=scale,
        output_dtype=output_dtype,
        is_partial=True,
    )
    is_grad = (
        _allow_backward
        and torch.is_grad_enabled()
        and any(x.requires_grad for x in [query, key, value])
    )

    if not is_grad:
        out, ctx = _memory_efficient_attention_forward_requires_grad(
            inp,
            op=fwop,
        )
        return out, ctx.lse

    if query.ndim == 5:
        raise ValueError("gradients not supported for 5D tensors")
    if isinstance(op, tuple):
        op_fw = _serialize_op(op[0])
        op_bw = _serialize_op(op[1])
    elif op is None:
        op_fw = op_bw = None
    else:
        op_fw = _serialize_op(op)
        op_bw = None
    return _fMHA.apply(
        op_fw,
        op_bw,
        inp.query,
        inp.key,
        inp.value,
        inp.attn_bias,
        inp.p,
        inp.scale,
        inp.output_dtype,
        inp.is_partial,
    )


def merge_attentions(  # noqa: C901
    attn_split: Union[torch.Tensor, Sequence[torch.Tensor]],
    lse_split: Union[torch.Tensor, Sequence[torch.Tensor]],
    write_lse: bool = True,
    output_dtype: Optional[torch.dtype] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """
    Combine attention output computed on different parts of K/V for the same
    query to get attention on the whole K/V. See https://arxiv.org/abs/2402.05099
    The result is equal to
        Out_full = (Out1 * exp(LSE1) + Out2 * exp(LSE2) + ...) / (exp(LSE1) + exp(LSE2) + ...)
        LSE_full = log(exp(LSE1) + exp(LSE2) + ...)

    Args:
        attn_split: attention outputs for chunks,
            either as a list of tensors of shapes [B, M, G, H, Kq] or [B, M, H, Kq]
            or as a single tensor of shape [num_chunks, B, M, G, H, Kq]
            or [num_chunks, B, M, H, Kq]
        lse_split: LSE for chunks,
            either as a list of tensors of shapes [B, G, H, M] or [B, H, M]
            or as a single tensor of shape [num_chunks, B, G, H, M] or [num_chunks, B, H, M]
        write_lse: whether to output LSE
        output_dtype: dtype of attn_out

    Returns:
        attn_out: [B, M, G, H, Kq] or [B, M, H, Kq]
        lse_out: [B, G, H, M] or [B, H, M] if write_lse
            or None otherwise
    """

    attn_is_concat = isinstance(attn_split, torch.Tensor)
    lse_is_concat = isinstance(lse_split, torch.Tensor)

    attn_requires_grad = (
        attn_split.requires_grad  # type: ignore
        if attn_is_concat
        else any(x.requires_grad for x in attn_split)
    )
    lse_requires_grad = (
        lse_split.requires_grad  # type: ignore
        if lse_is_concat
        else any(x.requires_grad for x in lse_split)
    )
    requires_grad = torch.is_grad_enabled() and (
        attn_requires_grad or lse_requires_grad
    )
    if requires_grad and not write_lse:
        raise ValueError("write_lse should be true if inputs require gradients.")

    concat_path = attn_is_concat and lse_is_concat and not requires_grad
    if concat_path:
        attn_split = cast(torch.Tensor, attn_split)
        lse_split = cast(torch.Tensor, lse_split)
        if attn_split.ndim != lse_split.ndim + 1:
            raise ValueError(
                f"Incompatible input shapes: {attn_split.shape=}, {lse_split.shape=}"
            )

        is_bmhk = attn_split.ndim == 5
        if is_bmhk:
            attn_split = attn_split.unsqueeze(3)
            lse_split = lse_split.unsqueeze(2)

        num_chunks, B, M, G, H, Kq = attn_split.shape
        num_chunks1, B1, G1, H1, M1 = lse_split.shape
        if B != B1 or G != G1 or H != H1 or num_chunks != num_chunks1 or M != M:
            raise ValueError(
                f"Incompatible input shapes: {attn_split.shape=} {lse_split.shape=} "
                f"{B}/{B1}, {G}/{G1}, {H}/{H1}, {num_chunks}/{num_chunks1}, {M}/{M}"
            )

        attn_split = attn_split.permute(1, 3, 4, 0, 2, 5)
        lse_split = lse_split.permute(1, 2, 3, 0, 4)

        device = attn_split.device
        attn_dtype = attn_split.dtype
        lse_dtype = lse_split.dtype
    else:
        if attn_is_concat:
            attn_split = attn_split.unbind(0)  # type: ignore
        if lse_is_concat:
            lse_split = lse_split.unbind(0)  # type: ignore
        num_chunks = len(attn_split)
        if len(lse_split) != num_chunks:
            raise ValueError(
                f"Incompatible number of LSE and attention chunks: {len(attn_split)=}, {len(lse_split)=}"
            )

        attn_unsqueezed = []
        lse_unsqueezed = []
        is_bmhk = False
        for i in range(num_chunks):
            if attn_split[i].ndim != lse_split[i].ndim + 1:
                raise ValueError(
                    f"Incompatible input shapes for chunk {i}: {attn_split[i].shape=}, {lse_split[i].shape=}"
                )

            is_bmhk = attn_split[i].ndim == 4
            if is_bmhk:
                attn_unsqueezed.append(attn_split[i].unsqueeze(2))
                lse_unsqueezed.append(lse_split[i].unsqueeze(1))
            else:
                attn_unsqueezed.append(attn_split[i])
                lse_unsqueezed.append(lse_split[i])
        attn_split, lse_split = attn_unsqueezed, lse_unsqueezed

        B, M, G, H, Kq = attn_split[0].shape
        B1, G1, H1, M1 = lse_split[0].shape
        if B != B1 or G != G1 or H != H1 or M != M:
            raise ValueError(
                f"Incompatible input shapes: {attn_split[0].shape=}, {lse_split[0].shape=} "
                f"{B}/{B1}, {G}/{G1}, {H}/{H1}, {M}/{M}"
            )

        for i in range(num_chunks):
            if attn_split[i].shape != (B, M, G, H, Kq):
                raise ValueError(
                    f"Incompatible input shapes for attention chunk {i}: "
                    f"{attn_split[i].shape=}, {(B, M, G, H, Kq)=}"
                )
            if lse_split[i].shape != (B, G, H, M):
                raise ValueError(
                    f"Incompatible input shapes for LSE chunk {i}: "
                    f"{lse_split[i].shape=}, {(B, G, H, M)=}"
                )

            attn_split[i] = attn_split[i].permute(0, 2, 3, 1, 4)  # to (B, G, H, M, Kq)

        device = attn_split[0].device
        attn_dtype = attn_split[0].dtype
        lse_dtype = lse_split[0].dtype

    if concat_path:
        attn_out = torch.empty(
            B,
            M,
            G,
            H,
            Kq,
            device=device,
            dtype=output_dtype or attn_dtype,
        )
        if write_lse:
            lse_out = torch.empty(
                B,
                G,
                H,
                M,
                device=device,
                dtype=lse_dtype,
            )
        else:
            lse_out = None
        triton_splitk.merge_attentions(attn_out, lse_out, attn_split, lse_split)  # type: ignore
    else:
        outs = triton_splitk.merge_attentions_varargs(
            attn_split, lse_split, write_lse, output_dtype, B, M, G, H, Kq
        )  # type: ignore
        attn_out = outs[0]
        lse_out = outs[1] if write_lse else None

    if is_bmhk:
        attn_out = attn_out[:, :, 0]
        if lse_out is not None:
            lse_out = lse_out[:, 0]

    return attn_out, lse_out


ALL_FW_OPS: List[Type[AttentionFwOpBase]] = [
    cutlass.FwOp if torch.version.cuda else ck.FwOp,
    cutlass_blackwell.FwOp,
    flash.FwOp,
    flash_mtia.FwOp,
    flash3.FwOp,
    triton_splitk.FwOp,
]

ALL_BW_OPS: List[Type[AttentionBwOpBase]] = [
    cutlass.BwOp if torch.version.cuda else ck.BwOp,
    cutlass_blackwell.BwOp,
    flash.BwOp,
    flash_mtia.BwOp,
    flash3.BwOp,
]

__all__ = [
    "AttentionBias",
    "AttentionOp",
    "AttentionOpBase",
    "LowerTriangularMask",
    "MemoryEfficientAttentionCutlassFwdFlashBwOp",
    "MemoryEfficientAttentionCutlassOp",
    "MemoryEfficientAttentionFlashAttentionOp",
    "MemoryEfficientAttentionFlashMtiaAttentionOp",
    "memory_efficient_attention",
    "MemoryEfficientAttentionCkOp",
    "MemoryEfficientAttentionCkDecoderOp",
    "ALL_FW_OPS",
    "ALL_BW_OPS",
    "attn_bias",
    "_get_use_fa3",
    "_set_use_fa3",
    "BlockDiagonalMask",
]
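
The docstrings of `memory_efficient_attention_partial` and `merge_attentions` above describe the chunked-KV flow and its LSE-merge formula, but do not show the two calls together. Below is a minimal sketch of that flow, assuming the wheel is installed, a CUDA device is available, and the functions are imported from `mslk.attention.fmha` (the module this diff shows); the shapes, dtype, and tolerances are illustrative only and not part of the package.

# Illustrative sketch (not part of the packaged file): attention over K/V split into
# chunks, recombined with merge_attentions per the formula in its docstring:
#   Out_full = (Out1 * exp(LSE1) + Out2 * exp(LSE2) + ...) / (exp(LSE1) + exp(LSE2) + ...)
import torch

from mslk.attention.fmha import (
    memory_efficient_attention,
    memory_efficient_attention_partial,
    merge_attentions,
)

B, M, H, K = 2, 1024, 8, 128
q = torch.randn([B, M, H, K], device="cuda", dtype=torch.float16)
k = torch.randn([B, M, H, K], device="cuda", dtype=torch.float16)
v = torch.randn([B, M, H, K], device="cuda", dtype=torch.float16)

# Attend to each half of K/V separately; each call returns (partial output, LSE).
attn_chunks, lse_chunks = [], []
for k_chunk, v_chunk in zip(k.chunk(2, dim=1), v.chunk(2, dim=1)):
    out_i, lse_i = memory_efficient_attention_partial(q, k_chunk, v_chunk)
    attn_chunks.append(out_i)  # [B, M, H, K]
    lse_chunks.append(lse_i)   # [B, H, M]

merged, merged_lse = merge_attentions(attn_chunks, lse_chunks)

# Should match attention against the full K/V, up to fp16 tolerance.
reference = memory_efficient_attention(q, k, v)
assert torch.allclose(merged, reference, atol=2e-2, rtol=2e-2)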
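
Similarly, `memory_efficient_attention_forward_requires_grad` and `memory_efficient_attention_backward` form the non-autograd pair documented above: the `lse` returned by the forward call is threaded into the backward call. The sketch below is under the same assumptions as the previous one; the gradient of the output is a stand-in value, not a real loss gradient.

# Illustrative sketch (not part of the packaged file): manual forward/backward
# using the non-autograd API; `lse` from the forward call feeds the backward call.
import torch

from mslk.attention.fmha import (
    memory_efficient_attention_backward,
    memory_efficient_attention_forward_requires_grad,
)

B, M, H, K = 2, 512, 8, 64
q = torch.randn([B, M, H, K], device="cuda", dtype=torch.bfloat16)
k = torch.randn([B, M, H, K], device="cuda", dtype=torch.bfloat16)
v = torch.randn([B, M, H, K], device="cuda", dtype=torch.bfloat16)

out, lse = memory_efficient_attention_forward_requires_grad(q, k, v)

# Gradient of some scalar loss w.r.t. `out`; here simply ones as a placeholder.
grad_out = torch.ones_like(out)
dq, dk, dv = memory_efficient_attention_backward(grad_out, out, lse, q, k, v)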