mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/fmha/attn_bias_utils.py
@@ -0,0 +1,536 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+ # pyre-unsafe
+
+ import math
+ import random
+ from typing import List, Optional, Sequence, Tuple, Type
+
+ import torch
+
+ from .. import fmha
+ from .attn_bias import AttentionBias
+ from .common import AttentionOpBase
+
+
+ def _create_aligned_bias(*shape: int, **kwargs) -> torch.Tensor:
+     align_to = 8
+     return (
+         torch.randn(
+             (
+                 *shape[:-1],
+                 align_to * ((shape[-1] + align_to - 1) // align_to),
+             ),
+             **kwargs,
+         )
+         * 3
+     ).narrow(-1, 0, shape[-1])
+
+
+ def create_attn_bias(  # noqa: C901
+     bias_type,
+     batch_size: int,
+     num_heads: int,
+     num_heads_groups: int,
+     q_len: int,
+     kv_len: int,
+     device,
+     dtype,
+     requires_grad: bool,
+     fmt: str,
+     op: Optional[Type[AttentionOpBase]] = None,
+     page_size: Optional[int] = None,
+ ):
+     if bias_type is None or isinstance(None, bias_type):
+         return None
+     r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+     window_size = {0: 3, 1: 128, 2: 300}[r.randint(0, 2)]
+     if bias_type is torch.Tensor:
+         if fmt == "BMK":
+             batch_size *= num_heads
+             num_heads = 1
+         if op is not None and issubclass(op, fmha.triton_splitk.FwOp):
+             attn_bias = (
+                 torch.randn(
+                     (batch_size, num_heads_groups, num_heads, q_len, kv_len),
+                     device=device,
+                     dtype=dtype,
+                 )
+                 * 3
+             )
+             if fmt in ["BMK", "BMHK"]:
+                 attn_bias = attn_bias[:, 0]
+         else:
+             attn_bias = _create_aligned_bias(
+                 batch_size,
+                 num_heads_groups,
+                 num_heads,
+                 q_len,
+                 kv_len,
+                 device=device,
+                 dtype=dtype,
+             )
+
+             # make sure it also works if the first columns/rows are partially masked out
+             attn_bias[0, 0, 0, : q_len - 1, : kv_len - 1] = -math.inf
+             if fmt in ["BMK", "BMHK"]:
+                 attn_bias = attn_bias[:, 0]
+
+         if requires_grad:
+             attn_bias.requires_grad_(True)
+         if fmt == "BMK":
+             attn_bias = attn_bias[:, 0]
+         return attn_bias
+     if bias_type is fmha.attn_bias.LowerTriangularMask:
+         return bias_type()
+     if bias_type is fmha.attn_bias.LowerTriangularFromBottomRightMask:
+         return bias_type()
+     if bias_type is fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask:
+         return bias_type(window_size)
+     if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+         attn_bias = _create_aligned_bias(
+             batch_size,
+             num_heads_groups,
+             num_heads,
+             q_len,
+             kv_len,
+             device=device,
+             dtype=dtype,
+         )
+         if fmt in ["BMK", "BMHK"]:
+             attn_bias = attn_bias[:, 0]
+         if fmt == "BMK":
+             attn_bias = attn_bias[:, 0]
+         if requires_grad:
+             attn_bias.requires_grad_(True)
+         return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+     if bias_type in [
+         fmha.attn_bias.BlockDiagonalMask,
+         fmha.attn_bias.BlockDiagonalCausalMask,
+         fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+         fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
+         fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+     ]:
+         # These bias types are not supported in BMK format
+         assert fmt in ["BMGHK", "BMHK"]
+         max_q_minus_k = None
+         if bias_type in {
+             fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+             fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+         }:
+             max_q_minus_k = 0
+         elif bias_type == fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask:
+             assert window_size is not None
+             max_q_minus_k = window_size - 1
+
+         block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+             *_rand_seqlens(
+                 r,
+                 batch_size,
+                 q_len,
+                 kv_len,
+                 max_q_minus_k=max_q_minus_k,
+             )
+         )
+         if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+             block_diag = block_diag.make_causal()
+         if bias_type in {
+             fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
+             fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+         }:
+             block_diag = fmha.attn_bias.BlockDiagonalMask(
+                 q_seqinfo=block_diag.q_seqinfo,
+                 k_seqinfo=block_diag.k_seqinfo,
+                 _batch_sizes=block_diag._batch_sizes,
+             )
+             assert window_size is not None
+             if bias_type is fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask:
+                 block_diag = block_diag.make_local_attention(window_size)
+             else:
+                 block_diag = block_diag.make_local_attention_from_bottomright(
+                     window_size
+                 )
+         if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+             block_diag = block_diag.make_causal_from_bottomright()
+         return block_diag
+     if bias_type in [
+         fmha.attn_bias.BlockDiagonalPaddedKeysMask,
+         fmha.attn_bias.BlockDiagonalLocalAttentionPaddedKeysMask,
+         fmha.attn_bias.BlockDiagonalCausalLocalAttentionPaddedKeysMask,
+         fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+         fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask,
+         fmha.attn_bias.PagedBlockDiagonalCausalLocalPaddedKeysMask,
+         fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
+     ]:
+         assert fmt in ["BMHK", "BMGHK"]
+         q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+         block_diag_type = (
+             bias_type._UNPAGED_TYPE
+             if issubclass(bias_type, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask)
+             else bias_type
+         )
+         if bias_type in [
+             fmha.attn_bias.BlockDiagonalCausalLocalAttentionPaddedKeysMask,
+             fmha.attn_bias.PagedBlockDiagonalCausalLocalPaddedKeysMask,
+         ]:
+             g_block_diag = block_diag_type.from_seqlens_local(  # type: ignore
+                 q_seqlen=q,
+                 kv_padding=kv_len,
+                 kv_seqlen=k,
+                 window_size=min(window_size, min(k)),
+             )
+         elif bias_type is fmha.attn_bias.BlockDiagonalLocalAttentionPaddedKeysMask:
+             g_block_diag = block_diag_type.from_seqlens_local(
+                 q_seqlen=q,
+                 kv_padding=kv_len,
+                 kv_seqlen=k,
+                 window_left=max(window_size, max(q)) + 1,
+                 window_right=max(window_size, max(q)) + 1,
+             )
+         else:
+             g_block_diag = block_diag_type.from_seqlens(  # type: ignore
+                 q_seqlen=q,
+                 kv_padding=kv_len,  # type: ignore
+                 kv_seqlen=k,
+             )
+         if issubclass(bias_type, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask):
+             assert page_size is not None
+             pages_per_row = (kv_len + page_size - 1) // page_size
+             block_tables = torch.tensor(
+                 r.sample(range(batch_size * pages_per_row), batch_size * pages_per_row),
+                 device=device,
+                 dtype=torch.int32,
+             ).reshape(batch_size, pages_per_row)
+             return g_block_diag.make_paged(
+                 block_tables=block_tables, page_size=page_size, paged_type=bias_type
+             )
+         return g_block_diag
+     if bias_type in [
+         fmha.attn_bias.BlockDiagonalCausalWithOffsetGappyKeysMask,
+         fmha.attn_bias.BlockDiagonalGappyKeysMask,
+         fmha.attn_bias.BlockDiagonalLocalAttentionFromBottomRightGappyKeysMask,
+     ]:
+         assert fmt in ["BMHK", "BMGHK"]
+         max_q_minus_k = (
+             None if bias_type is fmha.attn_bias.BlockDiagonalGappyKeysMask else 0
+         )
+         q, k = _rand_seqlens(r, batch_size, q_len, kv_len, max_q_minus_k)
+         total_kv_len = kv_len * batch_size
+         starts = [r.randint(0, total_kv_len - ki) for ki in k] + [total_kv_len]
+         if (
+             bias_type
+             is fmha.attn_bias.BlockDiagonalLocalAttentionFromBottomRightGappyKeysMask
+         ):
+             return bias_type.from_seqlens_local_gappy(
+                 q_seqlen=q,
+                 kv_seqstarts=starts,
+                 kv_seqlen=k,
+                 window_left=r.randint(0, 5),
+                 window_right=r.randint(0, 5),
+                 device=device,
+             )
+
+         return bias_type.from_seqlens(
+             q_seqlen=q,
+             kv_seqstarts=starts,
+             kv_seqlen=k,
+         )
+     if issubclass(bias_type, fmha.attn_bias.PagedBlockDiagonalGappyKeysMask):
+         assert fmt in ["BMHK", "BMGHK"]
+         assert page_size is not None
+         pages_per_row = (kv_len + page_size - 1) // page_size
+         total_queries = q_len * batch_size
+         if issubclass(
+             bias_type, fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetGappyKeysMask
+         ):
+             q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+         else:
+             q = _rand_maxed_partition(
+                 r, total_queries, batch_size, total_queries, False
+             )
+             k = [r.randint(1, kv_len) for _ in range(batch_size)]
+         row_size = pages_per_row * page_size
+         starts = [row_size * i + r.randint(0, row_size - ki) for i, ki in enumerate(k)]
+         starts.append(pages_per_row * batch_size * page_size)
+         block_diag_type = bias_type._UNPAGED_TYPE  # type: ignore
+         g_block_diag = block_diag_type.from_seqlens(
+             q_seqlen=q,
+             kv_seqstarts=starts,
+             kv_seqlen=k,
+         )
+         block_tables = torch.tensor(
+             r.sample(range(batch_size * pages_per_row), batch_size * pages_per_row),
+             device=device,
+             dtype=torch.int32,
+         ).reshape(batch_size, pages_per_row)
+         return g_block_diag.make_paged(
+             block_tables=block_tables,
+             page_size=page_size,
+             paged_type=bias_type,
+             notional_padding=page_size * pages_per_row,
+         )
+     if bias_type == fmha.attn_bias.LocalAttentionFromBottomRightMask:
+         return bias_type(
+             window_left=r.randint(0, 5),
+             window_right=r.randint(0, 5),
+         )
+
+     raise AssertionError(f"Unsupported bias type: {bias_type}")
+
+
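As an illustration of the dispatcher above, a usage sketch follows. These lines are not part of the packaged file; the module path is assumed from the file list, and the printed type is only indicative.

import torch

from mslk.attention import fmha
from mslk.attention.fmha.attn_bias_utils import create_attn_bias  # assumed module path

# BlockDiagonalCausalMask goes through the _rand_seqlens branch and returns a
# block-diagonal mask object carrying q/k sequence metadata, not a dense tensor.
bias = create_attn_bias(
    fmha.attn_bias.BlockDiagonalCausalMask,
    batch_size=2,
    num_heads=4,
    num_heads_groups=1,
    q_len=32,
    kv_len=32,
    device="cpu",
    dtype=torch.float16,
    requires_grad=False,
    fmt="BMHK",
)
print(type(bias))  # a block-diagonal causal mask object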
+ def _rand_seqlens(
+     r: random.Random,
+     bs: int,
+     q_len: int,
+     kv_len: int,
+     max_q_minus_k: Optional[int],
+ ) -> Tuple[Sequence[int], Sequence[int]]:
+     """
+     Generates lists of lengths of query blocks and corresponding key blocks.
+     The total number of queries will be bs * q_len and the
+     total number of keys will be bs * kv_len.
+     max_q_minus_k: maximum allowed num_queries - num_keys.
+         For "bottom-right" masks it's 0: we need at least as many keys as
+         queries, otherwise some queries have no keys to attend to.
+         For BlockDiagonalCausalMask it's None, there is no constraint
+         on num_queries - num_keys.
+         For BlockDiagonalCausalLocalAttentionMask it's equal
+         to the window size.
+     """
+     if max_q_minus_k == 0:
+         # In case max_q_minus_k > 0 the exact condition is
+         # kv_len >= q_len - max_q_minus_k * batch_size,
+         # but we can't check it without knowing the actual batch size,
+         # which is determined in the loop below.
+         assert kv_len >= q_len
+     q_len *= bs
+     kv_len *= bs
+     seqlens_q: List[int] = []
+     seqlens_k: List[int] = []
+
+     step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+     step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+     while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+         if max_q_minus_k is None:
+             # Simple case - no constraint on the number of queries and keys.
+             num_queries = r.randrange(*step_q)
+             seqlens_q.append(num_queries)
+             seqlens_k.append(r.randrange(*step_k))
+         else:
+             # In this case we need to make sure num_queries - num_keys < max_q_minus_k holds for every batch element.
+             # To do this, when choosing num_queries and num_keys at a given step,
+             # we ensure two conditions are satisfied:
+             # 1) num_queries <= num_keys + max_q_minus_k for the current batch element
+             # 2) Same holds for the remaining keys and queries, i.e.
+             #    queries_left - num_queries <= keys_left - num_keys + max_q_minus_k
+             keys_left = kv_len - sum(seqlens_k, 0)
+             queries_left = q_len - sum(seqlens_q, 0)
+
+             assert keys_left >= queries_left - max_q_minus_k, (
+                 f"{keys_left=} {queries_left=} {max_q_minus_k=} {kv_len=} {q_len=} {seqlens_k=} {seqlens_q=}"
+             )
+             # Limit num_queries from above: if num_queries > keys_left + max_q_minus_k,
+             # condition num_queries <= num_keys + max_q_minus_k can't be satisfied even if we take
+             # all the remaining keys
+             max_queries_to_take = min(queries_left, keys_left + max_q_minus_k)
+             num_queries = r.randrange(1, max_queries_to_take + 1)
+             seqlens_q.append(num_queries)
+
+             # Now we know num_queries, let's select num_keys.
+             # How many keys can we use for the current batch element so that
+             # for the remaining keys and values the constraint
+             # num_queries - num_keys < max_q_minus_k holds on the next step?
+             extra_keys_available = keys_left - queries_left + max_q_minus_k + 1
+             assert extra_keys_available >= 0
+             if extra_keys_available > 0:
+                 seqlens_k.append(num_queries + r.randrange(0, extra_keys_available))
+             else:
+                 seqlens_k.append(num_queries)
+     seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+     seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+     return seqlens_q, seqlens_k
+
+
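The docstring above promises that the generated blocks tile exactly bs * q_len queries and bs * kv_len keys. A quick sanity sketch, calling the private helper directly under the same assumed module path (not part of the packaged file):

import random

from mslk.attention.fmha.attn_bias_utils import _rand_seqlens  # assumed module path

r = random.Random(0)
seqlens_q, seqlens_k = _rand_seqlens(r, bs=3, q_len=16, kv_len=32, max_q_minus_k=0)
# One query block and one key block per "batch element", tiling the totals exactly.
assert len(seqlens_q) == len(seqlens_k)
assert sum(seqlens_q) == 3 * 16
assert sum(seqlens_k) == 3 * 32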
+ def _rand_maxed_partition(
+     r: random.Random, total: int, n: int, mx: int, positive: bool = True
+ ) -> List[int]:
+     # returns list of n nonnegative integers less than mx summing to total
+     # NB: This is unfortunately biased towards evenly-split bins.
+     # If `positive`, outputs are positive
+     if positive:
+         total -= n
+         mx -= 1
+     idxs = r.sample(range(n * mx), total)
+     y = torch.zeros(n, mx, dtype=torch.int32)
+     y.flatten()[idxs] = 1
+     z = y.sum(1)
+     if positive:
+         z += 1
+     return z.tolist()
+
+
+ def _rand_seqlens_padded_k(
+     r: random.Random, bs: int, q_len: int, kv_len: int
+ ) -> Tuple[Sequence[int], Sequence[int]]:
+     # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+     # We need q_seqlens and k_seqlens to be of len bsz.
+     # For each "batch element" there must be at least as many keys as queries,
+     # because this bias type is "bottom right" and so any extra queries
+     # will attend to nothing and have undefined result.
+     # In addition every element of k_seqlens must be <= kv_len
+     if q_len > kv_len:
+         raise ValueError("need at least as many keys as queries")
+     if q_len == kv_len:
+         # all key slots are needed so we cannot have padding
+         q_seqlens = k_seqlens = [kv_len] * bs
+     else:
+         q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+         k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+     return q_seqlens, k_seqlens
+
+
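A small check of the partition helper's contract, again a sketch under the same assumed import path: with positive=True each of the n entries is at least 1, at most mx, and the entries sum to total.

import random

from mslk.attention.fmha.attn_bias_utils import _rand_maxed_partition  # assumed module path

r = random.Random(0)
parts = _rand_maxed_partition(r, total=10, n=4, mx=5)
assert len(parts) == 4
assert sum(parts) == 10
assert all(1 <= p <= 5 for p in parts)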
+ def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+     if q.ndim == 5:
+
+         def attn_bias_group(group: int):
+             if isinstance(attn_bias, torch.Tensor):
+                 return attn_bias[:, group]
+             if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                 return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                     attn_bias._bias[:, group]
+                 )
+             return attn_bias
+
+         return torch.stack(
+             [
+                 ref_attention_bmhk(
+                     q[:, :, g],
+                     k[:, :, g],
+                     v[:, :, g],
+                     scale=scale,
+                     attn_bias=attn_bias_group(g),
+                 )
+                 for g in range(q.shape[2])
+             ],
+             dim=2,
+         )
+     if q.ndim == 4:
+         assert p == 0.0
+         return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
+     q = q.float()
+     k = k.float()
+     v = v.float()
+
+     scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+     q = q * scale
+
+     attn = q @ k.transpose(-2, -1)
+     if attn_bias is not None:
+         if isinstance(attn_bias, AttentionBias):
+             # Always create in B,H,Mq,Mk format
+             attn_bias_tensor = attn_bias.materialize(
+                 (q.shape[0], 1, q.shape[1], k.shape[1]),
+                 device=q.device,
+                 dtype=torch.float32,
+             )
+         else:
+             attn_bias_tensor = attn_bias
+         if attn_bias_tensor.ndim == 4:
+             assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+             attn_bias_tensor = attn_bias_tensor.reshape(
+                 [-1, *attn_bias_tensor.shape[2:]]
+             )
+         attn = attn + attn_bias_tensor.float()
+     attn = attn.softmax(-1)
+     if drop_mask is not None:
+         attn = attn * (drop_mask / (1 - p))
+     return attn @ v
+
+
+ def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+     assert q.ndim == 4
+
+     def T(t):
+         return t.permute((0, 2, 1, 3)).reshape(
+             [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+         )
+
+     if isinstance(attn_bias, AttentionBias):
+         attn_bias = attn_bias.materialize(
+             (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+             device=q.device,
+             dtype=torch.float32,
+         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+     out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+     return out.permute((0, 2, 1, 3))
+
+
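For plain BMHK inputs with no bias, ref_attention reduces to softmax(q k^T / sqrt(d)) v, so it should agree with PyTorch's scaled_dot_product_attention up to floating-point noise. A comparison sketch, not part of the packaged file (import path assumed; SDPA expects a B, H, M, K layout, hence the transposes):

import torch

from mslk.attention.fmha.attn_bias_utils import ref_attention  # assumed module path

q, k, v = (torch.randn(2, 64, 4, 32) for _ in range(3))  # B, M, H, K
out_ref = ref_attention(q, k, v)
out_sdpa = torch.nn.functional.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
).transpose(1, 2)
torch.testing.assert_close(out_ref, out_sdpa, atol=1e-4, rtol=1e-4)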
+ def pack_kv_cache(
+     cache_k: torch.Tensor,
+     cache_v: torch.Tensor,
+     kv_seqlens: List[int],
+     BLOCK_N: int,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Create block tables and a paged K/V cache for testing paged attention.
+     Args:
+         cache_k, cache_v: K/V caches, each of shape [B, MAX_T, H_kv, D].
+             Note that these tensors are unexpanded,
+             i.e. for the multiquery case cache_k.shape[2] = 1
+         kv_seqlens: list of K/V sequence lengths
+         BLOCK_N: number of tokens per paged attention block
+         B: batch size
+     Returns:
+         block_tables: [B, MAX_BLOCKS]
+         packed_cache_k: [1, total_len_rounded, H_kv, D]
+         packed_cache_v: [1, total_len_rounded, H_kv, D]
+         where total_len_rounded is a sum of K/V seqlens, each rounded up
+         to a multiple of BLOCK_N.
+     """
+
+     kv_seqlens_rounded = [(x + BLOCK_N - 1) // BLOCK_N * BLOCK_N for x in kv_seqlens]
+
+     total_len_rounded = sum(kv_seqlens_rounded)
+
+     B, MAX_T, H, D = cache_k.shape
+
+     packed_cache_k = torch.empty(
+         total_len_rounded, H, D, device=cache_k.device, dtype=cache_k.dtype
+     )
+     packed_cache_v = torch.empty(
+         total_len_rounded, H, D, device=cache_k.device, dtype=cache_k.dtype
+     )
+     seqstart = 0
+     for b in range(B):
+         packed_cache_k[seqstart : seqstart + kv_seqlens[b]] = cache_k[
+             b, : kv_seqlens[b]
+         ].clone()
+         packed_cache_v[seqstart : seqstart + kv_seqlens[b]] = cache_v[
+             b, : kv_seqlens[b]
+         ].clone()
+         seqstart += kv_seqlens_rounded[b]
+
+     num_blocks_per_row = (MAX_T + BLOCK_N - 1) // BLOCK_N
+     block_tables = (
+         torch.arange(num_blocks_per_row, device="cuda", dtype=torch.int32)
+         .unsqueeze(0)
+         .expand(B, num_blocks_per_row)
+     )
+     seqstarts = (
+         (
+             torch.tensor(kv_seqlens_rounded).cumsum(dim=0)
+             - torch.tensor(kv_seqlens_rounded)
+         )
+         .to(device="cuda")
+         .unsqueeze(1)
+     ) // BLOCK_N
+     block_tables = (block_tables + seqstarts).contiguous().to(dtype=torch.int32)
+     return (
+         block_tables,
+         packed_cache_k.unsqueeze(0),
+         packed_cache_v.unsqueeze(0),
+     )
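To make the shapes in the docstring concrete, a small usage sketch (not part of the packaged file; import path assumed; a CUDA device is needed because block_tables is built on "cuda"):

import torch

from mslk.attention.fmha.attn_bias_utils import pack_kv_cache  # assumed module path

B, MAX_T, H_kv, D, BLOCK_N = 2, 32, 1, 16, 16
cache_k = torch.randn(B, MAX_T, H_kv, D, device="cuda")
cache_v = torch.randn(B, MAX_T, H_kv, D, device="cuda")
kv_seqlens = [20, 9]  # rounded up per row to 32 and 16, so 48 packed slots in total

block_tables, packed_k, packed_v = pack_kv_cache(cache_k, cache_v, kv_seqlens, BLOCK_N)
assert packed_k.shape == (1, 48, H_kv, D)
assert packed_v.shape == (1, 48, H_kv, D)
assert block_tables.shape == (B, (MAX_T + BLOCK_N - 1) // BLOCK_N)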