mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/flash_attn/block_sparsity.py
@@ -0,0 +1,219 @@
+ # @nolint # fbcode
+ """
+ Block-sparsity utilities for FlexAttention
+ """
+
+ from typing import Callable, NamedTuple, Tuple
+
+ import cutlass.cute as cute
+ import torch
+
+ from mslk.attention.flash_attn.cute_dsl_utils import to_cute_tensor
+
+
+ def ceildiv(a: int, b: int) -> int:
+     return (a + b - 1) // b
+
+
+ class BlockSparseTensors(NamedTuple):
+     mask_block_cnt: cute.Tensor
+     mask_block_idx: cute.Tensor
+     full_block_cnt: cute.Tensor | None
+     full_block_idx: cute.Tensor | None
+
+     def __new_from_mlir_values__(self, values):
+         if len(values) == 2:
+             values = (*values, None, None)
+         return BlockSparseTensors(*values)
+
+
+ class BlockSparseTensorsTorch(NamedTuple):
+     mask_block_cnt: torch.Tensor
+     mask_block_idx: torch.Tensor
+     full_block_cnt: torch.Tensor | None = None
+     full_block_idx: torch.Tensor | None = None
+
+
+ def _expand_sparsity_tensor(
+     tensor: torch.Tensor,
+     expected_shape: Tuple[int, ...],
+     tensor_name: str,
+     context: str | None,
+     hint: str | Callable[[], str] | None,
+ ) -> torch.Tensor:
+     """Check if we need to expand the tensor to expected shape, and do so if possible."""
+     needs_expand = tensor.shape != expected_shape
+     if not needs_expand:
+         return tensor
+     can_expand = all(map(lambda cur, tgt: cur == tgt or cur == 1, tensor.shape, expected_shape))
+     if not can_expand:
+         context_clause = f" ({context})" if context else ""
+         resolved_hint = hint() if callable(hint) else hint
+         hint_clause = f" Hint: {resolved_hint}" if resolved_hint else ""
+         raise ValueError(
+             f"{tensor_name}{context_clause} with shape {tensor.shape} cannot be expanded to expected shape {expected_shape}."
+             f"{hint_clause}"
+         )
+     return tensor.expand(*expected_shape)
+
+
+ def _check_and_expand_block(
+     name: str,
+     cnt: torch.Tensor | None,
+     idx: torch.Tensor | None,
+     expected_count_shape: Tuple[int, int, int],
+     expected_index_shape: Tuple[int, int, int, int],
+     context: str | None,
+     hint: str | Callable[[], str] | None,
+ ) -> Tuple[torch.Tensor | None, torch.Tensor | None]:
+     if (cnt is None) != (idx is None):
+         raise ValueError(
+             f"{name}_block_cnt and {name}_block_idx must both be provided or both be None"
+         )
+     if cnt is None or idx is None:
+         return None, None
+     if cnt.dtype != torch.int32 or idx.dtype != torch.int32:
+         raise ValueError(f"{name}_block tensors must have dtype torch.int32")
+     if cnt.device != idx.device:
+         raise ValueError(f"{name}_block_cnt and {name}_block_idx must be on the same device")
+     if not cnt.is_cuda or not idx.is_cuda:
+         raise ValueError(f"{name}_block tensors must live on CUDA")
+     expanded_cnt = _expand_sparsity_tensor(
+         cnt, expected_count_shape, f"{name}_block_cnt", context, hint
+     )
+     expanded_idx = _expand_sparsity_tensor(
+         idx, expected_index_shape, f"{name}_block_idx", context, hint
+     )
+     return expanded_cnt, expanded_idx
+
+
+ def get_block_sparse_expected_shapes(
+     batch_size: int,
+     num_head: int,
+     seqlen_q: int,
+     seqlen_k: int,
+     m_block_size: int,
+     n_block_size: int,
+     q_stage: int,
+ ) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int]]:
+     """Return (expected_count_shape, expected_index_shape) for block sparse normalization."""
+     m_block_size_effective = q_stage * m_block_size
+     expected_m_blocks = ceildiv(seqlen_q, m_block_size_effective)
+     expected_n_blocks = ceildiv(seqlen_k, n_block_size)
+     expected_count_shape = (batch_size, num_head, expected_m_blocks)
+     expected_index_shape = (batch_size, num_head, expected_m_blocks, expected_n_blocks)
+     return expected_count_shape, expected_index_shape
+
+
+ def get_block_sparse_expected_shapes_bwd(
+     batch_size: int,
+     num_head: int,
+     seqlen_q: int,
+     seqlen_k: int,
+     m_block_size: int,
+     n_block_size: int,
+     subtile_factor: int,
+ ) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int]]:
+     """Return (expected_count_shape, expected_index_shape) for backward block sparse normalization.
+
+     Backward uses Q-direction indexing (transposed from forward), where shapes are
+     indexed by N-blocks first, then M-blocks. The sparse_block_size_q is determined
+     by subtile_factor * m_block_size.
+     """
+     sparse_block_size_q = subtile_factor * m_block_size
+     expected_m_blocks = ceildiv(seqlen_q, sparse_block_size_q)
+     expected_n_blocks = ceildiv(seqlen_k, n_block_size)
+     expected_count_shape = (batch_size, num_head, expected_n_blocks)
+     expected_index_shape = (batch_size, num_head, expected_n_blocks, expected_m_blocks)
+     return expected_count_shape, expected_index_shape
+
+
+ def normalize_block_sparse_tensors(
+     tensors: BlockSparseTensorsTorch,
+     *,
+     expected_count_shape: Tuple[int, int, int],
+     expected_index_shape: Tuple[int, int, int, int],
+     context: str | None = None,
+     hint: str | Callable[[], str] | None = None,
+ ) -> BlockSparseTensorsTorch:
+     if tensors.mask_block_cnt is None or tensors.mask_block_idx is None:
+         raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")
+
+     mask_cnt, mask_idx = _check_and_expand_block(
+         "mask",
+         tensors.mask_block_cnt,
+         tensors.mask_block_idx,
+         expected_count_shape,
+         expected_index_shape,
+         context,
+         hint,
+     )
+     if mask_cnt is None or mask_idx is None:
+         raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")
+
+     full_cnt, full_idx = _check_and_expand_block(
+         "full",
+         tensors.full_block_cnt,
+         tensors.full_block_idx,
+         expected_count_shape,
+         expected_index_shape,
+         context,
+         hint,
+     )
+     if full_cnt is not None and mask_cnt.device != full_cnt.device:
+         raise ValueError("All block sparse tensors must be on the same device")
+
+     return BlockSparseTensorsTorch(
+         mask_block_cnt=mask_cnt,
+         mask_block_idx=mask_idx,
+         full_block_cnt=full_cnt,
+         full_block_idx=full_idx,
+     )
+
+
+ def is_block_sparsity_enabled(tensors: BlockSparseTensorsTorch) -> bool:
+     return any(t is not None for t in (tensors.full_block_cnt, tensors.mask_block_cnt))
+
+
+ def to_cute_block_sparse_tensors(
+     tensors: BlockSparseTensorsTorch, enable_tvm_ffi: bool = True
+ ) -> BlockSparseTensors | None:
+     """Convert torch block sparsity tensors to CuTe tensors, optionally for tvm ffi"""
+     if not is_block_sparsity_enabled(tensors):
+         return None
+     (
+         mask_block_cnt,
+         mask_block_idx,
+         full_block_cnt,
+         full_block_idx,
+     ) = tensors
+
+     (
+         mask_block_cnt_tensor,
+         mask_block_idx_tensor,
+     ) = [
+         to_cute_tensor(t, assumed_align=4, leading_dim=-1, enable_tvm_ffi=enable_tvm_ffi)
+         for t in (mask_block_cnt, mask_block_idx)
+     ]
+     (
+         full_block_cnt_tensor,
+         full_block_idx_tensor,
+     ) = [
+         to_cute_tensor(t, assumed_align=4, leading_dim=-1, enable_tvm_ffi=enable_tvm_ffi)
+         if t is not None
+         else None
+         for t in (full_block_cnt, full_block_idx)
+     ]
+
+     return BlockSparseTensors(
+         mask_block_cnt_tensor,
+         mask_block_idx_tensor,
+         full_block_cnt_tensor,
+         full_block_idx_tensor,
+     )
+
+
+ def fast_sampling(mask_mod):
+     """Convenience decorator to mark mask_mod as safe for 5-point fast sampling"""
+     mask_mod.use_fast_sampling = True
+     return mask_mod
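
The helpers in this file compose as follows. This is an illustrative sketch, not code shipped in the wheel: the causal mask_mod body, the sizes, and the broadcast pattern are assumptions chosen to exercise fast_sampling, get_block_sparse_expected_shapes, and normalize_block_sparse_tensors (inside the compiled kernels the same mask_mod signature receives cute-dsl SSA values plus a seqlen info object and optional aux_tensors).

import torch

from mslk.attention.flash_attn.block_sparsity import (
    BlockSparseTensorsTorch,
    fast_sampling,
    get_block_sparse_expected_shapes,
    normalize_block_sparse_tensors,
)


# Hypothetical mask_mod: causal masking. Marked safe for 5-point sampling because
# a causal mask is monotone within a tile, so the four corners plus the center
# are enough to tell full, partial, and empty blocks apart.
@fast_sampling
def causal_mask(b, h, q_idx, kv_idx, seqlen, aux_tensors):
    return q_idx >= kv_idx


# Expected metadata shapes for illustrative sizes: batch=2, heads=8,
# seqlen_q=seqlen_k=4096, 128x128 tiles, q_stage=1.
cnt_shape, idx_shape = get_block_sparse_expected_shapes(
    batch_size=2, num_head=8, seqlen_q=4096, seqlen_k=4096,
    m_block_size=128, n_block_size=128, q_stage=1,
)  # -> (2, 8, 32) and (2, 8, 32, 32)

# Per-head metadata with a head dimension of 1 is broadcast up to the expected
# shapes by normalize_block_sparse_tensors; shapes that cannot broadcast are rejected.
shared_across_heads = BlockSparseTensorsTorch(
    mask_block_cnt=torch.zeros((2, 1, 32), dtype=torch.int32, device="cuda"),
    mask_block_idx=torch.zeros((2, 1, 32, 32), dtype=torch.int32, device="cuda"),
)
normalized = normalize_block_sparse_tensors(
    shared_across_heads,
    expected_count_shape=cnt_shape,
    expected_index_shape=idx_shape,
)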
mslk/attention/flash_attn/compute_block_sparsity.py
@@ -0,0 +1,378 @@
+ # @nolint # fbcode
+ from functools import partial
+ from typing import Callable, Optional, Tuple
+
+ import cutlass
+ import cutlass.cute as cute
+ import torch
+ from cutlass import Boolean, Int8, Int32, const_expr
+
+ from mslk.attention.flash_attn.block_sparsity import (
+     BlockSparseTensors,
+     BlockSparseTensorsTorch,
+     to_cute_block_sparse_tensors,
+ )
+ from mslk.attention.flash_attn.utils import hash_callable, scalar_to_ssa, ssa_to_scalar
+ from mslk.attention.flash_attn.seqlen_info import SeqlenInfoQK
+
+
+ class BlockSparsityKernel:
+     """Block sparsity kernel for FlexAttention.
+
+     This kernel computes `mask_mod` for every token of each block
+     to determine if an n block is full, masked, or neither.
+
+     Writes block counts and indices to a BlockSparseTensors object.
+
+     When use_fast_sampling=True, uses 5-point sampling (4 corners + center)
+     which is much faster but only suitable for masks where this is sufficient.
+
+     TODO:
+     - optimize mask_mod evaluation
+     - varlen support
+     - transposed tensors for bwd pass
+     """
+
+     def __init__(
+         self,
+         mask_mod: Callable,
+         tile_mn: Tuple[int, int],
+         compute_full_blocks: bool = True,
+         use_aux_tensors: bool = False,
+         use_fast_sampling: bool = False,
+     ):
+         self.mask_mod = mask_mod
+         self.tile_mn = tile_mn
+         self.compute_full_blocks = compute_full_blocks
+         self.use_aux_tensors = use_aux_tensors
+         self.use_fast_sampling = use_fast_sampling
+
+     @cute.jit
+     def __call__(
+         self,
+         blocksparse_tensors: BlockSparseTensors,
+         seqlen_q: Int32,
+         seqlen_k: Int32,
+         aux_tensors: Optional[list] = None,
+     ):
+         self.mask_cnt, self.mask_idx, self.full_cnt, self.full_idx = blocksparse_tensors
+
+         if const_expr(self.compute_full_blocks):
+             assert self.full_cnt is not None and self.full_idx is not None, (
+                 "full block tensors must be provided when computing full blocks"
+             )
+
+         batch_size, num_heads, num_m_blocks, num_n_blocks = self.mask_idx.shape
+         # launch 1 CTA per m block
+         grid = [num_m_blocks, num_heads, batch_size]
+
+         if const_expr(self.use_fast_sampling):
+             num_threads = 5
+             self.num_warps = 1
+         else:
+             num_threads = self.tile_mn[0]
+             self.num_warps = (num_threads + 32 - 1) // 32
+
+         self.kernel(
+             self.mask_cnt,
+             self.mask_idx,
+             self.full_cnt,
+             self.full_idx,
+             num_n_blocks,
+             seqlen_q,
+             seqlen_k,
+             aux_tensors,
+         ).launch(grid=grid, block=[num_threads, 1, 1])
+
+     @cute.kernel
+     def kernel(
+         self,
+         mask_cnt: cute.Tensor,
+         mask_idx: cute.Tensor,
+         full_cnt: cute.Tensor,
+         full_idx: cute.Tensor,
+         num_n_blocks: Int32,
+         seqlen_q: Int32,
+         seqlen_k: Int32,
+         aux_tensors: Optional[list] = None,
+     ):
+         tidx, _, _ = cute.arch.thread_idx()
+         warp_idx = cute.arch.warp_idx()
+         lane_id = cute.arch.lane_idx()
+         m_block, head_idx, batch_idx = cute.arch.block_idx()
+
+         ssa = partial(scalar_to_ssa, dtype=Int32)
+
+         seqlen = SeqlenInfoQK.create(
+             batch_idx,
+             seqlen_q,
+             seqlen_k,
+             mCuSeqlensQ=None,
+             mCuSeqlensK=None,
+             mSeqUsedQ=None,
+             mSeqUsedK=None,
+         )
+
+         @cute.struct
+         class SharedStorage:
+             reduction_buffer_smem: cute.struct.Align[
+                 cute.struct.MemRange[cutlass.Int8, 2 * self.num_warps], 1024
+             ]
+
+         smem = cutlass.utils.SmemAllocator()
+         storage = smem.allocate(SharedStorage, 16)
+
+         reduction_buffer = storage.reduction_buffer_smem.get_tensor(
+             cute.make_layout((self.num_warps, 2))
+         )
+
+         num_mask_blocks = Int32(0)
+         num_full_blocks = Int32(0)
+
+         for n_block in cutlass.range(num_n_blocks, unroll_full=True):
+             m_base = m_block * self.tile_mn[0]
+             n_base = n_block * self.tile_mn[1]
+
+             if const_expr(self.use_fast_sampling):
+                 # Fast path: 5-point sampling (4 corners + center)
+                 # Clamps OOB indices to nearest in bounds.
+                 thread_result = Boolean(False)
+                 thread_is_valid = Boolean(False)
+                 q_idx = Int32(0)
+                 kv_idx = Int32(0)
+
+                 if tidx == 0:
+                     # Top-left corner (0, 0); always in bounds
+                     q_idx = m_base
+                     kv_idx = n_base
+                 elif tidx == 1:
+                     # Top-right corner
+                     q_idx = m_base
+                     kv_idx = cutlass.min(n_base + self.tile_mn[1] - 1, seqlen_k - 1)
+                 elif tidx == 2:
+                     # Bottom-left corner
+                     q_idx = cutlass.min(m_base + self.tile_mn[0] - 1, seqlen_q - 1)
+                     kv_idx = n_base
+                 elif tidx == 3:
+                     # Bottom-right corner
+                     q_idx = cutlass.min(m_base + self.tile_mn[0] - 1, seqlen_q - 1)
+                     kv_idx = cutlass.min(n_base + self.tile_mn[1] - 1, seqlen_k - 1)
+                 elif tidx == 4:
+                     # Center point
+                     q_idx = m_base + (cutlass.min(seqlen_q - m_base, self.tile_mn[0])) // 2
+                     kv_idx = n_base + (cutlass.min(seqlen_k - n_base, self.tile_mn[1])) // 2
+                 else:
+                     thread_is_valid = Boolean(False)
+
+                 # Check bounds and determine if this thread has a valid index pair
+                 if tidx < 5 and q_idx < seqlen_q and kv_idx < seqlen_k:
+                     thread_is_valid = Boolean(True)
+                     q_idx_ssa = ssa(q_idx)
+                     kv_idx_ssa = ssa(kv_idx)
+                     thread_result = ssa_to_scalar(
+                         self.mask_mod(
+                             ssa(batch_idx),
+                             ssa(head_idx),
+                             q_idx_ssa,
+                             kv_idx_ssa,
+                             seqlen,
+                             aux_tensors,
+                         )
+                     )
+                 else:
+                     thread_is_valid = Boolean(False)
+
+                 # Use vote_any_sync to see if any valid thread found unmasked or masked
+                 # Only count results from threads that checked valid indices
+                 has_unmasked = cute.arch.vote_any_sync(thread_result & thread_is_valid)
+                 has_masked = cute.arch.vote_any_sync((Boolean(not thread_result)) & thread_is_valid)
+
+             else:
+                 # Full path: check all elements in the block
+                 # Track if this thread's row has any masked or unmasked elements
+                 thread_has_unmasked = Boolean(False)
+                 thread_has_masked = Boolean(False)
+                 thread_is_valid = Boolean(False)
+
+                 # Each thread handles 1 row
+                 q_idx = m_base + tidx
+                 kv_idx = Int32(0)
+                 if tidx < self.tile_mn[0] and q_idx < seqlen_q:
+                     thread_is_valid = Boolean(True)
+                     q_idx_ssa = ssa(q_idx)
+
+                     # Loop over all columns in this row
+                     for c in cutlass.range(self.tile_mn[1], unroll_full=True):
+                         kv_idx = n_base + c
+                         kv_idx_ssa = ssa(kv_idx)
+
+                         # Only check elements within valid sequence bounds
+                         if kv_idx < seqlen_k:
+                             # Direct scalar call
+                             mask_val = ssa_to_scalar(
+                                 self.mask_mod(
+                                     ssa(batch_idx),
+                                     ssa(head_idx),
+                                     q_idx_ssa,
+                                     kv_idx_ssa,
+                                     seqlen,
+                                     aux_tensors,
+                                 )
+                             )
+
+                             # Update tracking flags
+                             if mask_val:
+                                 thread_has_unmasked = Boolean(True)
+                             else:
+                                 thread_has_masked = Boolean(True)
+
+                 # Block-level reduction to combine results across all threads
+                 # Only count votes from threads that checked valid indices
+                 warp_has_unmasked_mask = cute.arch.vote_any_sync(
+                     thread_has_unmasked & thread_is_valid
+                 )
+                 warp_has_masked_mask = cute.arch.vote_any_sync(thread_has_masked & thread_is_valid)
+
+                 # lane 0 writes the ballot mask to shared memory
+                 lane_id = tidx % 32
+                 if lane_id == 0:
+                     # Store as Int8
+                     reduction_buffer[warp_idx, 0] = Int8(1) if warp_has_unmasked_mask else Int8(0)
+                     reduction_buffer[warp_idx, 1] = Int8(1) if warp_has_masked_mask else Int8(0)
+
+                 cute.arch.sync_threads()
+
+                 # Thread 0 ORs all warp results together
+                 has_unmasked = Boolean(False)
+                 has_masked = Boolean(False)
+                 if tidx == 0:
+                     for w in cutlass.range(self.num_warps):
+                         if reduction_buffer[w, 0]:
+                             has_unmasked = Boolean(True)
+                         if reduction_buffer[w, 1]:
+                             has_masked = Boolean(True)
+
+             # Only thread 0 updates the output arrays (common to both paths)
+             if tidx == 0:
+                 # Block classification based on what we found:
+                 # - If has_masked and has_unmasked: partial block (needs masking)
+                 # - If only has_unmasked: full block (no masking needed)
+                 # - If only has_masked: skip this block entirely
+                 is_partial = Boolean(has_masked and has_unmasked)
+                 is_full = Boolean(has_unmasked and (not has_masked))
+
+                 if is_partial:
+                     mask_idx[batch_idx, head_idx, m_block, num_mask_blocks] = n_block
+                     num_mask_blocks += 1
+                 elif is_full and const_expr(self.compute_full_blocks):
+                     full_idx[batch_idx, head_idx, m_block, num_full_blocks] = n_block
+                     num_full_blocks += 1
+
+         # Only thread 0 writes back the counts
+         if tidx == 0:
+             mask_cnt[batch_idx, head_idx, m_block] = num_mask_blocks
+             if const_expr(self.compute_full_blocks):
+                 full_cnt[batch_idx, head_idx, m_block] = num_full_blocks
+
+
+ def compute_block_sparsity(
+     tile_m,
+     tile_n,
+     batch_size,
+     num_heads,
+     seqlen_q,
+     seqlen_k,
+     mask_mod: Callable,
+     aux_tensors: Optional[list],  # list[cute.Tensor]
+     device,
+     compute_full_blocks: bool = True,
+     use_fast_sampling: bool = False,
+ ) -> Tuple[BlockSparseTensors, BlockSparseTensorsTorch]:
+     """
+     Computes block sparsity for a given `mask_mod`.
+
+     Args:
+         tile_m: The tile size for the m dimension.
+         tile_n: The tile size for the n dimension.
+         batch_size: The batch size.
+         num_heads: The number of heads.
+         seqlen_q: The sequence length for the query.
+         seqlen_k: The sequence length for the key.
+         mask_mod: The `mask_mod` callable to use.
+         aux_tensors: A list of auxiliary tensors.
+         device: The device to use.
+         compute_full_blocks: Whether to compute full blocks. If False, only partially-masked blocks are computed.
+         use_fast_sampling: Whether to use 5-point sampling (4 corners + center). This is much faster, but only suitable for masks where this check is sufficient.
+
+     Returns:
+         A tuple of `BlockSparseTensors` and `BlockSparseTensorsTorch`.
+     """
+     # Check if mask_mod is marked as suitable for 5-point fast sampling
+     use_fast_sampling = getattr(mask_mod, "use_fast_sampling", use_fast_sampling)
+
+     num_m_blocks = (seqlen_q + tile_m - 1) // tile_m
+     num_n_blocks = (seqlen_k + tile_n - 1) // tile_n
+
+     mask_block_cnt = torch.zeros(
+         (batch_size, num_heads, num_m_blocks), device=device, dtype=torch.int32
+     )
+     mask_block_idx = torch.zeros(
+         (batch_size, num_heads, num_m_blocks, num_n_blocks), device=device, dtype=torch.int32
+     )
+     full_block_cnt = (
+         torch.zeros((batch_size, num_heads, num_m_blocks), device=device, dtype=torch.int32)
+         if compute_full_blocks
+         else None
+     )
+     full_block_idx = (
+         torch.zeros(
+             (batch_size, num_heads, num_m_blocks, num_n_blocks), device=device, dtype=torch.int32
+         )
+         if compute_full_blocks
+         else None
+     )
+
+     blocksparse_tensors_torch = BlockSparseTensorsTorch(
+         mask_block_cnt=mask_block_cnt,
+         mask_block_idx=mask_block_idx,
+         full_block_cnt=full_block_cnt,
+         full_block_idx=full_block_idx,
+     )
+
+     mask_mod_hash = hash_callable(mask_mod)
+     blocksparse_tensors = to_cute_block_sparse_tensors(
+         blocksparse_tensors_torch, enable_tvm_ffi=True
+     )
+
+     compile_key = (
+         tile_m,
+         tile_n,
+         mask_mod_hash,
+         compute_full_blocks,
+         aux_tensors is not None,
+         use_fast_sampling,
+     )
+     if compile_key not in compute_block_sparsity.compile_cache:
+         kernel = BlockSparsityKernel(
+             mask_mod,
+             tile_mn=(tile_m, tile_n),
+             compute_full_blocks=compute_full_blocks,
+             use_aux_tensors=aux_tensors is not None,
+             use_fast_sampling=use_fast_sampling,
+         )
+
+         compute_block_sparsity.compile_cache[compile_key] = cute.compile(
+             kernel, blocksparse_tensors, seqlen_q, seqlen_k, aux_tensors, options="--enable-tvm-ffi"
+         )
+
+     compute_block_sparsity.compile_cache[compile_key](
+         blocksparse_tensors_torch,
+         seqlen_q,
+         seqlen_k,
+         aux_tensors,
+     )
+
+     return blocksparse_tensors, blocksparse_tensors_torch
+
+
+ compute_block_sparsity.compile_cache = {}
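
End to end, callers drive the kernel through compute_block_sparsity. The sketch below is illustrative and not part of the packaged module; it reuses the hypothetical causal_mask from the earlier example and the same assumed sizes.

from mslk.attention.flash_attn.compute_block_sparsity import compute_block_sparsity

blocksparse_cute, blocksparse_torch = compute_block_sparsity(
    tile_m=128,
    tile_n=128,
    batch_size=2,
    num_heads=8,
    seqlen_q=4096,
    seqlen_k=4096,
    mask_mod=causal_mask,      # hypothetical @fast_sampling-tagged mask_mod
    aux_tensors=None,
    device="cuda",
    compute_full_blocks=True,  # also record fully-unmasked blocks
    use_fast_sampling=False,   # overridden to True because causal_mask is tagged
)

# blocksparse_torch.mask_block_cnt[b, h, m] is the number of partially-masked
# n-blocks for row-block m, and mask_block_idx[b, h, m, :cnt] lists their indices;
# full_block_cnt / full_block_idx describe blocks that need no masking at all.

Because the compiled kernel is cached on (tile_m, tile_n, mask_mod hash, compute_full_blocks, aux_tensors presence, use_fast_sampling), repeated calls with the same mask and tiling reuse the cute.compile result and only relaunch the kernel.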