mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/flash_attn/block_sparse_utils.py
@@ -0,0 +1,1452 @@
1
+ # @nolint # fbcode
2
+ """
3
+ Block-sparse runtime utilities for CUTE DSL kernels.
4
+
5
+ This module contains runtime execution functions for block-sparse attention kernels.
6
+ These utilities are used by CUTE DSL kernels to produce and consume block-sparse loads.
7
+ """
8
+
9
+ from typing import Callable, Optional
10
+ from functools import partial
11
+ import math
12
+ import cutlass
13
+ import cutlass.cute as cute
14
+ from cutlass import Float32, Int32, const_expr
15
+
16
+ # Import data structures from block_sparsity
17
+ from mslk.attention.flash_attn.block_sparsity import BlockSparseTensors
18
+ from mslk.attention.flash_attn import utils
19
+ from mslk.attention.flash_attn import copy_utils
20
+ from mslk.attention.flash_attn.named_barrier import NamedBarrierBwd
21
+
22
+
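Every helper below indexes the block-sparse metadata the same way, so a minimal host-side sketch of the assumed layout may help. The plain nested lists and the enumerate_kv_blocks helper here are illustrative assumptions for this page, not the packaged BlockSparseTensors definition.

# Illustrative sketch only (assumed layout, not the packaged definition):
#   mask_block_cnt[b, h, m]      -> number of partially-masked KV blocks for Q tile m
#   mask_block_idx[b, h, m, :]   -> their n_block indices (padded; first cnt entries valid)
#   full_block_cnt / full_block_idx -> same layout for fully-unmasked KV blocks (may be None)
def enumerate_kv_blocks(blocksparse_tensors, b, h, m):
    """Yield (n_block, is_full) in the reverse order the producer loops use."""
    mask_cnt, mask_idx, full_cnt, full_idx = blocksparse_tensors
    for i in range(mask_cnt[b][h][m]):                    # masked (partial) blocks first
        yield mask_idx[b][h][m][mask_cnt[b][h][m] - 1 - i], False
    if full_cnt is not None:
        for i in range(full_cnt[b][h][m]):                # then fully-unmasked blocks
            yield full_idx[b][h][m][full_cnt[b][h][m] - 1 - i], True

# One batch, one head, one Q tile with one masked KV block {3} and two full blocks {1, 2}.
bst = ([[[1]]], [[[[3, 0, 0]]]], [[[2]]], [[[[1, 2, 0]]]])
assert list(enumerate_kv_blocks(bst, 0, 0, 0)) == [(3, False), (2, True), (1, True)]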
23
+ @cute.jit
24
+ def load_block_list(
25
+ block_indices: cute.Tensor,
26
+ block_count,
27
+ load_q_with_first: cutlass.Constexpr,
28
+ first_block_preloaded: cutlass.Constexpr,
29
+ kv_producer_state,
30
+ load_Q,
31
+ load_K,
32
+ load_V,
33
+ pipeline_k,
34
+ pipeline_v,
35
+ use_tma_q: cutlass.Constexpr,
36
+ tma_q_bytes: cutlass.Constexpr,
37
+ intra_wg_overlap: cutlass.Constexpr,
38
+ ):
39
+ """Iterate over the sparse blocks and load K, V (and Q) into the pipeline.
40
+ for the intra_wg_overlap case, we overlap the loads of K and V. And this
41
+ means we need to pipeline the last V load from the partial block case,
42
+ with the loads for the full blocks. Set first_block_preloaded when the
43
+ caller has already issued the first K load for the list.
44
+
45
+ Note:
46
+ we iterate along the block_n indices in reverse.
47
+
48
+ Returns:
49
+ Updated kv_producer_state after processing the block list.
50
+
51
+ """
52
+ if block_count > 0:
53
+ if const_expr(not intra_wg_overlap):
54
+ # Peel the first iteration: the first block may need to load Q alongside K.
55
+ # Parameters are already Constexpr, so no need to wrap in const_expr().
56
+ n_block_first = block_indices[block_count - 1]
57
+ extra_tx = tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
58
+ pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)
59
+
60
+ if const_expr(load_q_with_first and use_tma_q):
61
+ load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))
62
+
63
+ load_K(src_idx=n_block_first, producer_state=kv_producer_state)
64
+ pipeline_v.producer_acquire(kv_producer_state)
65
+ load_V(src_idx=n_block_first, producer_state=kv_producer_state)
66
+ kv_producer_state.advance()
67
+
68
+ for offset in cutlass.range(1, block_count):
69
+ n_block = block_indices[block_count - 1 - offset]
70
+ pipeline_k.producer_acquire(kv_producer_state)
71
+ load_K(src_idx=n_block, producer_state=kv_producer_state)
72
+ pipeline_v.producer_acquire(kv_producer_state)
73
+ load_V(src_idx=n_block, producer_state=kv_producer_state)
74
+ kv_producer_state.advance()
75
+ else:
76
+ n_block_first = block_indices[block_count - 1]
77
+ if const_expr(not first_block_preloaded):
78
+ extra_tx = (
79
+ tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
80
+ )
81
+ pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)
82
+
83
+ if const_expr(load_q_with_first and use_tma_q):
84
+ load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))
85
+
86
+ load_K(src_idx=n_block_first, producer_state=kv_producer_state)
87
+
88
+ for idx in cutlass.range(block_count - 1, unroll=1):
89
+ n_block_prev = block_indices[block_count - 1 - idx]
90
+ n_block = block_indices[block_count - 2 - idx]
91
+ kv_producer_state_prev = kv_producer_state.clone()
92
+ kv_producer_state.advance()
93
+ pipeline_k.producer_acquire(kv_producer_state)
94
+ load_K(src_idx=n_block, producer_state=kv_producer_state)
95
+ pipeline_v.producer_acquire(kv_producer_state_prev)
96
+ load_V(src_idx=n_block_prev, producer_state=kv_producer_state_prev)
97
+
98
+ return kv_producer_state
99
+
100
+
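To make the two schedules concrete, here is a plain-Python sketch of the issue order (an editorial illustration, not code from the wheel): without overlap each block acquires and loads K then V on the same pipeline stage; with intra-warp-group overlap the K load for the next block is issued before the V load of the previous one, leaving the final V to be drained separately by finish_overlap_v_load below.

def issue_order(block_indices, intra_wg_overlap):
    """Order in which one block list issues its K/V loads (illustrative sketch)."""
    blocks = list(reversed(block_indices))          # producers walk the list in reverse
    if not blocks:
        return []
    if not intra_wg_overlap:
        return [(kv, n) for n in blocks for kv in ("K", "V")]
    order = [("K", blocks[0])]                      # first K (Q may ride along via TMA)
    for prev, nxt in zip(blocks, blocks[1:]):
        order += [("K", nxt), ("V", prev)]          # overlap the next K with the previous V
    order.append(("V", blocks[-1]))                 # drained later by finish_overlap_v_load
    return order

assert issue_order([5, 9, 12], intra_wg_overlap=False) == [
    ("K", 12), ("V", 12), ("K", 9), ("V", 9), ("K", 5), ("V", 5)]
assert issue_order([5, 9, 12], intra_wg_overlap=True) == [
    ("K", 12), ("K", 9), ("V", 12), ("K", 5), ("V", 9), ("V", 5)]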
101
+ @cute.jit
102
+ def finish_overlap_v_load(
103
+ block_indices: cute.Tensor,
104
+ block_count,
105
+ load_V,
106
+ pipeline_v,
107
+ kv_producer_state,
108
+ ):
109
+ """Load the final V block after overlapped K/V loads."""
110
+ if block_count > 0:
111
+ n_block_last = block_indices[0]
112
+ pipeline_v.producer_acquire(kv_producer_state)
113
+ load_V(src_idx=n_block_last, producer_state=kv_producer_state)
114
+ kv_producer_state.advance()
115
+
116
+ return kv_producer_state
117
+
118
+
119
+ @cute.jit
120
+ def sparse_tensor_m_block(
121
+ m_block,
122
+ qhead_per_kvhead: cutlass.Constexpr[int],
123
+ ):
124
+ """Map packed m_block indices to block-sparse tensor indices."""
125
+ if const_expr(qhead_per_kvhead != 1):
126
+ return m_block // qhead_per_kvhead
127
+ return m_block
128
+
129
+
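For illustration (assuming qhead_per_kvhead = 4), consecutive packed m_blocks share one row of the sparse metadata; a host-side mirror of the mapping above:

def sparse_tensor_m_block_ref(m_block, qhead_per_kvhead):
    """Packed Q tiles share one sparse-metadata row (host-side sketch of the mapping)."""
    return m_block // qhead_per_kvhead if qhead_per_kvhead != 1 else m_block

# Packed m_blocks 0..7 with a pack factor of 4 all index sparse rows 0 and 1.
assert [sparse_tensor_m_block_ref(m, 4) for m in range(8)] == [0, 0, 0, 0, 1, 1, 1, 1]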
130
+ @cute.jit
131
+ def produce_block_sparse_loads(
132
+ blocksparse_tensors: BlockSparseTensors,
133
+ batch_idx,
134
+ head_idx,
135
+ m_block,
136
+ kv_producer_state,
137
+ load_Q,
138
+ load_K,
139
+ load_V,
140
+ pipeline_k,
141
+ pipeline_v,
142
+ use_tma_q: cutlass.Constexpr,
143
+ tma_q_bytes: cutlass.Constexpr,
144
+ intra_wg_overlap: cutlass.Constexpr,
145
+ qhead_per_kvhead: cutlass.Constexpr[int] = 1,
146
+ ):
147
+ """Iterate over the mask and full block lists for a single tile.
148
+
149
+ The masked (partial) list may leave the last V load pending when intra-warp-group
150
+ overlap is enabled. The first full block must consume that pending V while
151
+ issuing its own K load on the next pipeline stage.
152
+
153
+ In the intra-wg-overlap path, the last masked block leaves its V copy in flight
154
+ while we advance the producer state to start the next full K. Either the full list
155
+ overlaps that pending V load, or, if no full blocks exist, we explicitly drain it.
156
+
157
+ Args:
158
+ qhead_per_kvhead: Pack-GQA factor. When > 1, m_block is in packed space and
159
+ must be converted to unpacked for sparse tensor indexing.
160
+ """
161
+
162
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
163
+
164
+ m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead)
165
+
166
+ curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
167
+ curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
168
+
169
+ if const_expr(full_block_cnt is not None):
170
+ curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
171
+ curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
172
+ else:
173
+ curr_full_block_cnt = Int32(0)
174
+ curr_full_block_idx = None
175
+
176
+ mask_empty = curr_mask_block_cnt == 0
177
+ full_empty = curr_full_block_cnt == 0
178
+
179
+ if mask_empty:
180
+ # No masked blocks: the full list owns the initial Q+K load.
181
+ kv_producer_state = load_block_list(
182
+ curr_full_block_idx,
183
+ curr_full_block_cnt,
184
+ load_q_with_first=True,
185
+ first_block_preloaded=False,
186
+ kv_producer_state=kv_producer_state,
187
+ load_Q=load_Q,
188
+ load_K=load_K,
189
+ load_V=load_V,
190
+ pipeline_k=pipeline_k,
191
+ pipeline_v=pipeline_v,
192
+ use_tma_q=use_tma_q,
193
+ tma_q_bytes=tma_q_bytes,
194
+ intra_wg_overlap=intra_wg_overlap,
195
+ )
196
+
197
+ if const_expr(intra_wg_overlap) and curr_full_block_cnt > 0:
198
+ kv_producer_state = finish_overlap_v_load(
199
+ curr_full_block_idx,
200
+ curr_full_block_cnt,
201
+ load_V,
202
+ pipeline_v,
203
+ kv_producer_state,
204
+ )
205
+ else:
206
+ # Masked blocks present: load Q together with the first masked K so consumers can
207
+ # start immediately. When overlap is disabled this fully drains the list.
208
+ kv_producer_state = load_block_list(
209
+ curr_mask_block_idx,
210
+ curr_mask_block_cnt,
211
+ load_q_with_first=True,
212
+ first_block_preloaded=False,
213
+ kv_producer_state=kv_producer_state,
214
+ load_Q=load_Q,
215
+ load_K=load_K,
216
+ load_V=load_V,
217
+ pipeline_k=pipeline_k,
218
+ pipeline_v=pipeline_v,
219
+ use_tma_q=use_tma_q,
220
+ tma_q_bytes=tma_q_bytes,
221
+ intra_wg_overlap=intra_wg_overlap,
222
+ )
223
+
224
+ if full_empty:
225
+ if const_expr(intra_wg_overlap):
226
+ kv_producer_state = finish_overlap_v_load(
227
+ curr_mask_block_idx,
228
+ curr_mask_block_cnt,
229
+ load_V,
230
+ pipeline_v,
231
+ kv_producer_state,
232
+ )
233
+ else:
234
+ if const_expr(intra_wg_overlap):
235
+ # Bridge the masked list to the full list by overlapping the pending masked V
236
+ # with the first full K load.
237
+ n_block_mask_last = curr_mask_block_idx[0]
238
+ n_block_full_first = curr_full_block_idx[curr_full_block_cnt - 1]
239
+ kv_producer_state_prev = kv_producer_state.clone()
240
+ kv_producer_state.advance()
241
+ pipeline_k.producer_acquire(kv_producer_state)
242
+ load_K(src_idx=n_block_full_first, producer_state=kv_producer_state)
243
+ pipeline_v.producer_acquire(kv_producer_state_prev)
244
+ load_V(src_idx=n_block_mask_last, producer_state=kv_producer_state_prev)
245
+
246
+ kv_producer_state = load_block_list(
247
+ curr_full_block_idx,
248
+ curr_full_block_cnt,
249
+ load_q_with_first=False,
250
+ first_block_preloaded=True,
251
+ kv_producer_state=kv_producer_state,
252
+ load_Q=load_Q,
253
+ load_K=load_K,
254
+ load_V=load_V,
255
+ pipeline_k=pipeline_k,
256
+ pipeline_v=pipeline_v,
257
+ use_tma_q=use_tma_q,
258
+ tma_q_bytes=tma_q_bytes,
259
+ intra_wg_overlap=intra_wg_overlap,
260
+ )
261
+
262
+ kv_producer_state = finish_overlap_v_load(
263
+ curr_full_block_idx,
264
+ curr_full_block_cnt,
265
+ load_V,
266
+ pipeline_v,
267
+ kv_producer_state,
268
+ )
269
+ else:
270
+ # Non-overlap path with both lists: run the full list normally (skipping the Q
271
+ # reload because the masked list already issued it).
272
+ kv_producer_state = load_block_list(
273
+ curr_full_block_idx,
274
+ curr_full_block_cnt,
275
+ load_q_with_first=False,
276
+ first_block_preloaded=False,
277
+ kv_producer_state=kv_producer_state,
278
+ load_Q=load_Q,
279
+ load_K=load_K,
280
+ load_V=load_V,
281
+ pipeline_k=pipeline_k,
282
+ pipeline_v=pipeline_v,
283
+ use_tma_q=use_tma_q,
284
+ tma_q_bytes=tma_q_bytes,
285
+ intra_wg_overlap=intra_wg_overlap,
286
+ )
287
+
288
+ return kv_producer_state
289
+
290
+
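A compact sketch of the resulting issue order for the overlap path (illustrative only; assumes at least one block in total): the masked list's pending V is bridged with the first full K, and the very last V is drained by finish_overlap_v_load.

def combined_issue_order(mask_blocks, full_blocks):
    """K/V issue order for one tile with intra-warp-group overlap enabled (sketch)."""
    ks = list(reversed(mask_blocks)) + list(reversed(full_blocks))  # every K, masked list first
    if not ks:
        return []
    order = [("K", ks[0])]                     # first K (Q rides along via TMA)
    for k_next, v_prev in zip(ks[1:], ks):
        order += [("K", k_next), ("V", v_prev)]
    order.append(("V", ks[-1]))                # final drain via finish_overlap_v_load
    return order

# Masked block {7}, full blocks {2, 4}: the pending V7 overlaps the first full K load.
assert combined_issue_order([7], [2, 4]) == [
    ("K", 7), ("K", 4), ("V", 7), ("K", 2), ("V", 4), ("V", 2)]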
291
+ @cute.jit
292
+ def consume_block_sparse_loads(
293
+ blocksparse_tensors: BlockSparseTensors,
294
+ batch_idx,
295
+ head_idx,
296
+ m_block,
297
+ seqlen,
298
+ kv_consumer_state,
299
+ mma_pv_fn,
300
+ mma_one_n_block,
301
+ process_first_half_block,
302
+ process_last_half_block,
303
+ mask_fn,
304
+ score_mod_fn,
305
+ O_should_accumulate,
306
+ mask_mod,
307
+ fastdiv_mods,
308
+ intra_wg_overlap: cutlass.Constexpr,
309
+ warp_scheduler_barrier_sync: Callable,
310
+ warp_scheduler_barrier_arrive: Callable,
311
+ qhead_per_kvhead: cutlass.Constexpr[int] = 1,
312
+ ):
313
+ """Consume the mask and full block lists for a single tile on the consumer side.
314
+
315
+ Mirrors `produce_block_sparse_loads` so that the consumer pipeline uses
316
+ the same sparse tensor indexing.
317
+
318
+ Args:
319
+ qhead_per_kvhead: Pack-GQA factor. When > 1, m_block is in packed space and
320
+ must be converted to unpacked for sparse tensor indexing.
321
+ """
322
+
323
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
324
+
325
+ m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead)
326
+
327
+ curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
328
+ curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
329
+ curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
330
+ curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
331
+
332
+ processed_any = curr_mask_block_cnt + curr_full_block_cnt > 0
333
+
334
+ if const_expr(not intra_wg_overlap):
335
+ if curr_mask_block_cnt > 0:
336
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
337
+ warp_scheduler_barrier_sync()
338
+ kv_consumer_state = mma_one_n_block(
339
+ kv_consumer_state,
340
+ n_block=mask_n_block,
341
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
342
+ mask_fn=partial(
343
+ mask_fn,
344
+ mask_mod=mask_mod,
345
+ mask_seqlen=True,
346
+ fastdiv_mods=fastdiv_mods if cutlass.const_expr(mask_mod is not None) else None,
347
+ ),
348
+ is_first_n_block=True,
349
+ )
350
+ O_should_accumulate = True
351
+ for i in cutlass.range(1, curr_mask_block_cnt):
352
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
353
+ kv_consumer_state = mma_one_n_block(
354
+ kv_consumer_state,
355
+ n_block=mask_n_block,
356
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
357
+ mask_fn=partial(mask_fn, mask_mod=mask_mod, mask_seqlen=False),
358
+ is_first_n_block=False,
359
+ )
360
+ O_should_accumulate = True
361
+ if curr_full_block_cnt == 0:
362
+ warp_scheduler_barrier_arrive()
363
+
364
+ if curr_full_block_cnt > 0:
365
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
366
+ if curr_mask_block_cnt == 0:
367
+ warp_scheduler_barrier_sync()
368
+ kv_consumer_state = mma_one_n_block(
369
+ kv_consumer_state,
370
+ n_block=full_n_block,
371
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
372
+ mask_fn=partial(mask_fn, mask_seqlen=True),
373
+ is_first_n_block=True,
374
+ )
375
+ O_should_accumulate = True
376
+ for i in cutlass.range(1, curr_full_block_cnt):
377
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
378
+ kv_consumer_state = mma_one_n_block(
379
+ kv_consumer_state,
380
+ n_block=full_n_block,
381
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
382
+ mask_fn=partial(mask_fn, mask_seqlen=False),
383
+ is_first_n_block=False,
384
+ )
385
+ O_should_accumulate = True
386
+ else:
387
+ kv_consumer_state = mma_one_n_block(
388
+ kv_consumer_state,
389
+ n_block=full_n_block,
390
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
391
+ mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
392
+ is_first_n_block=False,
393
+ )
394
+ O_should_accumulate = True
395
+ for i in cutlass.range(1, curr_full_block_cnt):
396
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
397
+ kv_consumer_state = mma_one_n_block(
398
+ kv_consumer_state,
399
+ n_block=full_n_block,
400
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
401
+ mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=False),
402
+ is_first_n_block=False,
403
+ )
404
+ O_should_accumulate = True
405
+ warp_scheduler_barrier_arrive()
406
+ else:
407
+ if curr_mask_block_cnt > 0:
408
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
409
+ kv_consumer_state = process_first_half_block(
410
+ n_block=mask_n_block,
411
+ seqlen=seqlen,
412
+ kv_consumer_state=kv_consumer_state,
413
+ mask_fn=partial(
414
+ mask_fn,
415
+ mask_mod=mask_mod,
416
+ mask_seqlen=True,
417
+ fastdiv_mods=fastdiv_mods if cutlass.const_expr(mask_mod is not None) else None,
418
+ ),
419
+ score_mod_fn=score_mod_fn,
420
+ is_first_block=True,
421
+ )
422
+ for i in cutlass.range(1, curr_mask_block_cnt):
423
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
424
+ kv_consumer_state = mma_one_n_block(
425
+ kv_consumer_state,
426
+ n_block=mask_n_block,
427
+ seqlen=seqlen,
428
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
429
+ mask_fn=partial(mask_fn, mask_mod=mask_mod, mask_seqlen=False),
430
+ )
431
+ O_should_accumulate = True
432
+
433
+ if curr_full_block_cnt > 0:
434
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
435
+ if curr_mask_block_cnt == 0:
436
+ kv_consumer_state = process_first_half_block(
437
+ n_block=full_n_block,
438
+ seqlen=seqlen,
439
+ kv_consumer_state=kv_consumer_state,
440
+ mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
441
+ score_mod_fn=score_mod_fn,
442
+ is_first_block=True,
443
+ )
444
+ else:
445
+ kv_consumer_state = mma_one_n_block(
446
+ kv_consumer_state,
447
+ n_block=full_n_block,
448
+ seqlen=seqlen,
449
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
450
+ mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
451
+ )
452
+ O_should_accumulate = True
453
+ for i in cutlass.range(1, curr_full_block_cnt):
454
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
455
+ kv_consumer_state = mma_one_n_block(
456
+ kv_consumer_state,
457
+ n_block=full_n_block,
458
+ seqlen=seqlen,
459
+ mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
460
+ mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=False),
461
+ )
462
+ O_should_accumulate = True
463
+
464
+ if curr_mask_block_cnt + curr_full_block_cnt > 0:
465
+ kv_consumer_state = process_last_half_block(
466
+ kv_consumer_state=kv_consumer_state,
467
+ zero_init=not O_should_accumulate,
468
+ )
469
+ O_should_accumulate = True
470
+
471
+ return kv_consumer_state, O_should_accumulate, processed_any
472
+
473
+
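The O_should_accumulate bookkeeping above follows a simple rule that a host-side sketch (illustrative, mirroring the non-overlap path) makes explicit: only the first block processed for a tile initializes the output accumulator; every later block accumulates, whether it comes from the masked or the full list.

def zero_init_flags(n_mask_blocks, n_full_blocks, already_accumulating=False):
    """zero_init choice per MMA call: True only for the first block of a tile (sketch)."""
    flags, accumulating = [], already_accumulating
    for _ in range(n_mask_blocks + n_full_blocks):
        flags.append(not accumulating)   # zero_init = not O_should_accumulate
        accumulating = True              # every processed block makes O accumulate
    return flags

assert zero_init_flags(2, 3) == [True, False, False, False, False]
assert zero_init_flags(0, 2, already_accumulating=True) == [False, False]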
474
+ @cute.jit
475
+ def load_block_list_sm100(
476
+ block_indices: cute.Tensor,
477
+ block_count,
478
+ load_q_with_first: cutlass.Constexpr,
479
+ m_block,
480
+ q_stage: cutlass.Constexpr,
481
+ kv_producer_state,
482
+ load_Q,
483
+ load_K,
484
+ load_V,
485
+ pipeline_kv,
486
+ ):
487
+ """SM100 version of load_block_list (no intra_wg_overlap, no extra_tx_count)."""
488
+ if block_count > 0:
489
+ # First iteration: load Q alongside K if requested
490
+ n_block_first = block_indices[block_count - 1]
491
+
492
+ if const_expr(load_q_with_first):
493
+ # SM100 loads Q0 and optionally Q1
494
+ load_Q(block=q_stage * m_block + 0, stage=0)
495
+ if const_expr(q_stage == 2):
496
+ load_Q(block=q_stage * m_block + 1, stage=1)
497
+
498
+ # SM100 does not use producer_acquire for pipeline_kv in the load path;
499
+ # the pipeline barriers are handled inside load_KV.
500
+ load_K(block=n_block_first, producer_state=kv_producer_state, page_idx=None)
501
+ kv_producer_state.advance()
502
+ load_V(block=n_block_first, producer_state=kv_producer_state, page_idx=None)
503
+ kv_producer_state.advance()
504
+
505
+ # Remaining blocks
506
+ for offset in cutlass.range(1, block_count):
507
+ n_block = block_indices[block_count - 1 - offset]
508
+ load_K(block=n_block, producer_state=kv_producer_state, page_idx=None)
509
+ kv_producer_state.advance()
510
+ load_V(block=n_block, producer_state=kv_producer_state, page_idx=None)
511
+ kv_producer_state.advance()
512
+
513
+ return kv_producer_state
514
+
515
+
516
+ # SM100-specific tile processor using SM100 helpers
517
+ @cute.jit
518
+ def produce_block_sparse_loads_sm100(
519
+ blocksparse_tensors: BlockSparseTensors,
520
+ batch_idx,
521
+ head_idx,
522
+ m_block,
523
+ kv_producer_state,
524
+ load_Q,
525
+ load_K,
526
+ load_V,
527
+ pipeline_kv,
528
+ q_stage: cutlass.Constexpr,
529
+ q_producer_phase: Int32,
530
+ qhead_per_kvhead: cutlass.Constexpr,
531
+ ):
532
+ """SM100 entry point for sparse block iteration.
533
+
534
+ SM100 uses PipelineTmaUmma which doesn't support extra_tx_count, so we use
535
+ simplified block processing that just calls producer_acquire without extras.
536
+
537
+ Args:
538
+ m_block: which tile of m we are processing
539
+ qhead_per_kvhead: Constexpr pack factor
540
+ """
541
+ # NB: Compute unpacked index for sparse tensor access
542
+ if const_expr(qhead_per_kvhead != 1):
543
+ m_block_sparse = m_block // qhead_per_kvhead
544
+ else:
545
+ m_block_sparse = m_block
546
+
547
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
548
+
549
+ curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
550
+ curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
551
+
552
+ if const_expr(full_block_cnt is not None):
553
+ curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
554
+ curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
555
+ else:
556
+ curr_full_block_cnt = Int32(0)
557
+ curr_full_block_idx = None
558
+
559
+ mask_empty = curr_mask_block_cnt == 0
560
+ full_empty = curr_full_block_cnt == 0
561
+
562
+ q_phase_flipped = False
563
+
564
+ if mask_empty:
565
+ # No masked blocks: process full list with Q loading
566
+ kv_producer_state = load_block_list_sm100(
567
+ curr_full_block_idx,
568
+ curr_full_block_cnt,
569
+ load_q_with_first=True,
570
+ m_block=m_block,
571
+ q_stage=q_stage,
572
+ kv_producer_state=kv_producer_state,
573
+ load_Q=load_Q,
574
+ load_K=load_K,
575
+ load_V=load_V,
576
+ pipeline_kv=pipeline_kv,
577
+ )
578
+ q_phase_flipped = not full_empty
579
+ else:
580
+ # Process masked blocks with Q loading
581
+ kv_producer_state = load_block_list_sm100(
582
+ curr_mask_block_idx,
583
+ curr_mask_block_cnt,
584
+ load_q_with_first=True,
585
+ m_block=m_block,
586
+ q_stage=q_stage,
587
+ kv_producer_state=kv_producer_state,
588
+ load_Q=load_Q,
589
+ load_K=load_K,
590
+ load_V=load_V,
591
+ pipeline_kv=pipeline_kv,
592
+ )
593
+ q_phase_flipped = True
594
+
595
+ if not full_empty:
596
+ # Process full blocks without Q loading
597
+ kv_producer_state = load_block_list_sm100(
598
+ curr_full_block_idx,
599
+ curr_full_block_cnt,
600
+ load_q_with_first=False,
601
+ m_block=m_block,
602
+ q_stage=q_stage,
603
+ kv_producer_state=kv_producer_state,
604
+ load_Q=load_Q,
605
+ load_K=load_K,
606
+ load_V=load_V,
607
+ pipeline_kv=pipeline_kv,
608
+ )
609
+
610
+ if q_phase_flipped:
611
+ q_producer_phase ^= 1
612
+
613
+ return kv_producer_state, q_producer_phase
614
+
615
+
616
+ @cute.jit
617
+ def get_total_block_count(
618
+ blocksparse_tensors: BlockSparseTensors,
619
+ batch_idx,
620
+ head_idx,
621
+ m_block,
622
+ qhead_per_kvhead: cutlass.Constexpr,
623
+ ):
624
+ # NB: Convert packed m_block to unpacked for sparse tensor indexing
625
+ if const_expr(qhead_per_kvhead != 1):
626
+ m_block_sparse = m_block // qhead_per_kvhead
627
+ else:
628
+ m_block_sparse = m_block
629
+
630
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
631
+ if const_expr(full_block_cnt is not None):
632
+ return (
633
+ mask_block_cnt[batch_idx, head_idx, m_block_sparse]
634
+ + full_block_cnt[batch_idx, head_idx, m_block_sparse]
635
+ )
636
+ else:
637
+ return mask_block_cnt[batch_idx, head_idx, m_block_sparse]
638
+
639
+
640
+ @cute.jit
641
+ def handle_block_sparse_empty_tile_correction_sm100(
642
+ tidx: Int32,
643
+ q_stage: cutlass.Constexpr,
644
+ m_block_size: cutlass.Constexpr,
645
+ qhead_per_kvhead,
646
+ pack_gqa: cutlass.Constexpr,
647
+ is_split_kv: cutlass.Constexpr,
648
+ learnable_sink,
649
+ mLSE,
650
+ seqlen,
651
+ m_block: Int32,
652
+ head_idx: Int32,
653
+ batch_idx: Int32,
654
+ split_idx: Int32,
655
+ sScale: cute.Tensor,
656
+ stats: list,
657
+ correction_epilogue: Callable,
658
+ thr_mma_pv: cute.core.ThrMma,
659
+ tOtOs: tuple[cute.Tensor],
660
+ sO: cute.Tensor,
661
+ mbar_ptr,
662
+ mbar_softmax_corr_full_offset: Int32,
663
+ mbar_softmax_corr_empty_offset: Int32,
664
+ mbar_P_full_O_rescaled_offset: Int32,
665
+ mbar_P_full_2_offset: Int32,
666
+ mbar_corr_epi_full_offset: Int32,
667
+ mbar_corr_epi_empty_offset: Int32,
668
+ softmax_corr_consumer_phase: Int32,
669
+ o_corr_consumer_phase: Int32,
670
+ corr_epi_producer_phase: Int32,
671
+ softmax_scale_log2: Float32,
672
+ mO_cur: Optional[cute.Tensor] = None,
673
+ gO: Optional[cute.Tensor] = None,
674
+ gmem_tiled_copy_O: Optional[cute.TiledCopy] = None,
675
+ ):
676
+ """Handle the block-sparse case where a tile is fully masked:
677
+ * zero staged results
678
+ * seed stats
679
+ * satisfy the usual barrier protocol so downstream warps continue to make progress.
680
+ """
681
+ LOG2_E = Float32(math.log2(math.e))
682
+
683
+ for stage in cutlass.range_constexpr(q_stage):
684
+ row_sum_value = Float32(1.0)
685
+ row_max_value = (
686
+ -Float32.inf if const_expr(mLSE is not None or learnable_sink is not None) else None
687
+ )
688
+ if const_expr(learnable_sink is not None):
689
+ sink_val = -Float32.inf
690
+ if const_expr(not pack_gqa):
691
+ sink_val = Float32(learnable_sink[head_idx])
692
+ elif tidx < m_block_size:
693
+ q_head_idx = (
694
+ (q_stage * m_block + stage) * m_block_size + tidx
695
+ ) % qhead_per_kvhead + head_idx * qhead_per_kvhead
696
+ sink_val = Float32(learnable_sink[q_head_idx])
697
+ if sink_val != -Float32.inf and (const_expr(not is_split_kv) or split_idx == 0):
698
+ if row_max_value == -Float32.inf:
699
+ row_max_value = sink_val * (LOG2_E / softmax_scale_log2)
700
+ row_sum_value = Float32(1.0)
701
+ else:
702
+ row_sum_value = row_sum_value + utils.exp2f(
703
+ sink_val * LOG2_E - row_max_value * softmax_scale_log2
704
+ )
705
+ if tidx < m_block_size:
706
+ scale_row_idx = tidx + stage * m_block_size
707
+ sScale[scale_row_idx] = row_sum_value
708
+ if const_expr(mLSE is not None or learnable_sink is not None):
709
+ sScale[scale_row_idx + m_block_size * 2] = row_max_value
710
+ acc_flag = row_sum_value == Float32(0.0) or row_sum_value != row_sum_value
711
+ stats[stage] = (row_sum_value, row_max_value, acc_flag)
712
+
713
+ cute.arch.mbarrier_wait(
714
+ mbar_ptr + mbar_softmax_corr_full_offset + stage,
715
+ softmax_corr_consumer_phase,
716
+ )
717
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_softmax_corr_empty_offset + stage)
718
+
719
+ if const_expr(gmem_tiled_copy_O is None):
720
+ cute.arch.mbarrier_wait(
721
+ mbar_ptr + mbar_corr_epi_empty_offset + stage,
722
+ corr_epi_producer_phase,
723
+ )
724
+ correction_epilogue(
725
+ thr_mma_pv,
726
+ tOtOs[stage],
727
+ tidx,
728
+ stage,
729
+ m_block,
730
+ seqlen.seqlen_q,
731
+ Float32(0.0), # zero scale ensures empty tile writes zeros into staged outputs
732
+ sO[None, None, stage],
733
+ mO_cur,
734
+ gO,
735
+ gmem_tiled_copy_O,
736
+ )
737
+ if const_expr(gmem_tiled_copy_O is None):
738
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_corr_epi_full_offset + stage)
739
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_P_full_O_rescaled_offset + stage)
740
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_P_full_2_offset + stage)
741
+
742
+ softmax_corr_consumer_phase ^= 1
743
+ o_corr_consumer_phase ^= 1
744
+ corr_epi_producer_phase ^= 1
745
+
746
+ return (
747
+ softmax_corr_consumer_phase,
748
+ o_corr_consumer_phase,
749
+ corr_epi_producer_phase,
750
+ )
751
+
752
+
753
+ @cute.jit
754
+ def softmax_block_sparse_sm100(
755
+ blocksparse_tensors: BlockSparseTensors,
756
+ batch_idx,
757
+ head_idx,
758
+ m_block,
759
+ softmax_step: Callable,
760
+ mask_fn: Callable,
761
+ mask_fn_none: Callable,
762
+ mma_si_consumer_phase: Int32,
763
+ si_corr_producer_phase: Int32,
764
+ s0_s1_sequence_phase: Int32,
765
+ mbar_ptr,
766
+ mbar_softmax_corr_full_offset: Int32,
767
+ mbar_softmax_corr_empty_offset: Int32,
768
+ mbar_P_full_O_rescaled_offset: Int32,
769
+ mbar_P_full_2_offset: Int32,
770
+ q_stage: cutlass.Constexpr,
771
+ stage_idx: Int32,
772
+ check_m_boundary: bool,
773
+ qhead_per_kvhead: cutlass.Constexpr,
774
+ ):
775
+ # Convert packed m_block to unpacked for sparse tensor indexing
776
+ if const_expr(qhead_per_kvhead != 1):
777
+ m_block_sparse = m_block // qhead_per_kvhead
778
+ else:
779
+ m_block_sparse = m_block
780
+
781
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
782
+
783
+ curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
784
+ curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
785
+
786
+ if const_expr(full_block_cnt is not None):
787
+ curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
788
+ curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
789
+ else:
790
+ curr_full_block_cnt = Int32(0)
791
+ curr_full_block_idx = None
792
+
793
+ total_block_cnt = curr_mask_block_cnt + curr_full_block_cnt
794
+
795
+ if total_block_cnt == 0:
796
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_softmax_corr_full_offset + stage_idx)
797
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_P_full_O_rescaled_offset + stage_idx)
798
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_P_full_2_offset + stage_idx)
799
+ cute.arch.mbarrier_arrive(mbar_ptr + mbar_softmax_corr_empty_offset + stage_idx)
800
+ else:
801
+ if curr_mask_block_cnt > 0:
802
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
803
+ (
804
+ mma_si_consumer_phase,
805
+ si_corr_producer_phase,
806
+ s0_s1_sequence_phase,
807
+ ) = softmax_step(
808
+ mma_si_consumer_phase,
809
+ si_corr_producer_phase,
810
+ s0_s1_sequence_phase,
811
+ mask_n_block,
812
+ is_first=True,
813
+ mask_fn=partial(mask_fn, mask_seqlen=True, check_q_boundary=check_m_boundary),
814
+ )
815
+ for i in cutlass.range(1, curr_mask_block_cnt):
816
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
817
+ (
818
+ mma_si_consumer_phase,
819
+ si_corr_producer_phase,
820
+ s0_s1_sequence_phase,
821
+ ) = softmax_step(
822
+ mma_si_consumer_phase,
823
+ si_corr_producer_phase,
824
+ s0_s1_sequence_phase,
825
+ mask_n_block,
826
+ mask_fn=partial(mask_fn, mask_seqlen=False, check_q_boundary=check_m_boundary),
827
+ )
828
+
829
+ if curr_full_block_cnt > 0:
830
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
831
+ if curr_mask_block_cnt == 0:
832
+ (
833
+ mma_si_consumer_phase,
834
+ si_corr_producer_phase,
835
+ s0_s1_sequence_phase,
836
+ ) = softmax_step(
837
+ mma_si_consumer_phase,
838
+ si_corr_producer_phase,
839
+ s0_s1_sequence_phase,
840
+ full_n_block,
841
+ is_first=True,
842
+ mask_fn=partial(
843
+ mask_fn_none, mask_seqlen=True, check_q_boundary=check_m_boundary
844
+ ),
845
+ )
846
+ else:
847
+ (
848
+ mma_si_consumer_phase,
849
+ si_corr_producer_phase,
850
+ s0_s1_sequence_phase,
851
+ ) = softmax_step(
852
+ mma_si_consumer_phase,
853
+ si_corr_producer_phase,
854
+ s0_s1_sequence_phase,
855
+ full_n_block,
856
+ is_first=False,
857
+ mask_fn=partial(
858
+ mask_fn_none, mask_seqlen=False, check_q_boundary=check_m_boundary
859
+ ),
860
+ )
861
+ for i in cutlass.range(1, curr_full_block_cnt):
862
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
863
+ (
864
+ mma_si_consumer_phase,
865
+ si_corr_producer_phase,
866
+ s0_s1_sequence_phase,
867
+ ) = softmax_step(
868
+ mma_si_consumer_phase,
869
+ si_corr_producer_phase,
870
+ s0_s1_sequence_phase,
871
+ full_n_block,
872
+ mask_fn=partial(
873
+ mask_fn_none, mask_seqlen=False, check_q_boundary=check_m_boundary
874
+ ),
875
+ )
876
+
877
+ return (
878
+ mma_si_consumer_phase,
879
+ si_corr_producer_phase,
880
+ s0_s1_sequence_phase,
881
+ total_block_cnt == 0,
882
+ )
883
+
884
+
885
+ # =============================================================================
886
+ # Backward-specific block-sparse helpers (SM100)
887
+ # =============================================================================
888
+ #
889
+ # In backward, iteration is transposed compared to forward:
890
+ # - Forward: outer loop over m_blocks (Q tiles), inner loop over n_blocks (KV tiles)
891
+ # - Backward: outer loop over n_blocks (KV tiles), inner loop over m_blocks (Q tiles)
892
+ #
893
+ # The backward block-sparse tensors use "Q direction" indexing:
894
+ # - q_block_cnt[batch, head, n_block] → count of m_blocks to process for this KV tile
895
+ # - q_block_idx[batch, head, n_block, :] → indices of m_blocks to process
896
+ #
897
+
898
+
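A host-side sketch of that transposed iteration (illustrative, with assumed plain-list metadata): for each KV tile the partial m_blocks are visited before the full ones, and every sparse entry expands into subtile_factor consecutive m_blocks that are bounds-checked against m_block_max.

def bwd_m_block_visits(q_cnt, q_idx, full_cnt, full_idx, subtile_factor, m_block_max):
    """Yield (m_block, is_full_block) for one KV tile, partial blocks before full ones."""
    for is_full, cnt, idx in ((False, q_cnt, q_idx), (True, full_cnt or 0, full_idx)):
        for it in range(cnt * subtile_factor):
            m_block = idx[it // subtile_factor] * subtile_factor + it % subtile_factor
            if m_block < m_block_max:            # skip padded / out-of-range entries
                yield m_block, is_full

# Partial sparse blocks {0, 2}, one full block {1}, subtile_factor = 2, 6 m_blocks in range.
assert list(bwd_m_block_visits(2, [0, 2], 1, [1], 2, 6)) == [
    (0, False), (1, False), (4, False), (5, False), (2, True), (3, True)]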
899
+ @cute.jit
900
+ def get_total_q_block_count_bwd(
901
+ blocksparse_tensors: BlockSparseTensors,
902
+ batch_idx,
903
+ head_idx,
904
+ n_block,
905
+ subtile_factor: cutlass.Constexpr = 1,
906
+ m_block_max: int = 0,
907
+ ):
908
+ """Count total tile iterations for given n_block (KV tile) in backward."""
909
+ q_block_cnt, _, full_block_cnt, _ = blocksparse_tensors
910
+ total = q_block_cnt[batch_idx, head_idx, n_block]
911
+ if const_expr(full_block_cnt is not None):
912
+ total = total + full_block_cnt[batch_idx, head_idx, n_block]
913
+ return total * subtile_factor
914
+
915
+
916
+ @cute.jit
917
+ def produce_block_sparse_q_loads_bwd_sm100(
918
+ blocksparse_tensors: BlockSparseTensors,
919
+ batch_idx,
920
+ head_idx,
921
+ n_block,
922
+ # Pipeline states (will be returned after advancing)
923
+ producer_state_Q_LSE,
924
+ producer_state_dO_dPsum,
925
+ # Pipelines
926
+ pipeline_Q,
927
+ pipeline_LSE,
928
+ pipeline_dO,
929
+ pipeline_dPsum,
930
+ # Load functions
931
+ load_K,
932
+ load_V,
933
+ load_Q,
934
+ load_dO,
935
+ copy_stats,
936
+ # Global tensors for LSE/dPsum
937
+ gLSE,
938
+ sLSE,
939
+ gdPsum,
940
+ sdPsum,
941
+ # TMA copy bytes for extra_tx_count
942
+ tma_copy_bytes_K,
943
+ tma_copy_bytes_V,
944
+ # Flags for which loads to perform
945
+ should_load_Q: cutlass.Constexpr,
946
+ should_load_dO: cutlass.Constexpr,
947
+ # Subtiling factor and bounds
948
+ subtile_factor: cutlass.Constexpr = 1,
949
+ m_block_max: int = 0,
950
+ ):
951
+ """SM100 backward block sparse loading with subtiling.
952
+
953
+ Returns updated (producer_state_Q_LSE, producer_state_dO_dPsum).
954
+ First iteration loads K/V alongside Q/dO; subsequent iterations load only Q/dO.
955
+ """
956
+ (
957
+ curr_q_cnt,
958
+ curr_q_idx,
959
+ curr_full_cnt,
960
+ curr_full_idx,
961
+ loop_count,
962
+ ) = get_block_sparse_iteration_info_bwd(
963
+ blocksparse_tensors, batch_idx, head_idx, n_block, subtile_factor, m_block_max
964
+ )
965
+
966
+ for iter_idx in cutlass.range(loop_count, unroll=1):
967
+ m_block, _ = get_m_block_from_iter_bwd(
968
+ iter_idx,
969
+ curr_q_cnt,
970
+ curr_q_idx,
971
+ curr_full_cnt,
972
+ curr_full_idx,
973
+ subtile_factor,
974
+ m_block_max,
975
+ )
976
+ m_block_safe = m_block
977
+ if m_block_max > 0:
978
+ m_block_safe = cutlass.min(m_block, m_block_max - 1)
979
+
980
+ if iter_idx == 0:
981
+ # First block: load K/V alongside Q/dO
982
+ if const_expr(should_load_Q):
983
+ pipeline_Q.producer_acquire(producer_state_Q_LSE, extra_tx_count=tma_copy_bytes_K)
984
+ load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q_LSE))
985
+ load_Q(m_block_safe, producer_state=producer_state_Q_LSE)
986
+ pipeline_Q.producer_commit(producer_state_Q_LSE)
987
+ pipeline_LSE.producer_acquire(producer_state_Q_LSE)
988
+ with cute.arch.elect_one():
989
+ copy_stats(
990
+ gLSE[None, m_block_safe],
991
+ sLSE[None, producer_state_Q_LSE.index],
992
+ mbar_ptr=pipeline_LSE.producer_get_barrier(producer_state_Q_LSE),
993
+ )
994
+ producer_state_Q_LSE.advance()
995
+ if const_expr(should_load_dO):
996
+ pipeline_dO.producer_acquire(
997
+ producer_state_dO_dPsum, extra_tx_count=tma_copy_bytes_V
998
+ )
999
+ load_V(tma_bar_ptr=pipeline_dO.producer_get_barrier(producer_state_dO_dPsum))
1000
+ load_dO(m_block_safe, producer_state=producer_state_dO_dPsum)
1001
+ pipeline_dO.producer_commit(producer_state_dO_dPsum)
1002
+ pipeline_dPsum.producer_acquire(producer_state_dO_dPsum)
1003
+ with cute.arch.elect_one():
1004
+ copy_stats(
1005
+ gdPsum[None, m_block_safe],
1006
+ sdPsum[None, producer_state_dO_dPsum.index],
1007
+ mbar_ptr=pipeline_dPsum.producer_get_barrier(producer_state_dO_dPsum),
1008
+ )
1009
+ producer_state_dO_dPsum.advance()
1010
+ else:
1011
+ # Subsequent blocks: just load Q/dO (K/V already loaded)
1012
+ if const_expr(should_load_Q):
1013
+ pipeline_Q.producer_acquire(producer_state_Q_LSE)
1014
+ load_Q(m_block_safe, producer_state=producer_state_Q_LSE)
1015
+ pipeline_Q.producer_commit(producer_state_Q_LSE)
1016
+ pipeline_LSE.producer_acquire(producer_state_Q_LSE)
1017
+ with cute.arch.elect_one():
1018
+ copy_stats(
1019
+ gLSE[None, m_block_safe],
1020
+ sLSE[None, producer_state_Q_LSE.index],
1021
+ mbar_ptr=pipeline_LSE.producer_get_barrier(producer_state_Q_LSE),
1022
+ )
1023
+ producer_state_Q_LSE.advance()
1024
+ if const_expr(should_load_dO):
1025
+ pipeline_dO.producer_acquire(producer_state_dO_dPsum)
1026
+ load_dO(m_block_safe, producer_state=producer_state_dO_dPsum)
1027
+ pipeline_dO.producer_commit(producer_state_dO_dPsum)
1028
+ pipeline_dPsum.producer_acquire(producer_state_dO_dPsum)
1029
+ with cute.arch.elect_one():
1030
+ copy_stats(
1031
+ gdPsum[None, m_block_safe],
1032
+ sdPsum[None, producer_state_dO_dPsum.index],
1033
+ mbar_ptr=pipeline_dPsum.producer_get_barrier(producer_state_dO_dPsum),
1034
+ )
1035
+ producer_state_dO_dPsum.advance()
1036
+
1037
+ return producer_state_Q_LSE, producer_state_dO_dPsum
1038
+
1039
+
1040
+ @cute.jit
1041
+ def get_block_sparse_iteration_info_bwd(
1042
+ blocksparse_tensors: BlockSparseTensors,
1043
+ batch_idx,
1044
+ head_idx,
1045
+ n_block,
1046
+ subtile_factor: cutlass.Constexpr = 1,
1047
+ m_block_max: int = 0,
1048
+ ):
1049
+ """Extract block-sparse iteration info for backward pass.
1050
+
1051
+ Returns (curr_q_cnt, curr_q_idx, curr_full_cnt, curr_full_idx, total_count).
1052
+ """
1053
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1054
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1055
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1056
+
1057
+ if const_expr(full_cnt is not None):
1058
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1059
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1060
+ else:
1061
+ curr_full_cnt = Int32(0)
1062
+ curr_full_idx = None
1063
+
1064
+ sparse_block_count = curr_q_cnt
1065
+ if const_expr(full_cnt is not None):
1066
+ sparse_block_count = sparse_block_count + curr_full_cnt
1067
+ total_count = sparse_block_count * subtile_factor
1068
+
1069
+ return curr_q_cnt, curr_q_idx, curr_full_cnt, curr_full_idx, total_count
1070
+
1071
+
1072
+ @cute.jit
1073
+ def get_m_block_from_iter_bwd(
1074
+ iter_idx,
1075
+ curr_q_cnt,
1076
+ curr_q_idx: cute.Tensor,
1077
+ curr_full_cnt,
1078
+ curr_full_idx: Optional[cute.Tensor],
1079
+ subtile_factor: cutlass.Constexpr = 1,
1080
+ m_block_max: int = 0,
1081
+ ):
1082
+ """Derive m_block index and is_full_block flag from iteration index.
1083
+
1084
+ Returns (m_block, is_full_block):
1085
+ - m_block: The actual Q-tile block index
1086
+ - is_full_block: True if this is a full block (no mask_mod needed)
1087
+ """
1088
+ sparse_iter_idx = iter_idx // subtile_factor
1089
+ subtile_offset = iter_idx % subtile_factor
1090
+
1091
+ sparse_m_block = Int32(0)
1092
+ is_full_block = False
1093
+ if const_expr(curr_full_idx is not None):
1094
+ if sparse_iter_idx < curr_q_cnt:
1095
+ sparse_m_block = curr_q_idx[sparse_iter_idx]
1096
+ else:
1097
+ sparse_m_block = curr_full_idx[sparse_iter_idx - curr_q_cnt]
1098
+ is_full_block = True
1099
+ else:
1100
+ sparse_m_block = curr_q_idx[sparse_iter_idx]
1101
+
1102
+ return sparse_m_block * subtile_factor + subtile_offset, is_full_block
1103
+
1104
+
1105
+ @cute.jit
1106
+ def _load_q_do_block_sm90(
1107
+ m_block,
1108
+ producer_state_Q,
1109
+ producer_state_dO,
1110
+ pipeline_Q,
1111
+ pipeline_dO,
1112
+ load_K,
1113
+ load_V,
1114
+ load_Q,
1115
+ load_dO,
1116
+ load_LSE,
1117
+ load_dPsum,
1118
+ tma_copy_bytes_K,
1119
+ tma_copy_bytes_V,
1120
+ Q_stage_eq_dO_stage: cutlass.Constexpr,
1121
+ load_kv: bool,
1122
+ ):
1123
+ """Load one Q/dO block, optionally loading K/V on first iteration."""
1124
+ if load_kv:
1125
+ pipeline_Q.producer_acquire(producer_state_Q, extra_tx_count=tma_copy_bytes_K)
1126
+ load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q))
1127
+ else:
1128
+ pipeline_Q.producer_acquire(producer_state_Q)
1129
+ load_Q(m_block, producer_state=producer_state_Q)
1130
+ with cute.arch.elect_one():
1131
+ load_LSE(m_block, producer_state=producer_state_Q)
1132
+
1133
+ producer_state_dO_cur = (
1134
+ producer_state_dO if const_expr(not Q_stage_eq_dO_stage) else producer_state_Q
1135
+ )
1136
+ if load_kv:
1137
+ pipeline_dO.producer_acquire(producer_state_dO_cur, extra_tx_count=tma_copy_bytes_V)
1138
+ load_V(tma_bar_ptr=pipeline_dO.producer_get_barrier(producer_state_dO_cur))
1139
+ else:
1140
+ pipeline_dO.producer_acquire(producer_state_dO_cur)
1141
+ load_dO(m_block, producer_state=producer_state_dO_cur)
1142
+ with cute.arch.elect_one():
1143
+ load_dPsum(m_block, producer_state=producer_state_dO_cur)
1144
+
1145
+ producer_state_Q.advance()
1146
+ producer_state_dO.advance()
1147
+ return producer_state_Q, producer_state_dO
1148
+
1149
+
1150
+ @cute.jit
1151
+ def produce_block_sparse_q_loads_bwd_sm90(
1152
+ blocksparse_tensors: BlockSparseTensors,
1153
+ batch_idx,
1154
+ head_idx,
1155
+ n_block,
1156
+ producer_state_Q,
1157
+ producer_state_dO,
1158
+ pipeline_Q,
1159
+ pipeline_dO,
1160
+ load_K,
1161
+ load_V,
1162
+ load_Q,
1163
+ load_dO,
1164
+ load_LSE,
1165
+ load_dPsum,
1166
+ tma_copy_bytes_K,
1167
+ tma_copy_bytes_V,
1168
+ Q_stage_eq_dO_stage: cutlass.Constexpr,
1169
+ subtile_factor: cutlass.Constexpr,
1170
+ m_block_max: int,
1171
+ ):
1172
+ """SM90 backward block sparse loading with separate partial/full loops.
1173
+
1174
+ K/V are loaded with the first valid block. Iterates partial blocks first,
1175
+ then full blocks, matching consumer order.
1176
+
1177
+ Returns updated (producer_state_Q, producer_state_dO).
1178
+ """
1179
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1180
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1181
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1182
+
1183
+ if const_expr(full_cnt is not None):
1184
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1185
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1186
+ else:
1187
+ curr_full_cnt = Int32(0)
1188
+ curr_full_idx = None
1189
+
1190
+ kv_loaded = False
1191
+
1192
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1193
+ sparse_idx = iter_idx // subtile_factor
1194
+ subtile_offset = iter_idx % subtile_factor
1195
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1196
+
1197
+ if m_block < m_block_max:
1198
+ producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
1199
+ m_block,
1200
+ producer_state_Q,
1201
+ producer_state_dO,
1202
+ pipeline_Q,
1203
+ pipeline_dO,
1204
+ load_K,
1205
+ load_V,
1206
+ load_Q,
1207
+ load_dO,
1208
+ load_LSE,
1209
+ load_dPsum,
1210
+ tma_copy_bytes_K,
1211
+ tma_copy_bytes_V,
1212
+ Q_stage_eq_dO_stage,
1213
+ load_kv=not kv_loaded,
1214
+ )
1215
+ kv_loaded = True
1216
+
1217
+ if const_expr(full_cnt is not None):
1218
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1219
+ sparse_idx = iter_idx // subtile_factor
1220
+ subtile_offset = iter_idx % subtile_factor
1221
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1222
+
1223
+ if m_block < m_block_max:
1224
+ producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
1225
+ m_block,
1226
+ producer_state_Q,
1227
+ producer_state_dO,
1228
+ pipeline_Q,
1229
+ pipeline_dO,
1230
+ load_K,
1231
+ load_V,
1232
+ load_Q,
1233
+ load_dO,
1234
+ load_LSE,
1235
+ load_dPsum,
1236
+ tma_copy_bytes_K,
1237
+ tma_copy_bytes_V,
1238
+ Q_stage_eq_dO_stage,
1239
+ load_kv=not kv_loaded,
1240
+ )
1241
+ kv_loaded = True
1242
+
1243
+ return producer_state_Q, producer_state_dO
1244
+
1245
+
1246
+ @cute.jit
1247
+ def consume_block_sparse_mma_bwd_sm90(
1248
+ blocksparse_tensors: BlockSparseTensors,
1249
+ batch_idx,
1250
+ head_idx,
1251
+ n_block,
1252
+ consumer_state_Q,
1253
+ consumer_state_dO,
1254
+ mma_one_m_block_fn,
1255
+ mask,
1256
+ mask_mod,
1257
+ is_causal: cutlass.Constexpr,
1258
+ is_local: cutlass.Constexpr,
1259
+ thr_mma_SdP,
1260
+ softmax_scale,
1261
+ seqlen,
1262
+ subtile_factor: cutlass.Constexpr,
1263
+ m_block_max: int,
1264
+ aux_tensors=None,
1265
+ fastdiv_mods=(None, None),
1266
+ ):
1267
+ """SM90 backward block sparse MMA consumption with separate partial/full loops.
1268
+
1269
+ Partial blocks are processed first (with mask_mod applied), then full blocks
1270
+ (without mask_mod). This ensures mask_mod is only applied where needed.
1271
+
1272
+ Returns updated (consumer_state_Q, consumer_state_dO).
1273
+ """
1274
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1275
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1276
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1277
+
1278
+ if const_expr(full_cnt is not None):
1279
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1280
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1281
+ else:
1282
+ curr_full_cnt = Int32(0)
1283
+ curr_full_idx = None
1284
+
1285
+ dKV_accumulate = False
1286
+
1287
+ mask_fn_partial = partial(
1288
+ mask.apply_mask,
1289
+ batch_idx=batch_idx,
1290
+ head_idx=head_idx,
1291
+ n_block=n_block,
1292
+ thr_mma=thr_mma_SdP,
1293
+ mask_seqlen=True,
1294
+ mask_causal=is_causal,
1295
+ mask_local=is_local,
1296
+ mask_mod=mask_mod,
1297
+ aux_tensors=aux_tensors,
1298
+ fastdiv_mods=fastdiv_mods,
1299
+ )
1300
+
1301
+ mask_fn_full = partial(
1302
+ mask.apply_mask,
1303
+ batch_idx=batch_idx,
1304
+ head_idx=head_idx,
1305
+ n_block=n_block,
1306
+ thr_mma=thr_mma_SdP,
1307
+ mask_seqlen=True,
1308
+ mask_causal=is_causal,
1309
+ mask_local=is_local,
1310
+ aux_tensors=aux_tensors,
1311
+ fastdiv_mods=fastdiv_mods,
1312
+ )
1313
+
1314
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1315
+ sparse_idx = iter_idx // subtile_factor
1316
+ subtile_offset = iter_idx % subtile_factor
1317
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1318
+
1319
+ if m_block < m_block_max:
1320
+ consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
1321
+ m_block,
1322
+ consumer_state_Q,
1323
+ consumer_state_dO,
1324
+ mask_fn=mask_fn_partial,
1325
+ dKV_accumulate=dKV_accumulate,
1326
+ thr_mma_SdP=thr_mma_SdP,
1327
+ batch_idx=batch_idx,
1328
+ head_idx=head_idx,
1329
+ n_block=n_block,
1330
+ softmax_scale=softmax_scale,
1331
+ seqlen=seqlen,
1332
+ aux_tensors=aux_tensors,
1333
+ fastdiv_mods=fastdiv_mods,
1334
+ )
1335
+ dKV_accumulate = True
1336
+
1337
+ if const_expr(full_cnt is not None):
1338
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1339
+ sparse_idx = iter_idx // subtile_factor
1340
+ subtile_offset = iter_idx % subtile_factor
1341
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1342
+
1343
+ if m_block < m_block_max:
1344
+ consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
1345
+ m_block,
1346
+ consumer_state_Q,
1347
+ consumer_state_dO,
1348
+ mask_fn=mask_fn_full,
1349
+ dKV_accumulate=dKV_accumulate,
1350
+ thr_mma_SdP=thr_mma_SdP,
1351
+ batch_idx=batch_idx,
1352
+ head_idx=head_idx,
1353
+ n_block=n_block,
1354
+ softmax_scale=softmax_scale,
1355
+ seqlen=seqlen,
1356
+ aux_tensors=aux_tensors,
1357
+ fastdiv_mods=fastdiv_mods,
1358
+ )
1359
+ dKV_accumulate = True
1360
+
1361
+ return consumer_state_Q, consumer_state_dO
1362
+
1363
+
1364
+ @cute.jit
1365
+ def _store_one_dQaccum_sm90(
1366
+ m_block,
1367
+ sdQaccum: cute.Tensor,
1368
+ gdQaccum: cute.Tensor,
1369
+ num_mma_warp_groups: cutlass.Constexpr,
1370
+ num_threads_per_warp_group: cutlass.Constexpr,
1371
+ tma_copy_bytes_dQ,
1372
+ ):
1373
+ """Store dQaccum for a single m_block."""
1374
+ for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
1375
+ cute.arch.barrier(
1376
+ barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
1377
+ number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
1378
+ )
1379
+ with cute.arch.elect_one():
1380
+ copy_utils.cpasync_reduce_bulk_add_f32(
1381
+ sdQaccum[None, warp_group_idx].iterator,
1382
+ gdQaccum[None, warp_group_idx, m_block].iterator,
1383
+ tma_copy_bytes_dQ,
1384
+ )
1385
+ cute.arch.cp_async_bulk_commit_group()
1386
+ for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
1387
+ cute.arch.cp_async_bulk_wait_group(num_mma_warp_groups - 1 - warp_group_idx, read=True)
1388
+ cute.arch.barrier_arrive(
1389
+ barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
1390
+ number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
1391
+ )
1392
+
1393
+
1394
+ @cute.jit
1395
+ def dQaccum_store_block_sparse_bwd_sm90(
1396
+ blocksparse_tensors: BlockSparseTensors,
1397
+ batch_idx,
1398
+ head_idx,
1399
+ n_block,
1400
+ sdQaccum: cute.Tensor,
1401
+ gdQaccum: cute.Tensor,
1402
+ subtile_factor: cutlass.Constexpr,
1403
+ m_block_max: int,
1404
+ num_mma_warp_groups: cutlass.Constexpr,
1405
+ num_threads_per_warp_group: cutlass.Constexpr,
1406
+ tma_copy_bytes_dQ,
1407
+ ):
1408
+ """SM90 backward block sparse dQaccum store with separate partial/full loops.
1409
+
1410
+ Iterates partial blocks first, then full blocks, matching producer/consumer order.
1411
+ """
1412
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1413
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1414
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1415
+
1416
+ if const_expr(full_cnt is not None):
1417
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1418
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1419
+ else:
1420
+ curr_full_cnt = Int32(0)
1421
+ curr_full_idx = None
1422
+
1423
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1424
+ sparse_idx = iter_idx // subtile_factor
1425
+ subtile_offset = iter_idx % subtile_factor
1426
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1427
+
1428
+ if m_block < m_block_max:
1429
+ _store_one_dQaccum_sm90(
1430
+ m_block,
1431
+ sdQaccum,
1432
+ gdQaccum,
1433
+ num_mma_warp_groups,
1434
+ num_threads_per_warp_group,
1435
+ tma_copy_bytes_dQ,
1436
+ )
1437
+
1438
+ if const_expr(full_cnt is not None):
1439
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1440
+ sparse_idx = iter_idx // subtile_factor
1441
+ subtile_offset = iter_idx % subtile_factor
1442
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1443
+
1444
+ if m_block < m_block_max:
1445
+ _store_one_dQaccum_sm90(
1446
+ m_block,
1447
+ sdQaccum,
1448
+ gdQaccum,
1449
+ num_mma_warp_groups,
1450
+ num_threads_per_warp_group,
1451
+ tma_copy_bytes_dQ,
1452
+ )