mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/flash_attn/softmax.py
@@ -0,0 +1,583 @@
+ # @nolint # fbcode
+ # Copyright (c) 2025, Tri Dao.
+
+ import math
+ import operator
+ from typing import Tuple
+ from dataclasses import dataclass
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass import Float32
+
+ import mslk.attention.flash_attn.utils as utils
+ from mslk.attention.flash_attn.cute_dsl_utils import ParamsBase
+ from mslk.attention.flash_attn.seqlen_info import SeqlenInfoQK
+
+
+ @dataclass
+ class Softmax(ParamsBase):
+     scale_log2: Float32
+     num_rows: cutlass.Constexpr[int]
+     row_max: cute.Tensor
+     row_sum: cute.Tensor
+     arch: cutlass.Constexpr[int] = 80
+     softmax_scale: Float32 | None = None
+
+     @staticmethod
+     def create(
+         scale_log2: Float32,
+         num_rows: cutlass.Constexpr[int],
+         arch: cutlass.Constexpr[int] = 80,
+         softmax_scale: Float32 | None = None,
+     ):
+         row_max = cute.make_rmem_tensor(num_rows, Float32)
+         row_sum = cute.make_rmem_tensor(num_rows, Float32)
+         return Softmax(scale_log2, num_rows, row_max, row_sum, arch, softmax_scale)
+
+     def reset(self) -> None:
+         self.row_max.fill(-Float32.inf)
+         self.row_sum.fill(0.0)
+
+     def _compute_row_max(
+         self, acc_S_row: cute.TensorSSA, init_val: float | Float32 | None = None
+     ) -> Float32:
+         return utils.fmax_reduce(acc_S_row, init_val, arch=self.arch)
+
+     def _compute_row_sum(
+         self, acc_S_row_exp: cute.TensorSSA, init_val: float | Float32 | None = None
+     ) -> Float32:
+         return utils.fadd_reduce(acc_S_row_exp, init_val, arch=self.arch)
+
+     @cute.jit
+     def online_softmax(
+         self,
+         acc_S: cute.Tensor,
+         is_first: cutlass.Constexpr[bool] = False,
+         check_inf: cutlass.Constexpr[bool] = True,
+     ) -> cute.Tensor:
+         """Apply online softmax and return the row_scale to rescale O.
+
+         :param acc_S: acc_S tensor
+         :type acc_S: cute.Tensor
+         :param is_first: whether this is the first n_block
+         :type is_first: cutlass.Constexpr
+         """
+         # Change acc_S to M,N layout view.
+         acc_S_mn = utils.make_acc_tensor_mn_view(acc_S)
+         row_scale = cute.make_fragment_like(self.row_max, Float32)
+
+         row_max = self.row_max
+         row_sum = self.row_sum
+         scale_log2 = self.scale_log2
+         arch = self.arch
+
+         # Each iteration processes one row of acc_S
+         for r in cutlass.range(cute.size(row_max), unroll_full=True):
+             acc_S_row = acc_S_mn[r, None].load()  # (n_block_size)
+
+             row_max_cur = utils.fmax_reduce(
+                 acc_S_row,
+                 init_val=row_max[r] if cutlass.const_expr(not is_first) else None,
+                 arch=arch,
+             )
+
+             row_max_cur = utils.warp_reduce(row_max_cur, cute.arch.fmax, width=4)
+             # Update row_max before replacing row_max_cur with a safe value for -inf
+             row_max_prev = row_max[r]
+             row_max[r] = row_max_cur
+
+             if cutlass.const_expr(check_inf):
+                 row_max_cur = 0.0 if row_max_cur == -Float32.inf else row_max_cur
+
+             if cutlass.const_expr(is_first):
+                 row_max_cur_scaled = row_max_cur * scale_log2
+                 acc_S_row_exp = utils.exp2f(acc_S_row * scale_log2 - row_max_cur_scaled)
+
+                 acc_S_row_sum = utils.fadd_reduce(acc_S_row_exp, init_val=None, arch=arch)
+                 row_scale[r] = 1.0
+             else:
+                 row_max_cur_scaled = row_max_cur * scale_log2
+                 acc_S_row_exp = utils.exp2f(acc_S_row * scale_log2 - row_max_cur_scaled)
+                 # row_scale[r] = utils.exp2f(row_max_prev * self.scale_log2 - row_max_cur_scaled)
+                 row_scale[r] = utils.exp2f((row_max_prev - row_max_cur) * scale_log2)
+
+                 acc_S_row_sum = utils.fadd_reduce(
+                     acc_S_row_exp, init_val=row_sum[r] * row_scale[r], arch=arch
+                 )
+
+             row_sum[r] = acc_S_row_sum
+             acc_S_mn[r, None].store(acc_S_row_exp)
+
+         return row_scale
+
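+     # Note on the recurrence implemented by online_softmax above: assuming callers pass
+     # scale_log2 = softmax_scale * log2(e) (which the logsumexp math in finalize implies),
+     # exp2((s - m) * scale_log2) == exp(softmax_scale * (s - m)), and after each block
+     #     new_sum = old_sum * exp2((old_max - new_max) * scale_log2)
+     #               + sum_j exp2(s_j * scale_log2 - new_max * scale_log2).
+     # The returned row_scale = exp2((old_max - new_max) * scale_log2) is the factor the
+     # caller applies to the O accumulator (see rescale_O below).
+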
+     @cute.jit
+     def finalize(
+         self, final_scale: Float32 = 1.0, sink_val: Float32 | cute.Tensor | None = None
+     ) -> cute.Tensor:
+         """Finalize the online softmax by computing the scale and logsumexp."""
+         if cutlass.const_expr(sink_val is not None and isinstance(sink_val, cute.Tensor)):
+             assert cute.size(sink_val) == cute.size(self.row_sum)
+         row_sum = self.row_sum
+         row_max = self.row_max
+         scale_log2 = self.scale_log2
+
+         # quad reduction for row_sum as we didn't do it during each iteration of online softmax
+         row_sum.store(utils.warp_reduce(row_sum.load(), operator.add, width=4))
+         row_scale = cute.make_fragment_like(row_max, Float32)
+
+         for r in cutlass.range(cute.size(row_sum), unroll_full=True):
+             if cutlass.const_expr(sink_val is not None):
+                 sink_val_cur = sink_val if not isinstance(sink_val, cute.Tensor) else sink_val[r]
+                 LOG2_E = math.log2(math.e)
+                 row_sum[r] += utils.exp2f(sink_val_cur * LOG2_E - row_max[r] * scale_log2)
+
+             # if row_sum is zero or nan, set acc_O_mn_row to 1.0
+             acc_O_mn_row_is_zero_or_nan = row_sum[r] == 0.0 or row_sum[r] != row_sum[r]
+             row_scale[r] = (
+                 cute.arch.rcp_approx(row_sum[r] if not acc_O_mn_row_is_zero_or_nan else 1.0)
+             ) * final_scale
+             row_sum_cur = row_sum[r]
+             LN2 = math.log(2.0)
+             row_sum[r] = (
+                 (row_max[r] * scale_log2 + utils.log2f(row_sum_cur)) * LN2
+                 if not acc_O_mn_row_is_zero_or_nan
+                 else -Float32.inf
+             )
+         return row_scale
+
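+     # After finalize: row_scale holds final_scale / row_sum (for normalizing O), while
+     # row_sum is overwritten with the logsumexp in natural-log units,
+     #     lse = row_max * scale_log2 * ln(2) + ln(row_sum),
+     # or -inf for rows whose sum is zero or NaN (e.g. fully masked rows). An optional
+     # attention-sink term exp2(sink_val * log2(e) - row_max * scale_log2) is folded into
+     # row_sum first.
+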
+     @cute.jit
+     def rescale_O(self, acc_O: cute.Tensor, row_scale: cute.Tensor) -> None:
+         """Scale each row of acc_O by the given scale tensor.
+         :param acc_O: input tensor
+         :type acc_O: cute.Tensor
+         :param row_scale: row_scale tensor
+         :type row_scale: cute.Tensor
+         """
+         acc_O_mn = utils.make_acc_tensor_mn_view(acc_O)
+         assert cute.size(row_scale) == cute.size(acc_O_mn, mode=[0])
+         for r in cutlass.range(cute.size(row_scale), unroll_full=True):
+             acc_O_mn[r, None].store(acc_O_mn[r, None].load() * row_scale[r])
+
+
+ @dataclass
+ class SoftmaxSm100(Softmax):
+     rescale_threshold: cutlass.Constexpr[float] = 0.0
+
+     @staticmethod
+     def create(
+         scale_log2: Float32,
+         rescale_threshold: cutlass.Constexpr[float] = 0.0,
+         softmax_scale: Float32 | None = None,
+     ):
+         num_rows = 1
+         arch = 100
+         row_max = cute.make_rmem_tensor(num_rows, Float32)
+         row_sum = cute.make_rmem_tensor(num_rows, Float32)
+         return SoftmaxSm100(
+             scale_log2,
+             num_rows,
+             row_max,
+             row_sum,
+             arch,
+             softmax_scale,
+             rescale_threshold=rescale_threshold,
+         )
+
+     @cute.jit
+     def update_row_max(self, acc_S_row: cute.TensorSSA, is_first: int) -> Tuple[Float32, Float32]:
+         if cutlass.const_expr(is_first):
+             row_max_new = self._compute_row_max(acc_S_row)
+             row_max_safe = row_max_new if row_max_new != -cutlass.Float32.inf else 0.0
+             acc_scale = 0.0
+         else:
+             row_max_old = self.row_max[0]
+             row_max_new = self._compute_row_max(acc_S_row, init_val=row_max_old)
+             row_max_safe = row_max_new if row_max_new != -cutlass.Float32.inf else 0.0
+             acc_scale_ = (row_max_old - row_max_safe) * self.scale_log2
+             acc_scale = utils.exp2f(acc_scale_)
+             if cutlass.const_expr(self.rescale_threshold > 0.0):
+                 if acc_scale_ >= -self.rescale_threshold:
+                     row_max_new = row_max_old
+                     row_max_safe = row_max_old
+                     acc_scale = 1.0
+         self.row_max[0] = row_max_new
+         return row_max_safe, acc_scale
+
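+     # update_row_max implements an optional hysteresis: when rescale_threshold > 0 and
+     # (row_max_old - row_max_new) * scale_log2 >= -rescale_threshold, the old max is kept
+     # and acc_scale is reported as 1.0, so callers can skip rescaling the O accumulator on
+     # iterations where the running max barely changes.
+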
+     def update_row_sum(
+         self, acc_S_row_exp: cute.TensorSSA, row_scale: Float32, is_first: int = False
+     ) -> None:
+         init_val = self.row_sum[0] * row_scale if cutlass.const_expr(not is_first) else None
+         # self.row_sum[0] = self._compute_row_sum(acc_S_row_exp, init_val=self.row_sum[0] * row_scale)
+         self.row_sum[0] = self._compute_row_sum(acc_S_row_exp, init_val=init_val)
+         # tmp = self._compute_row_sum(acc_S_row_exp)
+         # self.row_sum[0] = self.row_sum[0] * row_scale + tmp
+
+     @cute.jit
+     def scale_subtract_rowmax(
+         self,
+         acc_S_row: cute.Tensor,
+         row_max: Float32,
+     ):
+         assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
+         row_max_scaled = row_max * self.scale_log2
+         for i in cutlass.range(0, cute.size(acc_S_row.shape), 2, unroll_full=True):
+             acc_S_row[i], acc_S_row[i + 1] = utils.fma_packed_f32x2(
+                 (acc_S_row[i], acc_S_row[i + 1]),
+                 (self.scale_log2, self.scale_log2),
+                 (-row_max_scaled, -row_max_scaled),
+             )
+
+     @cute.jit
+     def apply_exp2_convert(
+         self,
+         acc_S_row: cute.Tensor,
+         acc_S_row_converted: cute.Tensor,
+         e2e: cutlass.Constexpr[bool] = False,
+         e2e_freq: cutlass.Constexpr[int] = 16,
+         e2e_res: cutlass.Constexpr[int] = 4,
+         e2e_frg_limit: cutlass.Constexpr[int] = 1,
+     ):
+         assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
+         frg_tile = 32
+         assert frg_tile % 2 == 0
+         frg_cnt = cute.size(acc_S_row) // frg_tile
+         assert cute.size(acc_S_row) % frg_tile == 0
+         acc_S_row_frg = cute.logical_divide(acc_S_row, cute.make_layout(frg_tile))
+         acc_S_row_converted_frg = cute.logical_divide(
+             acc_S_row_converted, cute.make_layout(frg_tile)
+         )
+         for j in cutlass.range_constexpr(frg_cnt):
+             for k in cutlass.range_constexpr(0, cute.size(acc_S_row_frg, mode=[0]), 2):
+                 # acc_S_row_frg[k, j] = utils.exp2f(acc_S_row_frg[k, j])
+                 # acc_S_row_frg[k + 1, j] = utils.exp2f(acc_S_row_frg[k + 1, j])
+                 if cutlass.const_expr(not e2e):
+                     acc_S_row_frg[k, j] = cute.arch.exp2(acc_S_row_frg[k, j])
+                     acc_S_row_frg[k + 1, j] = cute.arch.exp2(acc_S_row_frg[k + 1, j])
+                 else:
+                     if cutlass.const_expr(
+                         k % e2e_freq < e2e_freq - e2e_res or j >= frg_cnt - e2e_frg_limit
+                     ):
+                         acc_S_row_frg[k, j] = cute.arch.exp2(acc_S_row_frg[k, j])
+                         acc_S_row_frg[k + 1, j] = cute.arch.exp2(acc_S_row_frg[k + 1, j])
+                     else:
+                         # acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = utils.e2e_asm2(acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j])
+                         acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = utils.ex2_emulation_2(
+                             acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j]
+                         )
+             acc_S_row_converted_frg[None, j].store(
+                 acc_S_row_frg[None, j].load().to(acc_S_row_converted.element_type)
+             )
+
+     @cute.jit
+     def scale_apply_exp2_convert(
+         self,
+         acc_S_row: cute.Tensor,
+         row_max: Float32,
+         acc_S_row_converted: cute.Tensor,
+     ):
+         assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
+         minus_row_max_scaled = -row_max * self.scale_log2
+         for i in cutlass.range_constexpr(0, cute.size(acc_S_row.shape), 2):
+             acc_S_row[i], acc_S_row[i + 1] = utils.fma_packed_f32x2(
+                 (acc_S_row[i], acc_S_row[i + 1]),
+                 (self.scale_log2, self.scale_log2),
+                 (minus_row_max_scaled, minus_row_max_scaled),
+             )
+
+         # for i in cutlass.range_constexpr(0, cute.size(acc_S_row.shape), 2):
+         #     acc_S_row[i], acc_S_row[i + 1] = utils.fma_packed_f32x2(
+         #         (acc_S_row[i], acc_S_row[i + 1]),
+         #         (self.scale_log2, self.scale_log2),
+         #         (minus_row_max_scaled, minus_row_max_scaled),
+         #     )
+         #     acc_S_row[i] = cute.arch.exp2(acc_S_row[i])
+         #     acc_S_row[i + 1] = cute.arch.exp2(acc_S_row[i + 1])
+
+         frg_tile = 32
+         assert frg_tile % 2 == 0
+         frg_cnt = cute.size(acc_S_row) // frg_tile
+         assert cute.size(acc_S_row) % frg_tile == 0
+         acc_S_row_frg = cute.logical_divide(acc_S_row, cute.make_layout(frg_tile))
+         acc_S_row_converted_frg = cute.logical_divide(
+             acc_S_row_converted, cute.make_layout(frg_tile)
+         )
+         for j in cutlass.range_constexpr(frg_cnt):
+             for k in cutlass.range_constexpr(0, cute.size(acc_S_row_frg, mode=[0]), 2):
+                 # acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = (
+                 #     utils.fma_packed_f32x2(
+                 #         (acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j]),
+                 #         (self.scale_log2, self.scale_log2),
+                 #         (minus_row_max_scaled, minus_row_max_scaled),
+                 #     )
+                 # )
+                 # acc_S_row_frg[k, j] = utils.exp2f(acc_S_row_frg[k, j])
+                 # acc_S_row_frg[k + 1, j] = utils.exp2f(acc_S_row_frg[k + 1, j])
+                 acc_S_row_frg[k, j] = cute.arch.exp2(acc_S_row_frg[k, j])
+                 acc_S_row_frg[k + 1, j] = cute.arch.exp2(acc_S_row_frg[k + 1, j])
+             acc_S_row_converted_frg[None, j].store(
+                 acc_S_row_frg[None, j].load().to(acc_S_row_converted.element_type)
+             )
+
+
+ @cute.jit
+ def floor_if_packed(
+     q_idx,
+     qhead_per_kvhead: cutlass.Constexpr[int],
+ ) -> cute.Tensor:
+     """Convert a packed q_idx to the logical query row index for Pack-GQA."""
+     if cutlass.const_expr(qhead_per_kvhead == 1):
+         return q_idx
+     return q_idx // qhead_per_kvhead
+
+
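+ # With Pack-GQA, qhead_per_kvhead query heads are packed along the row dimension, so a
+ # packed row index is q_row * qhead_per_kvhead + head_offset; floor_if_packed recovers
+ # q_row, and apply_score_mod_inner below recovers head_offset to rebuild the logical
+ # query-head index.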
+ @cute.jit
+ def apply_score_mod_inner(
+     score_tensor,
+     index_tensor,
+     score_mod: cutlass.Constexpr,
+     batch_idx,
+     head_idx,
+     softmax_scale,
+     vec_size: cutlass.Constexpr,
+     qk_acc_dtype: cutlass.Constexpr,
+     aux_tensors,
+     fastdiv_mods,
+     seqlen_info: SeqlenInfoQK,
+     constant_q_idx: cutlass.Constexpr,
+     qhead_per_kvhead: cutlass.Constexpr[int] = 1,
+     transpose_indices: cutlass.Constexpr[bool] = False,
+ ):
+     """Shared implementation for applying score modification.
+
+     Args:
+         score_tensor: The scores to modify (acc_S for flash_fwd, tSrS_t2r for sm100)
+         index_tensor: Index positions (tScS for flash_fwd, tScS_t2r for sm100)
+         score_mod: The score modification function to apply
+         batch_idx: Batch index
+         head_idx: Head index
+         softmax_scale: Scale to apply
+         vec_size: Vector size for processing elements
+         qk_acc_dtype: Data type for accumulator
+         aux_tensors: Optional aux_tensors for FlexAttention
+         fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
+         seqlen_info: Sequence length info
+         constant_q_idx: If provided, use this constant for all q_idx values.
+             If None, compute q_idx per-element
+         qhead_per_kvhead: Pack-GQA replication factor. Divide q_idx by this
+             when greater than 1 so score mods see logical query indices.
+         transpose_indices: If True, swap q_idx/kv_idx in index_tensor (for bwd kernel where S is transposed)
+     """
+     # Index positions in the index_tensor tuple
+     # Forward: index_tensor[...][0] = q_idx, index_tensor[...][1] = kv_idx
+     # Backward (transposed): index_tensor[...][0] = kv_idx, index_tensor[...][1] = q_idx
+     if cutlass.const_expr(transpose_indices):
+         q_idx_pos = cutlass.const_expr(1)
+         kv_idx_pos = cutlass.const_expr(0)
+     else:
+         q_idx_pos = cutlass.const_expr(0)
+         kv_idx_pos = cutlass.const_expr(1)
+
+     n_vals = cutlass.const_expr(cute.size(score_tensor.shape))
+     score_vec = cute.make_rmem_tensor(vec_size, qk_acc_dtype)
+     kv_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)
+
+     # SSA values for batch (constant across all elements)
+     batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32).broadcast_to((vec_size,))
+
+     # Handle q_idx based on whether it's constant
+     q_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)
+
+     # For Pack-GQA with non-constant q_idx, we need per-element head indices
+     # since a thread may process multiple query head indices
+     if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+         head_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)
+
+     for i in cutlass.range(0, n_vals, vec_size, unroll_full=True):
+         for j in cutlass.range(vec_size, unroll_full=True):
+             score_vec[j] = score_tensor[i + j] * softmax_scale
+
+             # Extract head offset from packed q_idx for Pack-GQA
+             if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+                 q_idx_packed = index_tensor[i + j][q_idx_pos]
+                 # Building up the logical q_head idx: final_q_head = kv_head * qhead_per_kvhead + (q_physical % qhead_per_kvhead)
+                 q_idx_logical = q_idx_packed // qhead_per_kvhead
+                 head_offset = q_idx_packed - q_idx_logical * qhead_per_kvhead
+                 head_idx_vec[j] = head_idx * qhead_per_kvhead + head_offset
+
+             # If we will load from aux tensors, wrap the indices with mod so we do not read OOB
+             if cutlass.const_expr(aux_tensors is not None and fastdiv_mods is not None):
+                 if cutlass.const_expr(constant_q_idx is None):
+                     seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
+                     q_idx_floored = floor_if_packed(
+                         index_tensor[i + j][q_idx_pos], qhead_per_kvhead
+                     )
+                     _, q_idx_wrapped = divmod(q_idx_floored, seqlen_q_divmod)
+                     q_idx_vec[j] = q_idx_wrapped
+                 else:
+                     _, seqlen_k_divmod = fastdiv_mods
+
+                 _, kv_idx_wrapped = divmod(index_tensor[i + j][kv_idx_pos], seqlen_k_divmod)
+                 kv_idx_vec[j] = kv_idx_wrapped
+             else:
+                 # No bounds checking - direct indexing
+                 if constant_q_idx is None:
+                     q_idx_vec[j] = floor_if_packed(index_tensor[i + j][q_idx_pos], qhead_per_kvhead)
+                 kv_idx_vec[j] = index_tensor[i + j][kv_idx_pos]
+
+         # Convert to SSA for score_mod call
+         score_ssa = score_vec.load()
+         kv_idx_ssa = kv_idx_vec.load()
+         if cutlass.const_expr(constant_q_idx is None):
+             q_idx_ssa = q_idx_vec.load()
+         else:
+             # NB we do not apply Pack-GQA division here, as constant_q_idx is assumed to already be logical
+             q_idx_const = constant_q_idx
+             q_idx_ssa = utils.scalar_to_ssa(q_idx_const, cutlass.Int32).broadcast_to((vec_size,))
+
+         # Compute head_idx_ssa: per-element for Pack-GQA with non-constant q_idx, constant otherwise
+         if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+             head_idx_ssa = head_idx_vec.load()
+         else:
+             head_idx_ssa = utils.scalar_to_ssa(head_idx, cutlass.Int32).broadcast_to((vec_size,))
+
+         aux_args = []
+         if cutlass.const_expr(aux_tensors is not None):
+             aux_args = aux_tensors
+
+         post_mod_scores = score_mod(
+             score_ssa,
+             batch_idx_ssa,
+             head_idx_ssa,
+             q_idx=q_idx_ssa,
+             kv_idx=kv_idx_ssa,
+             seqlen_info=seqlen_info,
+             aux_tensors=aux_args,
+         )
+
+         # Write back modified scores
+         score_vec.store(post_mod_scores)
+         for j in cutlass.range(vec_size, unroll_full=True):
+             score_tensor[i + j] = score_vec[j]
+
+
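+ # For reference, a score_mod callable compatible with the call above receives SSA vectors
+ # and returns the modified scores. A minimal, hypothetical example (illustrative only; the
+ # exact contract is defined by the flash_fwd/flash_bwd callers):
+ #
+ #     def rel_bias_score_mod(scores, batch_idx, head_idx, q_idx, kv_idx, seqlen_info, aux_tensors):
+ #         # add a relative-position bias of (kv_idx - q_idx) to each score
+ #         return scores + (kv_idx - q_idx).to(cutlass.Float32)
+
+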
+ @cute.jit
+ def apply_score_mod_bwd_inner(
+     grad_tensor,
+     score_tensor,
+     index_tensor,
+     score_mod_bwd: cutlass.Constexpr,
+     batch_idx,
+     head_idx,
+     softmax_scale,
+     vec_size: cutlass.Constexpr,
+     qk_acc_dtype: cutlass.Constexpr,
+     aux_tensors,
+     fastdiv_mods,
+     seqlen_info,
+     constant_q_idx: cutlass.Constexpr,
+     qhead_per_kvhead: cutlass.Constexpr[int] = 1,
+     transpose_indices: cutlass.Constexpr[bool] = False,
+ ):
+     """Apply backward score modification (joint graph).
+
+     Args:
+         grad_tensor: in/out: dlogits rewritten in-place with d(scaled_scores)
+         score_tensor: pre-mod scores (unscaled QK tile), scaled by softmax_scale internally
+         index_tensor: Index positions (same as forward)
+         score_mod_bwd: The backward score modification function (joint graph)
+         batch_idx: Batch index
+         head_idx: Head index
+         softmax_scale: Scale to apply to score_tensor
+         vec_size: Vector size for processing elements
+         qk_acc_dtype: Data type for accumulator
+         aux_tensors: Optional aux_tensors for FlexAttention
+         fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
+         seqlen_info: Sequence length info
+         constant_q_idx: If provided, use this constant for all q_idx values
+         qhead_per_kvhead: Pack-GQA replication factor
+         transpose_indices: If True, swap q_idx/kv_idx in index_tensor
+     """
+     # Index positions in the index_tensor tuple
+     # Forward: index_tensor[...][0] = q_idx, index_tensor[...][1] = kv_idx
+     # Backward (transposed): index_tensor[...][0] = kv_idx, index_tensor[...][1] = q_idx
+     if cutlass.const_expr(transpose_indices):
+         q_idx_pos = cutlass.const_expr(1)
+         kv_idx_pos = cutlass.const_expr(0)
+     else:
+         q_idx_pos = cutlass.const_expr(0)
+         kv_idx_pos = cutlass.const_expr(1)
+     n_vals = cutlass.const_expr(cute.size(grad_tensor.shape))
+     grad_vec = cute.make_fragment(vec_size, qk_acc_dtype)
+     score_vec = cute.make_fragment(vec_size, qk_acc_dtype)
+     kv_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)
+     batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32).broadcast_to((vec_size,))
+     q_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)
+
+     # For Pack-GQA with non-constant q_idx, we need per-element head indices
+     if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+         head_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)
+
+     for i in cutlass.range(0, n_vals, vec_size, unroll_full=True):
+         for j in cutlass.range(vec_size, unroll_full=True):
+             grad_vec[j] = grad_tensor[i + j]
+             # Scale score so joint graph sees same value as forward score_mod
+             score_vec[j] = score_tensor[i + j] * softmax_scale
+
+             if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+                 q_idx_packed = index_tensor[i + j][q_idx_pos]
+                 q_idx_logical = q_idx_packed // qhead_per_kvhead
+                 head_offset = q_idx_packed - q_idx_logical * qhead_per_kvhead
+                 head_idx_vec[j] = head_idx * qhead_per_kvhead + head_offset
+
+             if cutlass.const_expr(aux_tensors is not None and fastdiv_mods is not None):
+                 if cutlass.const_expr(constant_q_idx is None):
+                     seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
+                     q_idx_floored = floor_if_packed(
+                         index_tensor[i + j][q_idx_pos], qhead_per_kvhead
+                     )
+                     _, q_idx_wrapped = divmod(q_idx_floored, seqlen_q_divmod)
+                     q_idx_vec[j] = q_idx_wrapped
+                 else:
+                     _, seqlen_k_divmod = fastdiv_mods
+
+                 _, kv_idx_wrapped = divmod(index_tensor[i + j][kv_idx_pos], seqlen_k_divmod)
+                 kv_idx_vec[j] = kv_idx_wrapped
+             else:
+                 # No bounds checking - direct indexing
+                 if constant_q_idx is None:
+                     q_idx_vec[j] = floor_if_packed(index_tensor[i + j][q_idx_pos], qhead_per_kvhead)
+                 kv_idx_vec[j] = index_tensor[i + j][kv_idx_pos]
+
+         grad_ssa = grad_vec.load()
+         score_ssa = score_vec.load()
+         kv_idx_ssa = kv_idx_vec.load()
+
+         if cutlass.const_expr(constant_q_idx is None):
+             q_idx_ssa = q_idx_vec.load()
+         else:
+             q_idx_ssa = utils.scalar_to_ssa(constant_q_idx, cutlass.Int32).broadcast_to((vec_size,))
+
+         if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
+             head_idx_ssa = head_idx_vec.load()
+         else:
+             head_idx_ssa = utils.scalar_to_ssa(head_idx, cutlass.Int32).broadcast_to((vec_size,))
+
+         aux_args = []
+         if cutlass.const_expr(aux_tensors is not None):
+             aux_args = aux_tensors
+
+         grad_out_ssa = score_mod_bwd(
+             grad_ssa,
+             score_ssa,
+             batch_idx_ssa,
+             head_idx_ssa,
+             q_idx=q_idx_ssa,
+             kv_idx=kv_idx_ssa,
+             seqlen_info=seqlen_info,
+             aux_tensors=aux_args,
+         )
+
+         grad_vec.store(grad_out_ssa)
+         for j in cutlass.range(vec_size, unroll_full=True):
+             grad_tensor[i + j] = grad_vec[j]
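
For orientation, the streaming recurrence implemented by Softmax.online_softmax / Softmax.finalize above corresponds to the following minimal NumPy sketch (an illustration only, not part of the wheel; it handles a single query row and uses natural-base exponentials rather than the kernel's exp2-based arithmetic):

import numpy as np

def online_softmax_reference(blocks, softmax_scale=1.0):
    """blocks: iterable of (scores, values) pairs for one query row,
    with scores of shape (n,) and values of shape (n, d)."""
    row_max, row_sum, acc_o = -np.inf, 0.0, None
    for s, v in blocks:
        new_max = max(row_max, float(np.max(s)))
        # row_scale: the factor online_softmax() returns for rescaling the O accumulator
        row_scale = np.exp((row_max - new_max) * softmax_scale)
        p = np.exp((s - new_max) * softmax_scale)
        row_sum = row_sum * row_scale + float(np.sum(p))
        o_block = p @ v
        acc_o = o_block if acc_o is None else acc_o * row_scale + o_block
        row_max = new_max
    # finalize(): normalize O by 1/row_sum and report the logsumexp
    lse = row_max * softmax_scale + np.log(row_sum)
    return acc_o / row_sum, lse

With a single block this reduces to ordinary single-row attention: softmax(softmax_scale * scores) @ values.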