mslk_cuda_nightly-2026.1.19-cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/attention/flash_attn/tile_scheduler.py
@@ -0,0 +1,720 @@
+ # @nolint # fbcode
+ # Copyright (c) 2025, Tri Dao.
+
+ from typing import Optional, Tuple
+ from dataclasses import dataclass, fields
+
+ try:
+     from typing import override
+ except ImportError:  # Python < 3.12
+     from typing_extensions import override
+
+ import cutlass
+ from cutlass._mlir import ir
+ import cutlass.cute as cute
+ from cutlass import Int32, const_expr
+
+ import mslk.attention.flash_attn.utils as utils
+ from mslk.attention.flash_attn.fast_math import clz
+ from cutlass.cute import FastDivmodDivisor
+
+
+ class WorkTileInfo(cutlass.utils.WorkTileInfo):
+     """Altered WorkTileInfo which includes four axes: (block, head, batch, split)"""
+
+     @override
+     def __new_from_mlir_values__(self, values: list[ir.Value]) -> "WorkTileInfo":
+         assert len(values) == 5
+         new_tile_idx = cutlass.new_from_mlir_values(self._tile_idx, values[:-1])
+         new_is_valid_tile = cutlass.new_from_mlir_values(self._is_valid_tile, [values[-1]])
+         return WorkTileInfo(new_tile_idx, new_is_valid_tile)
+
+
+ @dataclass
+ class ParamsBase:
+     def __extract_mlir_values__(self):
+         all_fields = [getattr(self, field.name) for field in fields(self)]
+         non_constexpr_fields = [f for f in all_fields if not isinstance(f, cutlass.Constexpr)]
+         values, self._values_pos = [], []
+         for obj in non_constexpr_fields:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
+         constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, cutlass.Constexpr)}
+         non_constexpr_fields = {
+             n: f for n, f in all_fields.items() if not isinstance(f, cutlass.Constexpr)
+         }
+         for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
+             non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
+             values = values[n_items:]
+         return self.__class__(**non_constexpr_fields, **constexpr_fields)
+
+
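The two hooks above implement a flatten/rebuild protocol for crossing the host/JIT boundary: __extract_mlir_values__ flattens every non-constexpr field into one flat list of MLIR values, recording in _values_pos how many values each field contributed, while __new_from_mlir_values__ consumes the list in the same field order and passes constexpr fields through unchanged. A minimal pure-Python analogue of the same pattern, with toy Flag/flatten/rebuild names that are not part of this package:

    from dataclasses import dataclass, fields

    class Flag(int):
        """Toy stand-in for cutlass.Constexpr: such fields skip flattening."""

    @dataclass
    class Params:
        num_block: int
        tile_shape: tuple  # flattens to two scalars
        persistent: Flag   # "constexpr": carried by the type, not by values

        def flatten(self):
            vals, self._pos = [], []
            for f in fields(self):
                v = getattr(self, f.name)
                if isinstance(v, Flag):
                    continue
                part = list(v) if isinstance(v, tuple) else [v]
                vals += part
                self._pos.append(len(part))
            return vals

        def rebuild(self, vals):
            kwargs, pos = {}, iter(self._pos)
            for f in fields(self):
                v = getattr(self, f.name)
                if isinstance(v, Flag):
                    kwargs[f.name] = v  # constexpr field passes through
                else:
                    n = next(pos)
                    head, vals = vals[:n], vals[n:]
                    kwargs[f.name] = tuple(head) if isinstance(v, tuple) else head[0]
            return type(self)(**kwargs)

    p = Params(num_block=8, tile_shape=(128, 64), persistent=Flag(1))
    assert p.rebuild(p.flatten()) == p  # the round trip preserves every field
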
+ @dataclass
+ class TileSchedulerArguments(ParamsBase):
+     num_block: Int32
+     num_head: Int32
+     num_batch: Int32
+     num_splits: Int32
+     seqlen_k: Int32
+     headdim: Int32
+     headdim_v: Int32
+     total_q: Int32
+     tile_shape_mn: cutlass.Constexpr[Tuple[int, int]]
+     cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
+     mCuSeqlensQ: Optional[cute.Tensor] = None
+     mSeqUsedQ: Optional[cute.Tensor] = None
+     qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1
+     element_size: cutlass.Constexpr[int] = 2
+     is_persistent: cutlass.Constexpr[bool] = False
+     lpt: cutlass.Constexpr[bool] = False
+     is_split_kv: cutlass.Constexpr[bool] = False
+     head_swizzle: cutlass.Constexpr[bool] = False
+
+
+ class SingleTileScheduler:
+     @dataclass
+     class Params(ParamsBase):
+         num_block: Int32
+         num_head: Int32
+         num_batch: Int32
+         num_splits: Int32
+         num_splits_divmod: FastDivmodDivisor
+         is_split_kv: cutlass.Constexpr[bool] = False
+         cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
+
+         @staticmethod
+         def create(
+             args: TileSchedulerArguments, *, loc=None, ip=None
+         ) -> "SingleTileScheduler.Params":
+             return SingleTileScheduler.Params(
+                 args.num_block,
+                 args.num_head,
+                 args.num_batch,
+                 args.num_splits,
+                 FastDivmodDivisor(args.num_splits),
+                 args.is_split_kv,
+                 args.cluster_shape_mn,
+             )
+
+     def __init__(self, params: Params, blk_coord: cute.Coord, *, loc=None, ip=None):
+         self.params = params
+         self._blk_coord = blk_coord
+         self._is_first_block = True
+         self._loc = loc
+         self._ip = ip
+
+     @staticmethod
+     def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
+         return SingleTileScheduler.Params.create(args, loc=loc, ip=ip)
+
+     @staticmethod
+     def create(params: Params, *, loc=None, ip=None) -> "SingleTileScheduler":
+         blk_coord = cute.arch.block_idx()
+         return SingleTileScheduler(params, blk_coord, loc=loc, ip=ip)
+
+     # called by host
+     @staticmethod
+     def get_grid_shape(
+         params: Params,
+         *,
+         loc=None,
+         ip=None,
+     ) -> Tuple[Int32, Int32, Int32]:
+         # TODO: this hard-codes the fact that we only use cluster = (1, 1) or (2, 1)
+         assert params.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
+         return (
+             cute.round_up(params.num_block, params.cluster_shape_mn[0]),
+             params.num_head * params.num_splits,
+             params.num_batch,
+         )
+
+     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+         block_idx, head_idx, batch_idx = self._blk_coord
+         if const_expr(self.params.is_split_kv):
+             head_idx, split_idx = divmod(head_idx, self.params.num_splits_divmod)
+         else:
+             split_idx = Int32(0)
+         return WorkTileInfo(
+             (block_idx, head_idx, batch_idx, split_idx),
+             self._is_first_block,
+         )
+
+     def initial_work_tile_info(self, *, loc=None, ip=None):
+         return self.get_current_work(loc=loc, ip=ip)
+
+     def prefetch_next_work(self, *, loc=None, ip=None):
+         pass
+
+     def advance_to_next_work(self, *, loc=None, ip=None):
+         self._is_first_block = False
+
+     def __extract_mlir_values__(self):
+         values, self._values_pos = [], []
+         for obj in [self.params, self._blk_coord]:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         obj_list = []
+         for obj, n_items in zip([self.params, self._blk_coord], self._values_pos):
+             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
+             values = values[n_items:]
+         return SingleTileScheduler(*(tuple(obj_list)), loc=self._loc)
+
+
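Every scheduler in this file exposes the same device-side protocol: initial_work_tile_info() yields the first WorkTileInfo, the kernel processes the (block, head, batch, split) coordinate while the tile is valid, then calls advance_to_next_work() and get_current_work() again. The consuming loop lives in the attention kernels (presumably flash_fwd.py and friends, not shown in this diff); the sketch below mocks the interface on the CPU, with get_current_work returning a plain (coords, is_valid) pair in place of WorkTileInfo:

    # Hypothetical CPU mock of the scheduler protocol used by the kernels.
    class MockSingleTileScheduler:
        def __init__(self, blk_coord):
            self._blk_coord = blk_coord  # (block, head, batch) from block_idx()
            self._is_first_block = True

        def initial_work_tile_info(self):
            return self.get_current_work()

        def get_current_work(self):
            block, head, batch = self._blk_coord
            return (block, head, batch, 0), self._is_first_block

        def advance_to_next_work(self):
            self._is_first_block = False  # one tile per CTA, so no second tile

    def kernel_body(scheduler):
        tile, is_valid = scheduler.initial_work_tile_info()
        while is_valid:  # persistent schedulers iterate here many times
            print("processing tile", tile)  # the attention math would go here
            scheduler.advance_to_next_work()
            tile, is_valid = scheduler.get_current_work()

    kernel_body(MockSingleTileScheduler(blk_coord=(3, 1, 0)))  # one tile, then done
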
+ class StaticPersistentTileScheduler:
+     @dataclass
+     class Params(ParamsBase):
+         num_block_divmod: FastDivmodDivisor
+         num_head_divmod: FastDivmodDivisor
+         total_blocks: Int32
+
+         @staticmethod
+         def create(
+             args: TileSchedulerArguments, *, loc=None, ip=None
+         ) -> "StaticPersistentTileScheduler.Params":
+             total_blocks = args.num_block * args.num_head * args.num_batch
+             return StaticPersistentTileScheduler.Params(
+                 FastDivmodDivisor(args.num_block), FastDivmodDivisor(args.num_head), total_blocks
+             )
+
+     def __init__(self, params: Params, tile_idx: Int32, *, loc=None, ip=None):
+         self.params = params
+         self._tile_idx = tile_idx
+         self._loc = loc
+         self._ip = ip
+
+     @staticmethod
+     def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
+         return StaticPersistentTileScheduler.Params.create(args, loc=loc, ip=ip)
+
+     @staticmethod
+     def create(params: Params, *, loc=None, ip=None) -> "StaticPersistentTileScheduler":
+         tile_idx = cute.arch.block_idx()[0]
+         return StaticPersistentTileScheduler(params, tile_idx, loc=loc, ip=ip)
+
+     # called by host
+     @staticmethod
+     def get_grid_shape(
+         params: Params,
+         *,
+         loc=None,
+         ip=None,
+     ) -> Tuple[Int32, Int32, Int32]:
+         hardware_info = cutlass.utils.HardwareInfo()
+         sm_count = hardware_info.get_device_multiprocessor_count()
+         return (cutlass.min(sm_count, params.total_blocks), Int32(1), Int32(1))
+
+     # @cute.jit
+     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+         hn_idx, block_idx = divmod(self._tile_idx, self.params.num_block_divmod)
+         batch_idx, head_idx = divmod(hn_idx, self.params.num_head_divmod)
+         is_valid = self._tile_idx < self.params.total_blocks
+         # if cute.arch.thread_idx()[0] == 0:
+         #     cute.printf("TileScheduler: tile_idx=%d, hn_idx=%d, block_idx=%d, batch_idx=%d, head_idx=%d, is_valid=%d", self._tile_idx, hn_idx, block_idx, batch_idx, head_idx, is_valid)
+         return WorkTileInfo(
+             (Int32(block_idx), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid
+         )
+
+     def initial_work_tile_info(self, *, loc=None, ip=None):
+         return self.get_current_work(loc=loc, ip=ip)
+
+     def prefetch_next_work(self, *, loc=None, ip=None):
+         pass
+
+     def advance_to_next_work(self, *, loc=None, ip=None):
+         self._tile_idx += cute.arch.grid_dim()[0]
+
+     def __extract_mlir_values__(self):
+         values, self._values_pos = [], []
+         for obj in [self.params, self._tile_idx]:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         obj_list = []
+         for obj, n_items in zip(
+             [self.params, self._tile_idx],
+             self._values_pos,
+         ):
+             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
+             values = values[n_items:]
+         return StaticPersistentTileScheduler(*(tuple(obj_list)), loc=self._loc)
+
+
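The persistent variant launches at most one CTA per SM and has each CTA stride the flat tile index by the grid size; the two FastDivmodDivisor divisions unpack tile_idx = (batch * num_head + head) * num_block + block. A plain-Python check of that decomposition, with illustrative sizes:

    num_block, num_head, num_batch, grid = 7, 3, 2, 5  # illustrative sizes
    total = num_block * num_head * num_batch

    seen = set()
    for start in range(grid):        # one persistent CTA per grid slot
        tile = start
        while tile < total:          # is_valid: tile_idx < total_blocks
            hn, block = divmod(tile, num_block)   # num_block_divmod
            batch, head = divmod(hn, num_head)    # num_head_divmod
            seen.add((block, head, batch))
            tile += grid             # advance_to_next_work: += grid_dim()[0]

    assert len(seen) == total  # every (block, head, batch) visited exactly once
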
+ class SingleTileLPTScheduler:
+     @dataclass
+     class Params(ParamsBase):
+         total_blocks: Int32
+         num_splits: Int32
+         num_block: Int32
+         l2_minor: Int32
+         num_block_divmod: FastDivmodDivisor
+         num_head_divmod: FastDivmodDivisor
+         l2_minor_divmod: FastDivmodDivisor
+         l2_major_divmod: FastDivmodDivisor
+         l2_minor_residual_divmod: FastDivmodDivisor
+         num_hb_quotient: Int32
+         is_split_kv: cutlass.Constexpr[bool] = False
+
+         @staticmethod
+         @cute.jit
+         def create(
+             args: TileSchedulerArguments, *, loc=None, ip=None
+         ) -> "SingleTileLPTScheduler.Params":
+             # cute.printf(args.num_block, args.num_head, args.num_batch, args.seqlen_k, args.headdim, args.headdim_v, args.total_q, args.tile_shape_mn, args.qhead_per_kvhead_packgqa, args.element_size)
+             size_one_kv_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
+             size_one_head = size_one_kv_head
+             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
+             # Swizzle is the size of each "section". Round swizzle to a power of 2
+             # Need to be careful about the case where only one head will fit
+             # swizzle is how many heads can fit in L2
+             # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
+             # Seems faster if swizzle is a power of 2
+             log2_floor = lambda n: 31 - clz(n)
+             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
+             # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
+             # If we're in the last section (called residual), we don't want to divide by
+             # swizzle. Instead we want to divide by the remainder.
+             num_hb_quotient = (args.num_head * args.num_batch) // swizzle
+             num_hb_remainder = (args.num_head * args.num_batch) % swizzle
+             return SingleTileLPTScheduler.Params(
+                 total_blocks=args.num_block * args.num_head * args.num_batch,
+                 num_block=args.num_block,
+                 l2_minor=Int32(swizzle),
+                 num_block_divmod=FastDivmodDivisor(args.num_block),
+                 num_head_divmod=FastDivmodDivisor(args.num_head),
+                 l2_minor_divmod=FastDivmodDivisor(swizzle),
+                 l2_major_divmod=FastDivmodDivisor(swizzle * args.num_block),
+                 l2_minor_residual_divmod=FastDivmodDivisor(
+                     max(num_hb_remainder, 1)
+                 ),  # don't divide by 0
+                 num_hb_quotient=Int32(num_hb_quotient),
+                 num_splits=args.num_splits,
+                 is_split_kv=args.is_split_kv,
+             )
+
+     def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
+         self.params = params
+         self._tile_idx = tile_idx
+         self._split_idx = split_idx
+         self._loc = loc
+         self._ip = ip
+
+     @staticmethod
+     def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
+         return SingleTileLPTScheduler.Params.create(args, loc=loc, ip=ip)
+
+     @staticmethod
+     @cute.jit
+     def create(params: Params, *, loc=None, ip=None) -> "SingleTileLPTScheduler":
+         tile_idx, split_idx, _ = cute.arch.block_idx()
+         return SingleTileLPTScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
+
+     # called by host
+     @staticmethod
+     def get_grid_shape(
+         params: Params,
+         *,
+         loc=None,
+         ip=None,
+     ) -> Tuple[Int32, Int32, Int32]:
+         return (params.total_blocks, params.num_splits, Int32(1))
+
+     @cute.jit
+     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+         params = self.params
+         # Implement LPT scheduling coordinate calculation
+         bidhb, l2_mod = divmod(self._tile_idx, params.l2_major_divmod)
+         # If we're in the last section (called residual), we don't want to divide by
+         # swizzle. Instead we want to divide by the remainder.
+         block, bidhb_residual = 0, 0
+         if bidhb < params.num_hb_quotient:
+             block, bidhb_residual = divmod(l2_mod, params.l2_minor_divmod)
+         else:
+             block, bidhb_residual = divmod(l2_mod, params.l2_minor_residual_divmod)
+         bidhb_actual = bidhb * params.l2_minor + bidhb_residual
+         batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
+         # Longest-processing-time-first
+         block = params.num_block - 1 - block
+         is_valid = self._tile_idx < params.total_blocks
+         return WorkTileInfo(
+             (Int32(block), Int32(head_idx), Int32(batch_idx), Int32(self._split_idx)), is_valid
+         )
+
+     def initial_work_tile_info(self, *, loc=None, ip=None):
+         return self.get_current_work(loc=loc, ip=ip)
+
+     def prefetch_next_work(self, *, loc=None, ip=None):
+         pass
+
+     def advance_to_next_work(self, *, loc=None, ip=None):
+         # Single tile scheduler - set to invalid tile_idx to indicate no more work
+         self._tile_idx = self.params.total_blocks
+
+     def __extract_mlir_values__(self):
+         values, self._values_pos = [], []
+         for obj in [self.params, self._tile_idx, self._split_idx]:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         obj_list = []
+         for obj, n_items in zip([self.params, self._tile_idx, self._split_idx], self._values_pos):
+             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
+             values = values[n_items:]
+         return self.__class__(*(tuple(obj_list)), loc=self._loc)
+
+
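The swizzle computed above is the number of (head, batch) pairs whose K and V working set fits in the assumed 50 MB L2 budget, rounded down to a power of two (31 - clz(n) is floor(log2 n)). Worked numbers under assumed shapes (seqlen_k=8192, headdim=headdim_v=128, fp16):

    # Mirrors the swizzle arithmetic in SingleTileLPTScheduler.Params.create;
    # int.bit_length() - 1 plays the role of 31 - clz(n).
    seqlen_k, headdim, headdim_v, element_size = 8192, 128, 128, 2  # assumed shapes

    size_one_head = seqlen_k * (headdim + headdim_v) * element_size  # 4 MiB of K+V
    size_l2 = 50 * 1024 * 1024

    log2_floor = lambda n: n.bit_length() - 1
    swizzle = 1 if size_l2 < size_one_head else 1 << log2_floor(size_l2 // size_one_head)
    assert swizzle == 8  # 12 heads would fit; rounded down to a power of two
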
+ class SingleTileLPTBwdScheduler:
+     @dataclass
+     class Params(ParamsBase):
+         total_blocks: Int32
+         num_block: Int32
+         l2_minor: Int32
+         num_head_divmod: FastDivmodDivisor
+         l2_minor_divmod: FastDivmodDivisor
+         l2_major_divmod: FastDivmodDivisor
+         l2_minor_residual_divmod: FastDivmodDivisor
+         num_hb_quotient: Int32
+         cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
+         spt: cutlass.Constexpr[bool] = True
+
+         @staticmethod
+         @cute.jit
+         def create(
+             args: TileSchedulerArguments, *, loc=None, ip=None
+         ) -> "SingleTileLPTBwdScheduler.Params":
+             size_l2 = 50 * 1024 * 1024
+             size_one_qdo_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
+             # size_one_dqaccum_head = args.seqlen_k * (args.headdim) * 4
+             size_one_dqaccum_head = 0
+             size_one_head = size_one_qdo_head + size_one_dqaccum_head
+             log2_floor = lambda n: 31 - clz(n)
+             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
+             # swizzle = 8
+             # If we're in the last section (called residual), we don't want to divide by
+             # swizzle. Instead we want to divide by the remainder.
+             num_hb_quotient = (args.num_head * args.num_batch) // swizzle
+             num_hb_remainder = (args.num_head * args.num_batch) % swizzle
+             num_block = cute.ceil_div(args.num_block, args.cluster_shape_mn[0])
+             return SingleTileLPTBwdScheduler.Params(
+                 total_blocks=(num_block * args.cluster_shape_mn[0])
+                 * args.num_head
+                 * args.num_batch,
+                 num_block=num_block,
+                 l2_minor=Int32(swizzle),
+                 num_head_divmod=FastDivmodDivisor(args.num_head),
+                 l2_minor_divmod=FastDivmodDivisor(swizzle),
+                 l2_major_divmod=FastDivmodDivisor(swizzle * num_block),
+                 l2_minor_residual_divmod=FastDivmodDivisor(
+                     max(num_hb_remainder, 1)
+                 ),  # don't divide by 0
+                 num_hb_quotient=Int32(num_hb_quotient),
+                 cluster_shape_mn=args.cluster_shape_mn,
+                 spt=args.lpt,
+             )
+
+     def __init__(self, params: Params, tile_idx: Int32, *, loc=None, ip=None):
+         self.params = params
+         self._tile_idx = tile_idx
+         self._loc = loc
+         self._ip = ip
+
+     @staticmethod
+     def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
+         return SingleTileLPTBwdScheduler.Params.create(args, loc=loc, ip=ip)
+
+     @staticmethod
+     @cute.jit
+     def create(params: Params, *, loc=None, ip=None) -> "SingleTileLPTBwdScheduler":
+         tile_idx = cute.arch.block_idx()[0]
+         return SingleTileLPTBwdScheduler(params, tile_idx, loc=loc, ip=ip)
+
+     # called by host
+     @staticmethod
+     def get_grid_shape(
+         params: Params,
+         *,
+         loc=None,
+         ip=None,
+     ) -> Tuple[Int32, Int32, Int32]:
+         return (params.total_blocks, Int32(1), Int32(1))
+
+     @cute.jit
+     def get_current_work(self, *, loc=None, ip=None) -> cutlass.utils.WorkTileInfo:
+         cluster_idx = self._tile_idx // self.params.cluster_shape_mn[0]
+         params = self.params
+         # Implement LPT scheduling coordinate calculation
+         bidhb, l2_mod = divmod(cluster_idx, params.l2_major_divmod)
+         # If we're in the last section (called residual), we don't want to divide by
+         # swizzle. Instead we want to divide by the remainder.
+         block, bidhb_residual = 0, 0
+         if bidhb < params.num_hb_quotient:
+             block, bidhb_residual = divmod(l2_mod, params.l2_minor_divmod)
+         else:
+             block, bidhb_residual = divmod(l2_mod, params.l2_minor_residual_divmod)
+         bidhb_actual = bidhb * params.l2_minor + bidhb_residual
+         batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
+         is_valid = self._tile_idx < params.total_blocks
+         bidx_in_cluster = cute.arch.block_in_cluster_idx()
+         block = block * params.cluster_shape_mn[0] + bidx_in_cluster[0]
+         if cutlass.const_expr(params.spt):
+             block = params.num_block - 1 - block
+         return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid)
+
+     def initial_work_tile_info(self, *, loc=None, ip=None):
+         return self.get_current_work(loc=loc, ip=ip)
+
+     def prefetch_next_work(self, *, loc=None, ip=None):
+         pass
+
+     def advance_to_next_work(self, *, loc=None, ip=None):
+         # Single tile scheduler - set to invalid tile_idx to indicate no more work
+         self._tile_idx = self.params.total_blocks
+
+     def __extract_mlir_values__(self):
+         values, self._values_pos = [], []
+         for obj in [self.params, self._tile_idx]:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         obj_list = []
+         for obj, n_items in zip([self.params, self._tile_idx], self._values_pos):
+             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
+             values = values[n_items:]
+         return self.__class__(*(tuple(obj_list)), loc=self._loc)
+
+
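The backward variant differs mainly in that it rounds the block count up to the cluster width, so each cluster owns cluster_shape_mn[0] consecutive blocks and the padded tail must be masked inside the kernel. A small illustration of the rounding and per-CTA block recovery, with arbitrary values:

    # ceil_div rounding as in SingleTileLPTBwdScheduler.Params.create, then the
    # per-CTA block recovery from get_current_work, for a cluster width of 2.
    def ceil_div(a, b):
        return -(-a // b)

    num_block, cluster_m = 9, 2  # K/V blocks and cluster width
    num_cluster_tiles = ceil_div(num_block, cluster_m)  # 5 cluster-wide tiles
    grid = num_cluster_tiles * cluster_m                # grid padded to 10 CTAs

    covered = set()
    for tile_idx in range(grid):
        cluster_idx = tile_idx // cluster_m  # which cluster-wide tile
        lane = tile_idx % cluster_m          # block_in_cluster_idx()[0]
        covered.add(cluster_idx * cluster_m + lane)

    # Blocks 0..9 are covered, but block 9 exceeds num_block - 1 == 8 and must
    # be masked out inside the kernel; the padding keeps clusters aligned.
    assert covered == set(range(grid))
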
+ class SingleTileVarlenScheduler:
+     @dataclass
+     class Params(ParamsBase):
+         num_head: Int32
+         num_batch: Int32
+         total_q: Int32
+         num_splits: Int32
+         max_kvblock_in_l2: Int32
+         tile_shape_mn: cutlass.Constexpr[Tuple[int, int]]
+         mCuSeqlensQ: Optional[cute.Tensor] = None
+         mSeqUsedQ: Optional[cute.Tensor] = None
+         qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1
+         lpt: cutlass.Constexpr[bool] = False
+         is_split_kv: cutlass.Constexpr[bool] = False
+         head_swizzle: cutlass.Constexpr[bool] = False
+
+         @staticmethod
+         @cute.jit
+         def create(
+             args: TileSchedulerArguments, *, loc=None, ip=None
+         ) -> "SingleTileVarlenScheduler.Params":
+             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
+             max_kvblock_in_l2 = size_l2 // (
+                 (args.headdim + args.headdim_v) * args.element_size * args.tile_shape_mn[1]
+             )
+             assert args.mCuSeqlensQ is not None or args.mSeqUsedQ is not None, (
+                 "At least one of mCuSeqlensQ or mSeqUsedQ must be provided"
+             )
+             return SingleTileVarlenScheduler.Params(
+                 num_head=args.num_head,
+                 num_batch=args.num_batch,
+                 total_q=args.total_q,
+                 num_splits=args.num_splits,
+                 max_kvblock_in_l2=max_kvblock_in_l2,
+                 tile_shape_mn=args.tile_shape_mn,
+                 mCuSeqlensQ=args.mCuSeqlensQ,
+                 mSeqUsedQ=args.mSeqUsedQ,
+                 qhead_per_kvhead_packgqa=args.qhead_per_kvhead_packgqa,
+                 lpt=args.lpt,
+                 is_split_kv=args.is_split_kv,
+                 head_swizzle=args.head_swizzle,
+             )
+
+     def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
+         self.params = params
+         self._tile_idx = tile_idx
+         self._split_idx = split_idx
+         self._is_first_block = True
+         self._loc = loc
+         self._ip = ip
+
+     @staticmethod
+     def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
+         return SingleTileVarlenScheduler.Params.create(args, loc=loc, ip=ip)
+
+     @staticmethod
+     def create(params: Params, *, loc=None, ip=None) -> "SingleTileVarlenScheduler":
+         tile_idx, split_idx, _ = cute.arch.block_idx()
+         return SingleTileVarlenScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
+
+     # called by host
+     @staticmethod
+     def get_grid_shape(
+         params: Params,
+         *,
+         loc=None,
+         ip=None,
+     ) -> Tuple[Int32, Int32, Int32]:
+         total_blocks_max = (
+             params.total_q + params.num_batch * (params.tile_shape_mn[0] - 1)
+         ) // params.tile_shape_mn[0]
+         return (total_blocks_max * params.num_head, params.num_splits, Int32(1))
+
+     @cute.jit
+     def _get_num_m_blocks(self, lane: Int32, bidb_start: Int32) -> Int32:
+         params = self.params
+         batch_idx = lane + bidb_start
+         if cutlass.const_expr(params.mSeqUsedQ is not None):
+             seqlen = Int32(0)
+             if batch_idx < params.num_batch:
+                 seqlen = params.mSeqUsedQ[batch_idx]
+         else:
+             assert params.mCuSeqlensQ is not None
+             cur_cu_seqlen = Int32(0)
+             if batch_idx <= params.num_batch:
+                 cur_cu_seqlen = params.mCuSeqlensQ[batch_idx]
+             next_cu_seqlen = cute.arch.shuffle_sync_down(cur_cu_seqlen, offset=1)
+             seqlen = next_cu_seqlen - cur_cu_seqlen
+         if cutlass.const_expr(params.qhead_per_kvhead_packgqa > 1):
+             seqlen *= params.qhead_per_kvhead_packgqa
+         return (
+             cute.ceil_div(seqlen, params.tile_shape_mn[0])
+             if batch_idx < params.num_batch and lane < cute.arch.WARP_SIZE - 1
+             else Int32(0)
+         )
+
+     @cute.jit
+     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+         params = self.params
+         lane_idx = cute.arch.lane_idx()
+         num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=0)
+         num_m_blocks_cumulative = utils.warp_prefix_sum(num_m_blocks, lane_idx)
+         # Total number of blocks for the next 31 batches
+         m_blocks_in_group = cute.arch.shuffle_sync(num_m_blocks_cumulative, cute.arch.WARP_SIZE - 1)
+         # Same for all lanes
+         group_end_tile = m_blocks_in_group * params.num_head
+         # if cute.arch.thread_idx()[0] == 128 + 31: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, group_end_tile = %d, num_m_blocks=%d, num_m_blocks_cumulative = %d, m_blocks_in_group = %d", self._tile_idx, group_end_tile, num_m_blocks, num_m_blocks_cumulative, m_blocks_in_group)
+         block, head_idx, batch_idx = Int32(0), Int32(0), Int32(0)
+         next_tile_idx = self._tile_idx
+         while group_end_tile <= next_tile_idx:
+             batch_idx += cute.arch.WARP_SIZE - 1
+             if batch_idx >= params.num_batch:
+                 batch_idx = Int32(params.num_batch)
+                 group_end_tile = next_tile_idx + 1
+             else:
+                 num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=batch_idx)
+                 num_m_blocks_cumulative = utils.warp_prefix_sum(num_m_blocks, lane_idx)
+                 m_blocks_in_group = cute.arch.shuffle_sync(
+                     num_m_blocks_cumulative, cute.arch.WARP_SIZE - 1
+                 )
+                 group_end_tile += m_blocks_in_group * params.num_head
+         is_valid = False
+         if batch_idx >= params.num_batch:
+             block, head_idx, batch_idx = Int32(0), Int32(0), Int32(params.num_batch)
+         else:
+             group_start_tile = group_end_tile - m_blocks_in_group * params.num_head
+             # if cute.arch.thread_idx()[0] == 128 + 31: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, group_end_tile = %d, num_m_blocks=%d, batch_idx = %d", self._tile_idx, group_end_tile, num_m_blocks, batch_idx)
+             # The next problem to process is the first one whose ending tile position
+             # is greater than the tile index.
+             batch_idx_in_group = cute.arch.popc(
+                 cute.arch.vote_ballot_sync(
+                     group_start_tile + num_m_blocks_cumulative * params.num_head <= next_tile_idx
+                 )
+             )
+             batch_idx += batch_idx_in_group
+             num_m_blocks_prev_lane = (
+                 0
+                 if batch_idx_in_group == 0
+                 else cute.arch.shuffle_sync(num_m_blocks_cumulative, batch_idx_in_group - 1)
+             )
+             num_m_blocks = cute.arch.shuffle_sync(num_m_blocks, batch_idx_in_group)
+             mh_block = next_tile_idx - group_start_tile - num_m_blocks_prev_lane * params.num_head
+             if cutlass.const_expr(params.lpt or params.head_swizzle):
+                 # This is a version of the SingleTileLPTScheduler, complicated by the fact that
+                 # the seqlen can vary per batch.
+                 # TODO: is there any case where num_m_blocks is 0?
+                 # TODO: by right we should read the seqlen_kv but we're assuming seqlen_q == seqlen_k here
+                 num_n_blocks = (
+                     num_m_blocks
+                     * params.tile_shape_mn[0]
+                     // params.qhead_per_kvhead_packgqa
+                     // params.tile_shape_mn[1]
+                 )
+                 # nheads_in_l2 = min(max(self.max_kvblock_in_l2 // num_n_blocks, 1), self.num_head)
+                 # Seems faster to have this be a power of 2
+                 nheads_in_l2 = (
+                     16
+                     if num_n_blocks * 16 <= params.max_kvblock_in_l2
+                     else (
+                         8
+                         if num_n_blocks * 8 <= params.max_kvblock_in_l2
+                         else (
+                             4
+                             if num_n_blocks * 4 <= params.max_kvblock_in_l2
+                             else (2 if num_n_blocks * 2 <= params.max_kvblock_in_l2 else 1)
+                         )
+                     )
+                 )
+                 nheads_in_l2 = min(nheads_in_l2, params.num_head)
+                 mh_in_l2 = nheads_in_l2 * num_m_blocks
+                 section_idx = mh_block // mh_in_l2
+                 l2_mod = mh_block - section_idx * mh_in_l2
+                 # Deal with tail section
+                 nheads_in_this_section = (
+                     nheads_in_l2
+                     if nheads_in_l2 * (section_idx + 1) <= params.num_head
+                     else params.num_head - section_idx * nheads_in_l2
+                 )
+                 block = l2_mod // nheads_in_this_section
+                 head_idx_residual = l2_mod - block * nheads_in_this_section
+                 head_idx = section_idx * nheads_in_l2 + head_idx_residual
+                 if cutlass.const_expr(params.lpt):
+                     block = num_m_blocks - 1 - block
+             else:
+                 head_idx = mh_block // num_m_blocks
+                 block = mh_block - head_idx * num_m_blocks
+             is_valid = self._is_first_block and batch_idx < params.num_batch
+             # if cute.arch.thread_idx()[0] == 128: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, batch_idx=%d, head_idx=%d, block=%d, is_valid = %d", self._tile_idx, batch_idx, head_idx, block, is_valid)
+         split_idx = self._split_idx if const_expr(params.is_split_kv) else Int32(0)
+         return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), split_idx), is_valid)
+
+     def initial_work_tile_info(self, *, loc=None, ip=None):
+         return self.get_current_work(loc=loc, ip=ip)
+
+     def prefetch_next_work(self, *, loc=None, ip=None):
+         pass
+
+     def advance_to_next_work(self, *, loc=None, ip=None):
+         # Single tile scheduler - mark the first block as done to indicate no more work
+         self._is_first_block = False
+
+     def __extract_mlir_values__(self):
+         values, self._values_pos = [], []
+         for obj in [self.params, self._tile_idx, self._split_idx]:
+             obj_values = cutlass.extract_mlir_values(obj)
+             values += obj_values
+             self._values_pos.append(len(obj_values))
+         return values
+
+     def __new_from_mlir_values__(self, values):
+         obj_list = []
+         for obj, n_items in zip(
+             [self.params, self._tile_idx, self._split_idx],
+             self._values_pos,
+         ):
+             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
+             values = values[n_items:]
+         return SingleTileVarlenScheduler(*(tuple(obj_list)), loc=self._loc)
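
With variable sequence lengths the flat tile index cannot be unpacked by a fixed divmod, so get_current_work gives each warp lane one batch, prefix-sums the per-batch m-block counts, and locates the owning batch with a ballot/popcount. A serial pure-Python reference of the same mapping for the non-LPT path (the decode_varlen_tile helper is hypothetical, for illustration only):

    from itertools import accumulate

    def decode_varlen_tile(tile_idx, cu_seqlens, num_head, tile_m):
        """Serial stand-in for the warp ballot search in get_current_work
        (lpt=False); returns (block, head, batch) or None if out of range."""
        seqlens = [b - a for a, b in zip(cu_seqlens, cu_seqlens[1:])]
        m_blocks = [-(-s // tile_m) for s in seqlens]  # ceil_div per batch
        ends = list(accumulate(m_blocks))              # utils.warp_prefix_sum
        for batch, end in enumerate(ends):
            if tile_idx < end * num_head:              # first end past the tile
                start = end - m_blocks[batch]
                mh_block = tile_idx - start * num_head
                head, block = divmod(mh_block, m_blocks[batch])
                return block, head, batch
        return None

    cu_seqlens, num_head, tile_m = [0, 300, 300, 1000], 2, 128
    tiles = [decode_varlen_tile(i, cu_seqlens, num_head, tile_m) for i in range(18)]
    # m-blocks per batch: [3, 0, 6] -> 9 * num_head = 18 tiles; the empty
    # batch 1 is skipped naturally because its prefix sum does not advance.
    assert None not in tiles and len(set(tiles)) == 18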