quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
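The largest change in this release is the GEMM rewrite: quack/dense_gemm_sm100.py becomes quack/gemm_sm100.py, and its PersistentDenseGemmKernel class is replaced by GemmSm100(GemmSm90), which drops use_2cta_instrs and cluster_shape_mn in favor of an explicit cluster_shape_mnk and adds a gather_A path for variable-length inputs. The following is a minimal, illustrative sketch of constructing the kernel under the new __init__ signature visible in the diff below; the argument values are examples only, and the compile/launch plumbing (normally handled by quack.gemm_interface) is not shown here.

```python
# Illustrative sketch only: arguments follow the new GemmSm100.__init__ signature
# from this diff; the tile/cluster values are examples, not recommended defaults.
import cutlass
from cutlass import Float32
from quack.gemm_sm100 import GemmSm100

gemm = GemmSm100(
    acc_dtype=Float32,             # accumulator dtype
    a_dtype=cutlass.BFloat16,      # per the diff, currently ignored by __init__
    mma_tiler_mn=(128, 128),       # MMA tile (M, N); K is deferred to _setup_attributes
    cluster_shape_mnk=(2, 1, 1),   # replaces cluster_shape_mn + use_2cta_instrs
    gather_A=False,                # new: cp.async gather path for varlen A
)
# The kernel object is then invoked with the A/B/D/C tensors, epilogue, scheduler,
# and varlen argument structs, and a CUDA stream, per the updated call signature below.
```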
@@ -1,33 +1,8 @@
- # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: BSD-3-Clause
-
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
-
- # 1. Redistributions of source code must retain the above copyright notice, this
- # list of conditions and the following disclaimer.
-
- # 2. Redistributions in binary form must reproduce the above copyright notice,
- # this list of conditions and the following disclaimer in the documentation
- # and/or other materials provided with the distribution.
-
- # 3. Neither the name of the copyright holder nor the names of its
- # contributors may be used to endorse or promote products derived from
- # this software without specific prior written permission.
-
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ # Based on the cute-dsl example:
+ # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py

  import argparse
- from typing import Optional, Type, Tuple, Union, Callable
+ from typing import Optional, Type, Tuple, Union, Callable, Literal
  from functools import partial

  import cuda.bindings.driver as cuda
@@ -40,15 +15,25 @@ import cutlass.torch as cutlass_torch
  import cutlass.pipeline as pipeline
  import cutlass.utils.blackwell_helpers as sm100_utils
  import cutlass.utils.blockscaled_layout as blockscaled_utils
+ from cutlass.cute.nvgpu.warp import (
+ LdMatrix8x8x16bOp,
+ LdMatrix16x16x8bOp,
+ StMatrix8x8x16bOp,
+ StMatrix16x8x8bOp,
+ )
+ from cutlass import Int32, Float32, Boolean, const_expr
+ from cutlass.utils import LayoutEnum
  from cutlass.cute.runtime import from_dlpack, make_ptr
- from cutlass import Int32, const_expr

- from quack.cute_dsl_utils import ParamsBase
- from quack.tile_scheduler import (
- TileSchedulerArguments,
- TileScheduler,
- RasterOrderOption,
- )
+ from quack.pipeline import PipelineTmaCpAsyncUmma
+ from quack.cute_dsl_utils import ParamsBase, ArgumentsBase
+ from quack.tile_scheduler import TileSchedulerOptions
+ from quack.varlen_utils import VarlenArguments, VarlenManager
+ from quack.gemm_sm90 import GemmSm90, NamedBarrierGemm
+ import quack.copy_utils as copy_utils
+ import quack.sm100_utils as quack_sm100_utils
+
+ # return PipelineStateWAdvance instead of PipelineState

  """
  A high-performance persistent batched dense GEMM example for the NVIDIA Blackwell SM100 architecture
@@ -72,8 +57,6 @@ This GEMM works as follows:
  - Type convert C matrix to output type.
  - Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations,
  or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations.
- - Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor:
- e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0))

  SM100 tcgen05.mma instructions operate as follows:
  - Read matrix A from SMEM
@@ -105,7 +88,7 @@ To collect performance with NCU profiler:

  Constraints are same as dense_gemm.py:
  * Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2),
- see detailed valid dtype combinations in below PersistentDenseGemmKernel class documentation
+ see detailed valid dtype combinations in below GemmSm100 class documentation
  * A/B tensor must have the same data type
  * Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
  * Mma tiler N must be 32-256, step 32
@@ -118,14 +101,12 @@ Constraints are same as dense_gemm.py:
  """


- class PersistentDenseGemmKernel:
+ class GemmSm100(GemmSm90):
  """This class implements batched matrix multiplication (C = A x B) with support for various data types
  and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

  :param acc_dtype: Data type for accumulation during computation
  :type acc_dtype: type[cutlass.Numeric]
- :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
- :type use_2cta_instrs: bool
  :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
  :type mma_tiler_mn: Tuple[int, int]
  :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
@@ -159,22 +140,28 @@ class PersistentDenseGemmKernel:
  - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

  Example:
- >>> gemm = PersistentDenseGemmKernel(
- ... acc_dtype=cutlass.Float32,
- ... use_2cta_instrs=True,
+ >>> gemm = GemmSm100(
+ ... acc_dtype=Float32,
  ... mma_tiler_mn=(128, 128),
  ... cluster_shape_mn=(2, 2)
  ... )
  >>> gemm(mA, mB, mD, max_active_clusters, stream)
  """

+ arch = 100
+ num_epi_tensormaps = GemmSm90.num_epi_tensormaps
+
+ EpilogueArguments = GemmSm90.EpilogueArguments
+ EpilogueParams = GemmSm90.EpilogueParams
+
  def __init__(
  self,
  acc_dtype: Type[cutlass.Numeric],
- use_2cta_instrs: bool,
+ a_dtype: Type[cutlass.Numeric],  # ignored for now
  mma_tiler_mn: Tuple[int, int],
- cluster_shape_mn: Tuple[int, int],
+ cluster_shape_mnk: Tuple[int, int, int],
  sf_vec_size: Optional[int] = None,
+ gather_A: bool = False,
  ):
  """Initializes the configuration for a Blackwell dense GEMM kernel.

@@ -187,50 +174,54 @@ class PersistentDenseGemmKernel:
  with cta_group=2 should be used.

  2. Cluster Shape:
- - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.
+ - cluster_shape_mnk: The (ClusterM, ClusterN) shape of the CTA cluster.

  :param acc_dtype: Data type of the accumulator.
  :type acc_dtype: type[cutlass.Numeric]
  :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
  :type mma_tiler_mn: Tuple[int, int]
- :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
- :type use_2cta_instrs: bool
- :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
- :type cluster_shape_mn: Tuple[int, int]
+ :param cluster_shape_mnk: Tuple (ClusterM, ClusterN) shape of the cluster.
+ :type cluster_shape_mnk: Tuple[int, int]
  """

  self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
- self.use_2cta_instrs = use_2cta_instrs
- self.cluster_shape_mn = cluster_shape_mn
+ self.use_2cta_instrs = cluster_shape_mnk[0] == 2 and mma_tiler_mn[0] in (256,)
+ self.cluster_shape_mnk = cluster_shape_mnk
+ assert cluster_shape_mnk[2] == 1, "Cluster shape K must be 1"
  # K dimension is deferred in _setup_attributes
  self.mma_tiler = (*mma_tiler_mn, 1)
  self.sf_vec_size = sf_vec_size
  self.blockscaled = sf_vec_size is not None
+ self.is_persistent = True
+ self.pingpong = False  # for compatibility with GemmSm90
+ self.gather_A = gather_A
+ if gather_A:
+ assert cluster_shape_mnk[1] == 1, "Cluster shape N must be 1 for gather A "

- self.cta_group = tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE
+ self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE

+ self.num_ab_load_warps = 1 if not self.gather_A else 5
  self.occupancy = 1
  # Set specialized warp ids
- self.epilog_warp_id = (
- 0,
- 1,
- 2,
- 3,
- )
+ self.epilog_warp_id = (0, 1, 2, 3)
  self.mma_warp_id = 4
- self.tma_warp_id = 5
- self.tma_epi_warp_id = 6
- self.threads_per_cta = 32 * len(
- (self.mma_warp_id, self.tma_warp_id, self.tma_epi_warp_id, *self.epilog_warp_id)
+ self.ab_load_warp_id = 5
+ self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
+ self.scheduler_warp_id = self.epi_load_warp_id + 1
+ self.num_epi_warps = len(self.epilog_warp_id)
+ self.threads_per_cta = cute.arch.WARP_SIZE * (
+ self.num_ab_load_warps
+ + len(
+ (
+ self.mma_warp_id,
+ self.epi_load_warp_id,
+ self.scheduler_warp_id,
+ *self.epilog_warp_id,
+ )
+ )
  )
- # Set barrier id for cta sync, epilogue sync and tmem ptr sync
- self.cta_sync_bar_id = 0
- self.epilog_sync_bar_id = 1
- self.tmem_ptr_sync_bar_id = 2
- self.epilog_load_bar_id = 3
- self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_100")
-
- def _setup_attributes(self):
+
+ def _setup_attributes(self, epilogue_args: EpilogueArguments, varlen_args: VarlenArguments):
  """Set up configurations that are dependent on GEMM inputs

  This method configures various attributes based on the input tensor properties
@@ -261,7 +252,7 @@ class PersistentDenseGemmKernel:

  # Configure tiled mma
  if const_expr(not self.blockscaled):
- tiled_mma = sm100_utils.make_trivial_tiled_mma(
+ self.tiled_mma = sm100_utils.make_trivial_tiled_mma(
  self.a_dtype,
  self.a_major_mode,
  self.b_major_mode,
@@ -269,9 +260,9 @@
  self.cta_group,
  self.mma_tiler[:2],
  )
- tiled_mma_sfb = None
+ self.tiled_mma_sfb = None
  else:
- tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
+ self.tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
  self.a_dtype,
  self.a_major_mode,
  self.b_major_mode,
@@ -280,13 +271,13 @@
  self.cta_group,
  self.mma_inst_shape_mnk[:2],
  )
- tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
+ self.tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
  self.a_dtype,
  self.a_major_mode,
  self.b_major_mode,
  self.sf_dtype,
  self.sf_vec_size,
- cute.nvgpu.tcgen05.CtaGroup.ONE,
+ tcgen05.CtaGroup.ONE,
  self.mma_inst_shape_mnk_sfb[:2],
  )

@@ -306,26 +297,28 @@
  else:
  self.mma_tiler_sfb = None
  self.cta_tile_shape_mnk = (
- self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+ self.mma_tiler[0] // cute.size(self.tiled_mma.thr_id.shape),
  self.mma_tiler[1],
  self.mma_tiler[2],
  )

  # Compute cluster layout
  self.cluster_layout_vmnk = cute.tiled_divide(
- cute.make_layout((*self.cluster_shape_mn, 1)),
- (tiled_mma.thr_id.shape,),
+ cute.make_layout(self.cluster_shape_mnk),
+ (self.tiled_mma.thr_id.shape,),
  )
  if const_expr(self.blockscaled):
  self.cluster_layout_sfb_vmnk = cute.tiled_divide(
- cute.make_layout((*self.cluster_shape_mn, 1)),
- (tiled_mma_sfb.thr_id.shape,),
+ cute.make_layout(self.cluster_shape_mnk),
+ (self.tiled_mma_sfb.thr_id.shape,),
  )
  else:
  self.cluster_layout_sfb_vmnk = None

  # Compute number of multicast CTAs for A/B
  self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2])
+ if self.gather_A:
+ assert self.num_mcast_ctas_a == 1
  self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1])
  self.is_a_mcast = self.num_mcast_ctas_a > 1
  self.is_b_mcast = self.num_mcast_ctas_b > 1
@@ -337,60 +330,82 @@
  self.epi_tile = sm100_utils.compute_epilogue_tile_shape(
  self.cta_tile_shape_mnk,
  self.use_2cta_instrs,
- self.d_layout,
- self.d_dtype,
+ self.d_layout if self.d_layout is not None else LayoutEnum.ROW_MAJOR,
+ self.d_dtype if self.d_dtype is not None else cutlass.BFloat16,
+ layout_c=self.c_layout,
+ elem_ty_c=self.c_dtype,
  )

  # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory
+ prefetch_A_idx = (
+ None
+ if not self.gather_A
+ else ("varlen_m" if varlen_args.mCuSeqlensM is not None else "varlen_k")
+ )
  (
  self.num_acc_stage,
- self.num_ab_stage,
- self.num_d_stage,
- self.num_c_stage,
+ self.ab_stage,
+ self.epi_stage,
+ self.epi_c_stage,
  ) = self._compute_stages(
- tiled_mma,
+ self.tiled_mma,
  self.mma_tiler,
+ self.cta_tile_shape_mnk,
+ self.epi_tile,
  self.a_dtype,
  self.b_dtype,
- self.epi_tile,
+ self.sf_dtype,
+ self.sf_vec_size,
  self.d_dtype,
  self.c_dtype,
  self.d_layout,
  self.c_layout,
- self.sf_dtype,
- self.sf_vec_size,
- self.smem_capacity,
+ epilogue_args,
+ prefetch_A_idx,
+ cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"),  # smem_capacity
  self.occupancy,
  )
+ self.sched_stage = 1
+ self.a_prefetch_stage = (
+ 0
+ if not self.gather_A
+ else (2 if varlen_args.mCuSeqlensM is not None else self.ab_stage)
+ )

  # Compute A/B/SFA/SFB/C shared memory layout
  self.a_smem_layout_staged = sm100_utils.make_smem_layout_a(
- tiled_mma, self.mma_tiler, self.a_dtype, self.num_ab_stage
+ self.tiled_mma, self.mma_tiler, self.a_dtype, self.ab_stage
  )
+ self.a_smem_load_layout_staged = self.a_smem_layout_staged
+ if const_expr(self.gather_A):
+ self.a_smem_load_layout_staged = quack_sm100_utils.make_smem_layout_cpasync_a(
+ self.tiled_mma, self.mma_tiler, self.a_dtype, self.ab_stage
+ )
  self.b_smem_layout_staged = sm100_utils.make_smem_layout_b(
- tiled_mma, self.mma_tiler, self.b_dtype, self.num_ab_stage
- )
- self.d_smem_layout_staged = sm100_utils.make_smem_layout_epi(
- self.d_dtype, self.d_layout, self.epi_tile, self.num_d_stage
+ self.tiled_mma, self.mma_tiler, self.b_dtype, self.ab_stage
  )
+ self.epi_smem_layout_staged = None
+ if const_expr(self.d_dtype is not None):
+ self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi(
+ self.d_dtype, self.d_layout, self.epi_tile, self.epi_stage
+ )
+ self.epi_c_smem_layout_staged = None
  if const_expr(self.c_dtype is not None):
  self.epi_c_smem_layout_staged = sm100_utils.make_smem_layout_epi(
- self.c_dtype, self.c_layout, self.epi_tile, self.num_c_stage
+ self.c_dtype, self.c_layout, self.epi_tile, self.epi_c_stage
  )
- else:
- self.epi_c_smem_layout_staged = None
  if const_expr(self.blockscaled):
  self.sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
- tiled_mma,
+ self.tiled_mma,
  self.mma_tiler,
  self.sf_vec_size,
- self.num_ab_stage,
+ self.ab_stage,
  )
  self.sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
- tiled_mma,
+ self.tiled_mma,
  self.mma_tiler,
  self.sf_vec_size,
- self.num_ab_stage,
+ self.ab_stage,
  )
  else:
  self.sfa_smem_layout_staged, self.sfb_smem_layout_staged = None, None
@@ -398,7 +413,7 @@
  # Compute the number of tensor memory allocation columns
  if const_expr(not self.blockscaled):
  self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols(
- tiled_mma, self.mma_tiler, self.num_acc_stage
+ self.tiled_mma, self.mma_tiler, self.num_acc_stage
  )
  else:
  SM100_TMEM_CAPACITY_COLUMNS = 512
@@ -409,14 +424,14 @@
  self,
  mA: cute.Tensor,
  mB: cute.Tensor,
- mD: cute.Tensor,
+ mD: Optional[cute.Tensor],
  mC: Optional[cute.Tensor],
- tile_count_semaphore: Optional[cute.Pointer],
- max_active_clusters: cutlass.Constexpr,
+ epilogue_args: ArgumentsBase,
+ scheduler_args: TileSchedulerOptions,
+ varlen_args: Optional[VarlenArguments],
  stream: cuda.CUstream,
  mSFA: Optional[cute.Tensor] = None,
  mSFB: Optional[cute.Tensor] = None,
- epilogue_op: cutlass.Constexpr = lambda x: x,
  ):
  """Execute the GEMM operation in steps:
  - Setup static attributes before smem/grid/tma computation
@@ -435,32 +450,48 @@
  :type max_active_clusters: cutlass.Constexpr
  :param stream: CUDA stream for asynchronous execution
  :type stream: cuda.CUstream
- :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
- :type epilogue_op: cutlass.Constexpr
  :raises TypeError: If input data types are incompatible with the MMA instruction.
  :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
  """
  if const_expr(self.blockscaled):
  assert mSFA is not None and mSFB is not None
  # Setup static attributes before smem/grid/tma computation
- self.a_dtype: Type[cutlass.Numeric] = mA.element_type
- self.b_dtype: Type[cutlass.Numeric] = mB.element_type
- self.d_dtype: Type[cutlass.Numeric] = mD.element_type
+ self.a_dtype = mA.element_type
+ self.b_dtype = mB.element_type
+ self.d_dtype = mD.element_type if mD is not None else None
  self.c_dtype = mC.element_type if mC is not None else None
  self.sf_dtype: Optional[Type[cutlass.Numeric]] = (
  mSFA.element_type if mSFA is not None else None
  )
- self.a_major_mode = cutlass.utils.LayoutEnum.from_tensor(mA).mma_major_mode()
- self.b_major_mode = cutlass.utils.LayoutEnum.from_tensor(mB).mma_major_mode()
- self.d_layout = cutlass.utils.LayoutEnum.from_tensor(mD)
- self.c_layout = cutlass.utils.LayoutEnum.from_tensor(mC) if mC is not None else None
+ self.a_layout = LayoutEnum.from_tensor(mA)
+ self.b_layout = LayoutEnum.from_tensor(mB)
+ self.d_layout = LayoutEnum.from_tensor(mD) if mD is not None else None
+ self.c_layout = LayoutEnum.from_tensor(mC) if mC is not None else None
+ self.a_major_mode = LayoutEnum.from_tensor(mA).mma_major_mode()
+ self.b_major_mode = LayoutEnum.from_tensor(mB).mma_major_mode()

  # Check if input data types are compatible with MMA instruction
  if const_expr(self.a_dtype != self.b_dtype):
  raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}")

+ if const_expr(varlen_args is None):
+ varlen_args = VarlenArguments()
+ assert (varlen_args.mAIdx is not None) == self.gather_A
+
+ # Assume all strides are divisible by 128 bits except the last stride
+ new_stride = lambda t: tuple(
+ cute.assume(s, divby=128 // t.element_type.width) if not cute.is_static(s) else s
+ for s in t.stride
+ )
+ mA, mD = [
+ cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
+ if t is not None
+ else None
+ for t in (mA, mD)
+ ]
+
  # Setup attributes that dependent on gemm inputs
- self._setup_attributes()
+ self._setup_attributes(epilogue_args, varlen_args)

  if const_expr(self.blockscaled):
  # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
@@ -471,67 +502,44 @@
  sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(mB.shape, self.sf_vec_size)
  mSFB = cute.make_tensor(mSFB.iterator, sfb_layout)

- if const_expr(not self.blockscaled):
- tiled_mma = sm100_utils.make_trivial_tiled_mma(
- self.a_dtype,
- self.a_major_mode,
- self.b_major_mode,
- self.acc_dtype,
- self.cta_group,
- self.mma_tiler[:2],
- )
- tiled_mma_sfb = None
- else:
- tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
- self.a_dtype,
- self.a_major_mode,
- self.b_major_mode,
- self.sf_dtype,
- self.sf_vec_size,
- self.cta_group,
- self.mma_inst_shape_mnk[:2],
- )
- tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
- self.a_dtype,
- self.a_major_mode,
- self.b_major_mode,
- self.sf_dtype,
- self.sf_vec_size,
- cute.nvgpu.tcgen05.CtaGroup.ONE,
- self.mma_inst_shape_mnk_sfb[:2],
- )
- atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+ atom_thr_size = cute.size(self.tiled_mma.thr_id.shape)

- # Setup TMA load for A
- a_op = sm100_utils.cluster_shape_to_tma_atom_A(self.cluster_shape_mn, tiled_mma.thr_id)
+ # Setup TMA load for A & B
  a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0))
- tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
- a_op,
- mA,
- a_smem_layout,
- self.mma_tiler,
- tiled_mma,
- self.cluster_layout_vmnk.shape,
- internal_type=(cutlass.TFloat32 if mA.element_type is cutlass.Float32 else None),
- )
-
- # Setup TMA load for B
- b_op = sm100_utils.cluster_shape_to_tma_atom_B(self.cluster_shape_mn, tiled_mma.thr_id)
  b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0))
+ tma_atom_a, tma_tensor_a = None, None
+ if const_expr(not self.gather_A):
+ a_op = sm100_utils.cluster_shape_to_tma_atom_A(
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
+ )
+ tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+ a_op,
+ mA,
+ a_smem_layout,
+ self.mma_tiler,
+ self.tiled_mma,
+ self.cluster_layout_vmnk.shape,
+ internal_type=(cutlass.TFloat32 if mA.element_type is Float32 else None),
+ )
+ b_op = sm100_utils.cluster_shape_to_tma_atom_B(
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
+ )
  tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
  b_op,
  mB,
  b_smem_layout,
  self.mma_tiler,
- tiled_mma,
+ self.tiled_mma,
  self.cluster_layout_vmnk.shape,
- internal_type=(cutlass.TFloat32 if mB.element_type is cutlass.Float32 else None),
+ internal_type=(cutlass.TFloat32 if mB.element_type is Float32 else None),
  )

+ tma_atom_sfa, tma_tensor_sfa = None, None
+ tma_atom_sfb, tma_tensor_sfb = None, None
  if const_expr(self.blockscaled):
  # Setup TMA load for SFA
  sfa_op = sm100_utils.cluster_shape_to_tma_atom_A(
- self.cluster_shape_mn, tiled_mma.thr_id
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
  )
  sfa_smem_layout = cute.slice_(self.sfa_smem_layout_staged, (None, None, None, 0))
  tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
@@ -539,13 +547,13 @@
  mSFA,
  sfa_smem_layout,
  self.mma_tiler,
- tiled_mma,
+ self.tiled_mma,
  self.cluster_layout_vmnk.shape,
  internal_type=cutlass.Int16,
  )
  # Setup TMA load for SFB
  sfb_op = sm100_utils.cluster_shape_to_tma_atom_SFB(
- self.cluster_shape_mn, tiled_mma.thr_id
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
  )
  sfb_smem_layout = cute.slice_(self.sfb_smem_layout_staged, (None, None, None, 0))
  tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B(
@@ -553,58 +561,50 @@
  mSFB,
  sfb_smem_layout,
  self.mma_tiler_sfb,
- tiled_mma_sfb,
+ self.tiled_mma_sfb,
  self.cluster_layout_sfb_vmnk.shape,
  internal_type=cutlass.Int16,
  )
- else:
- tma_atom_sfa, tma_tensor_sfa = None, None
- tma_atom_sfb, tma_tensor_sfb = None, None

- a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout)
- b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout)
- self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size
+ self.num_tma_load_bytes = cute.size_in_bytes(self.b_dtype, b_smem_layout)
+ if const_expr(not self.gather_A):
+ self.num_tma_load_bytes += cute.size_in_bytes(self.a_dtype, a_smem_layout)
  if const_expr(self.blockscaled):
  sfa_copy_size = cute.size_in_bytes(self.sf_dtype, sfa_smem_layout)
  sfb_copy_size = cute.size_in_bytes(self.sf_dtype, sfb_smem_layout)
- self.num_tma_load_bytes += (sfa_copy_size + sfb_copy_size) * atom_thr_size
+ self.num_tma_load_bytes += sfa_copy_size + sfb_copy_size
+ self.num_tma_load_bytes *= atom_thr_size

  # Setup TMA store for D
- epi_smem_layout = cute.slice_(self.d_smem_layout_staged, (None, None, 0))
- tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
- cpasync.CopyBulkTensorTileS2GOp(),
- mD,
- epi_smem_layout,
- self.epi_tile,
- )
- if const_expr(mC is not None):
- epi_c_smem_layout = cute.slice_(self.epi_c_smem_layout_staged, (None, None, 0))
- tma_atom_c, tma_tensor_c = cpasync.make_tiled_tma_atom(
- cpasync.CopyBulkTensorTileG2SOp(),
- mC,
- epi_c_smem_layout,
+ tma_atom_d, tma_tensor_d = None, None
+ if const_expr(mD is not None):
+ tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
+ mD,
+ self.epi_smem_layout_staged,
  self.epi_tile,
+ op_type="store"
+ if not (hasattr(epilogue_args, "add_to_output") and epilogue_args.add_to_output)
+ else "add",
+ )
+ tma_atom_c, tma_tensor_c = None, None
+ if const_expr(mC is not None):
+ tma_atom_c, tma_tensor_c = self._make_tma_epi_atoms_and_tensors(
+ mC, self.epi_c_smem_layout_staged, self.epi_tile, op_type="load"
  )
- else:
- tma_atom_c, tma_tensor_c = None, None

- problem_shape_ntile_mnl = cute.ceil_div(mD.shape[:2], self.cta_tile_shape_mnk[:2]) + (
- mD.shape[2],
- )
- TileSchedulerCls = TileScheduler
- tile_sched_args = TileSchedulerArguments(
- problem_shape_ntile_mnl=problem_shape_ntile_mnl,
- raster_order=RasterOrderOption.Heuristic,
- group_size=8,
- cluster_shape_mnk=(*self.cluster_shape_mn, 1),
- tile_count_semaphore=tile_count_semaphore,
- is_persistent=True,
- )
+ epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
+ varlen_params = VarlenManager.to_underlying_arguments(varlen_args)
+
+ TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
+ tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
  tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
- grid = TileSchedulerCls.get_grid_shape(tile_sched_params, max_active_clusters)
+ grid = TileSchedulerCls.get_grid_shape(
+ tile_sched_params, scheduler_args.max_active_clusters
+ )

  self.buffer_align_bytes = 1024

+ epi_smem_size = cute.cosize(self.epi_smem_layout_staged) if mD is not None else 0
  epi_c_smem_size = cute.cosize(self.epi_c_smem_layout_staged) if mC is not None else 0
  sf_dtype = self.sf_dtype if const_expr(self.blockscaled) else cutlass.Float8E8M0FNU
  sfa_smem_size = (
@@ -613,22 +613,33 @@
  sfb_smem_size = (
  cute.cosize(self.sfb_smem_layout_staged) if const_expr(self.blockscaled) else 0
  )
+ a_idx_smem_size = 0
+ if const_expr(self.gather_A):
+ a_idx_smem_size = self.a_prefetch_stage * (
+ self.cta_tile_shape_mnk[0]
+ if varlen_args.mCuSeqlensM is not None
+ else self.cta_tile_shape_mnk[2]
+ )

  # Define shared storage for kernel
  @cute.struct
  class SharedStorage:
- ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
- ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
- epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.num_c_stage * 2]
- acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
- acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
+ ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
+ epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
+ acc_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+ sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
+ a_prefetch_pipeline_array_ptr: cute.struct.MemRange[
+ cutlass.Int64, self.a_prefetch_stage * 2
+ ]
+ tile_count: cute.struct.MemRange[Int32, self.sched_stage]
  tmem_dealloc_mbar_ptr: cutlass.Int64
  tmem_holding_buf: Int32
- sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, 2]
- tile_count: cute.struct.MemRange[cutlass.Int32, 1]
+ sAIdx: cute.struct.Align[cute.struct.MemRange[Int32, a_idx_smem_size], 16]
  # (EPI_TILE_M, EPI_TILE_N, STAGE)
  sD: cute.struct.Align[
- cute.struct.MemRange[self.d_dtype, cute.cosize(self.d_smem_layout_staged.outer)],
+ cute.struct.MemRange[
+ self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
+ ],
  self.buffer_align_bytes,
  ]
  sC: cute.struct.Align[
@@ -637,6 +648,7 @@
  ],
  self.buffer_align_bytes,
  ]
+ epi: self.epi_get_smem_struct(epilogue_params)
  # (MMA, MMA_M, MMA_K, STAGE)
  sA: cute.struct.Align[
  cute.struct.MemRange[self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer)],
@@ -662,10 +674,10 @@

  # Launch the kernel synchronously
  self.kernel(
- tiled_mma,
- tiled_mma_sfb,
+ self.tiled_mma,
+ self.tiled_mma_sfb,
  tma_atom_a,
- tma_tensor_a,
+ tma_tensor_a if const_expr(not self.gather_A) else mA,
  tma_atom_b,
  tma_tensor_b,
  tma_atom_sfa,
@@ -676,24 +688,26 @@
  tma_tensor_d,
  tma_atom_c,
  tma_tensor_c,
+ epilogue_params,
+ varlen_params,
  self.cluster_layout_vmnk,
  self.cluster_layout_sfb_vmnk,
  self.a_smem_layout_staged,
+ self.a_smem_load_layout_staged,
  self.b_smem_layout_staged,
  self.sfa_smem_layout_staged,
  self.sfb_smem_layout_staged,
- self.d_smem_layout_staged,
+ self.epi_smem_layout_staged,
  self.epi_c_smem_layout_staged,
  self.epi_tile,
  tile_sched_params,
  TileSchedulerCls,
- epilogue_op,
  ).launch(
  grid=grid,
  block=[self.threads_per_cta, 1, 1],
- cluster=(*self.cluster_shape_mn, 1),
- smem=self.shared_storage.size_in_bytes(),
+ cluster=self.cluster_shape_mnk,
  stream=stream,
+ min_blocks_per_mp=1,
  )
  return

@@ -703,7 +717,7 @@
  self,
  tiled_mma: cute.TiledMma,
  tiled_mma_sfb: Optional[cute.TiledMma],
- tma_atom_a: cute.CopyAtom,
+ tma_atom_a: Optional[cute.CopyAtom],
  mA_mkl: cute.Tensor,
  tma_atom_b: cute.CopyAtom,
  mB_nkl: cute.Tensor,
@@ -712,37 +726,52 @@
  tma_atom_sfb: Optional[cute.CopyAtom],
  mSFB_nkl: Optional[cute.Tensor],
  tma_atom_d: Optional[cute.CopyAtom],
- mD_mnl: cute.Tensor,
+ mD_mnl: Optional[cute.Tensor],
  tma_atom_c: Optional[cute.CopyAtom],
  mC_mnl: Optional[cute.Tensor],
+ epilogue_params: ParamsBase,
+ varlen_params: VarlenManager.Params,
  cluster_layout_vmnk: cute.Layout,
  cluster_layout_sfb_vmnk: Optional[cute.Layout],
- a_smem_layout_staged: cute.ComposedLayout,
- b_smem_layout_staged: cute.ComposedLayout,
- sfa_smem_layout_staged: Optional[cute.Layout],
- sfb_smem_layout_staged: Optional[cute.Layout],
- d_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
- epi_c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
+ a_smem_layout: cute.ComposedLayout,
+ a_smem_load_layout: cute.ComposedLayout,
+ b_smem_layout: cute.ComposedLayout,
+ sfa_smem_layout: Optional[cute.Layout],
+ sfb_smem_layout: Optional[cute.Layout],
+ epi_smem_layout: Union[cute.Layout, cute.ComposedLayout, None],
+ epi_c_smem_layout: Union[cute.Layout, cute.ComposedLayout, None],
  epi_tile: cute.Tile,
  tile_sched_params: ParamsBase,
  TileSchedulerCls: cutlass.Constexpr[Callable],
- epilogue_op: cutlass.Constexpr[Callable],
  ):
  """
  GPU device kernel performing the Persistent batched GEMM computation.
  """
+
+ varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
+ varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
+ assert not (varlen_m and varlen_k)
+ if const_expr(self.gather_A):
+ assert varlen_m or varlen_k
+ has_D = const_expr(mD_mnl is not None)
+ has_C = const_expr(mC_mnl is not None)
+
  warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())

- #
- # Prefetch tma desc
- #
- if warp_idx == self.tma_warp_id:
- cpasync.prefetch_descriptor(tma_atom_a)
- cpasync.prefetch_descriptor(tma_atom_b)
- if const_expr(self.blockscaled):
- cpasync.prefetch_descriptor(tma_atom_sfa)
- cpasync.prefetch_descriptor(tma_atom_sfb)
- cpasync.prefetch_descriptor(tma_atom_d)
+ # /////////////////////////////////////////////////////////////////////////////
+ # Prefetch Tma desc
+ # /////////////////////////////////////////////////////////////////////////////
+ if warp_idx == self.ab_load_warp_id:
+ for tma_atom in (
+ tma_atom_a,
+ tma_atom_b,
+ tma_atom_sfa,
+ tma_atom_sfb,
+ tma_atom_d,
+ tma_atom_c,
+ ):
+ if const_expr(tma_atom is not None):
+ cpasync.prefetch_descriptor(tma_atom)

  use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2

@@ -754,13 +783,6 @@
  mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
  is_leader_cta = mma_tile_coord_v == 0
  cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
- block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
- if const_expr(self.blockscaled):
- block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord(
- cta_rank_in_cluster
- )
- else:
- block_in_cluster_coord_sfb_vmnk = None
  # Coord inside cta
  tidx, _, _ = cute.arch.thread_idx()

@@ -775,104 +797,68 @@

  # Tensor memory dealloc barrier init
  if use_2cta_instrs:
- if warp_idx == self.tma_warp_id:
+ if warp_idx == self.ab_load_warp_id:
  num_tmem_dealloc_threads = 32
  cute.arch.mbarrier_init(tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads)

- # Initialize mainloop ab_pipeline (barrier) and states
- ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
- num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
- ab_pipeline_consumer_group = pipeline.CooperativeGroup(
- pipeline.Agent.Thread, num_tma_producer
+ # Initialize pipelines and states
+ ab_pipeline = self.make_ab_pipeline(
+ tiled_mma=tiled_mma,
+ cluster_layout_vmnk=cluster_layout_vmnk,
+ ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
+ is_leader_cta=is_leader_cta,
  )
- ab_pipeline = pipeline.PipelineTmaUmma.create(
- barrier_storage=storage.ab_full_mbar_ptr.data_ptr(),
- num_stages=self.num_ab_stage,
- producer_group=ab_pipeline_producer_group,
- consumer_group=ab_pipeline_consumer_group,
- tx_count=self.num_tma_load_bytes,
- cta_layout_vmnk=cluster_layout_vmnk,
+ epi_pipeline = None
+ if const_expr(has_C):
+ epi_pipeline = self.make_epi_pipeline(
+ c_smem_layout=cute.slice_(epi_c_smem_layout, (None, None, 0)),
+ epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
+ )
+ acc_pipeline = self.make_acc_pipeline(
+ cluster_layout_vmnk=cluster_layout_vmnk,
+ acc_pipeline_mbar_ptr=storage.acc_pipeline_array_ptr.data_ptr(),
  )
-
- if const_expr(mC_mnl is not None):
- # Threads/warps participating in this pipeline
- epi_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
- # Each warp will contribute 1 to the arrive count
- consumer_arrive_cnt = len(self.epilog_warp_id)
- epi_pipeline_consumer_group = pipeline.CooperativeGroup(
- pipeline.Agent.Thread, consumer_arrive_cnt
+ sched_pipeline = None
+ tile_count = None
+ if const_expr(tile_sched_params.tile_count_semaphore is not None):
+ # Dynamic persistent scheduler
+ sched_pipeline = self.make_sched_pipeline(
+ self.cluster_shape_mnk,
+ sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
+ has_C=has_C,
  )
- c_smem_layout = cute.slice_(epi_c_smem_layout_staged, (None, None, 0))
- tma_copy_c_bytes = cute.size_in_bytes(self.c_dtype, c_smem_layout)
- epi_pipeline = pipeline.PipelineTmaAsync.create(
- barrier_storage=storage.epi_pipeline_array_ptr.data_ptr(),
- num_stages=self.num_c_stage,
- producer_group=epi_pipeline_producer_group,
- consumer_group=epi_pipeline_consumer_group,
- tx_count=tma_copy_c_bytes,
+ tile_count = storage.tile_count.get_tensor((self.sched_stage,))
+ a_prefetch_pipeline = None
+ if const_expr(self.gather_A):
+ a_prefetch_pipeline = self.make_a_prefetch_pipeline(
+ storage.a_prefetch_pipeline_array_ptr.data_ptr(),
  )
- else:
- epi_pipeline = None
-
- # Initialize acc_pipeline (barrier) and states
- acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
- num_acc_consumer_threads = len(self.epilog_warp_id) * (2 if use_2cta_instrs else 1)
- acc_pipeline_consumer_group = pipeline.CooperativeGroup(
- pipeline.Agent.Thread, num_acc_consumer_threads
- )
- acc_pipeline = pipeline.PipelineUmmaAsync.create(
- barrier_storage=storage.acc_full_mbar_ptr.data_ptr(),
- num_stages=self.num_acc_stage,
- producer_group=acc_pipeline_producer_group,
- consumer_group=acc_pipeline_consumer_group,
- cta_layout_vmnk=cluster_layout_vmnk,
- )
-
- # if const_expr(tile_sched_params.tile_count_semaphore is not None):
- # # Dynamic persistent scheduler
- # # Threads/warps participating in this pipeline
- # sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
- # cluster_size = cute.size(cluster_layout_vmnk)
- # # Each warp that are not the scheduler warp will contribute 1 to the arrive count
- # consumer_arrive_cnt = (
- # (self.mma_warp_groups if not self.pingpong else 1) * 4 + self.num_ab_load_warps
- # ) * cluster_size - 1
- # sched_pipeline_consumer_group = pipeline.CooperativeGroup(
- # pipeline.Agent.Thread, consumer_arrive_cnt
- # )
- # sched_pipeline = pipeline.PipelineAsync.create(
- # barrier_storage=storage.sched_pipeline_array_ptr.data_ptr(),
- # num_stages=self.sched_stage,
- # producer_group=sched_pipeline_producer_group,
- # consumer_group=sched_pipeline_consumer_group,
- # # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
- # consumer_mask=None if const_expr(cute.size(cluster_layout_mnk) == 1) else 0,
- # )
- # tile_count = storage.tile_count.get_tensor((self.sched_stage,))
- # else:
- # sched_pipeline = None
- # tile_count = None

  # Setup smem tensor A/B/D
  # (MMA, MMA_M, MMA_K, STAGE)
- sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
+ sA_mma = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
+ sA = storage.sA.get_tensor(a_smem_load_layout.outer, swizzle=a_smem_load_layout.inner)
  # (MMA, MMA_N, MMA_K, STAGE)
- sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+ sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
+ sAIdx = None
+ if const_expr(self.gather_A):
+ a_idx_smem_dim = self.cta_tile_shape_mnk[0] if varlen_m else self.cta_tile_shape_mnk[2]
+ a_idx_smem_layout = cute.make_layout((a_idx_smem_dim, self.a_prefetch_stage))
+ sAIdx = storage.sAIdx.get_tensor(a_idx_smem_layout)
+ sSFA, sSFB = None, None
  if const_expr(self.blockscaled):
  # (MMA, MMA_M, MMA_K, STAGE)
- sSFA = storage.sSFA.get_tensor(sfa_smem_layout_staged)
+ sSFA = storage.sSFA.get_tensor(sfa_smem_layout)
  # (MMA, MMA_N, MMA_K, STAGE)
- sSFB = storage.sSFB.get_tensor(sfb_smem_layout_staged)
- else:
- sSFA, sSFB = None, None
- # (EPI_TILE_M, EPI_TILE_N, STAGE)
- sD = storage.sD.get_tensor(d_smem_layout_staged.outer, swizzle=d_smem_layout_staged.inner)
- if const_expr(mC_mnl is not None):
- sC = storage.sC.get_tensor(
- epi_c_smem_layout_staged.outer, swizzle=epi_c_smem_layout_staged.inner
- )
- else:
- sC = None
+ sSFB = storage.sSFB.get_tensor(sfb_smem_layout)
+ sD = None
+ if const_expr(has_D):
+ # (EPI_TILE_M, EPI_TILE_N, STAGE)
+ sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
+ sC = None
+ if const_expr(has_C):
+ sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
+ epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)

  thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
  thr_mma_sfb = (
@@ -884,26 +870,51 @@
  # (MMA, MMA_M, MMA_N, STAGE)
  tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, self.num_acc_stage))

- tmem_ptr_read_threads = cute.arch.WARP_SIZE * len((self.mma_warp_id, *self.epilog_warp_id))
- tmem_alloc_barrier = pipeline.NamedBarrier(
- barrier_id=self.tmem_ptr_sync_bar_id, num_threads=tmem_ptr_read_threads
+ varlen_manager = VarlenManager.create(
+ varlen_params,
+ has_D,
+ self.num_epi_tensormaps,
+ # Only used if not varlen_m
+ len_m_static=Int32(
+ mA_mkl.shape[0]
+ if varlen_k or varlen_params.mAIdx is None
+ else varlen_params.mAIdx.shape[0]
+ ),
+ len_k_static=Int32(mA_mkl.shape[1]),
  )

- TileSchedulerCls = partial(TileSchedulerCls.create, tile_sched_params)
- k_tile_cnt = cute.ceil_div(cute.size(mA_mkl.shape[1]), self.mma_tiler[2])
+ TileSchedulerCls = partial(
+ TileSchedulerCls.create, tile_sched_params, tile_count, sched_pipeline
+ )

- if const_expr(mC_mnl is not None):
+ tmem_alloc_barrier = pipeline.NamedBarrier(
+ barrier_id=int(NamedBarrierGemm.TmemPtr),
+ num_threads=cute.arch.WARP_SIZE * len((self.mma_warp_id, *self.epilog_warp_id)),
+ )
+ epi_load_barrier = None
+ if const_expr(has_C):
  epi_load_barrier = pipeline.NamedBarrier(
- barrier_id=int(self.epilog_load_bar_id), num_threads=2 * cute.arch.WARP_SIZE
+ barrier_id=int(NamedBarrierGemm.EpilogueLoad), num_threads=2 * cute.arch.WARP_SIZE
  )
- else:
- epi_load_barrier = None

  #
- # Specialized TMA load warp
+ # Specialized AB load warps
  #
- if warp_idx == self.tma_warp_id:
+ if warp_idx == self.ab_load_warp_id:
+ is_tma_warp = True
+ # initialize tensormap for A & B
+ varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
+ tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
+ tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
  # Compute multicast mask for A/B buffer full
+ block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
+ block_in_cluster_coord_sfb_vmnk = None
+ if const_expr(self.blockscaled):
+ block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord(
+ cta_rank_in_cluster
+ )
+ a_mcast_mask, b_mcast_mask = None, None
+ sfa_mcast_mask, sfb_mcast_mask = None, None
  if const_expr(self.is_a_mcast or self.is_b_mcast or use_2cta_instrs):
  a_mcast_mask = cpasync.create_tma_multicast_mask(
  cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2
@@ -918,141 +929,139 @@
  )
  sfb_mcast_mask = cpasync.create_tma_multicast_mask(
  cluster_layout_sfb_vmnk, block_in_cluster_coord_sfb_vmnk, mcast_mode=1
  )
- else:
- sfa_mcast_mask, sfb_mcast_mask = None, None
- else:
- a_mcast_mask, b_mcast_mask = None, None
- sfa_mcast_mask, sfb_mcast_mask = None, None

  # Persistent tile scheduling loop
  tile_scheduler = TileSchedulerCls()
  work_tile = tile_scheduler.initial_work_tile_info()
  ab_producer_state = pipeline.make_pipeline_state(
- pipeline.PipelineUserType.Producer, self.num_ab_stage
+ pipeline.PipelineUserType.Producer, self.ab_stage
  )
- do_epi_load_barrier_arrive = cutlass.Boolean(True)
+ if const_expr(varlen_k):
+ # wait tensormap initialization complete before update
+ varlen_manager.fence_tensormap_init()
+ do_epi_load_barrier_arrive = Boolean(True)
  while work_tile.is_valid_tile:
- # Get tile coord from tile scheduler
  tile_coord_mnkl = work_tile.tile_idx
+ batch_idx = tile_coord_mnkl[3]
+ varlen_manager.update_tensormap_AB(
+ batch_idx,
+ self.a_layout,
+ self.b_layout,
+ is_tma_warp,
+ )
+ # ///////////////////////////////////////////////////////////////////////////
+ # Local_tile partition global tensors
+ # ///////////////////////////////////////////////////////////////////////////
  mma_tile_coord_mnl = (
  tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
  tile_coord_mnkl[1],
  tile_coord_mnkl[3],
  )
- # Local_tile partition global tensors
- # (bM, bK, RestK)
- gA_mkl = cute.local_tile(
- mA_mkl,
- cute.slice_(self.mma_tiler, (None, 0, None)),
- (mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]),
- )
+ gA_mk = None
+ if const_expr(not self.gather_A):
+ mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
+ # (bM, bK, RestK)
+ gA_mk = cute.local_tile(
+ mA_mk,
+ cute.select(self.mma_tiler, [0, 2]),
+ (mma_tile_coord_mnl[0], None),
+ )
  # (bN, bK, RestK)
- gB_nkl = cute.local_tile(
- mB_nkl,
- cute.slice_(self.mma_tiler, (0, None, None)),
- (mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]),
+ gB_nk = cute.local_tile(
+ varlen_manager.offset_batch_B(mB_nkl, batch_idx),
+ cute.select(self.mma_tiler, [1, 2]),
+ (mma_tile_coord_mnl[1], None),
  )
  if const_expr(self.blockscaled):
  # (bM, bK)
  gSFA_mkl = cute.local_tile(
- mSFA_mkl,
- cute.slice_(self.mma_tiler, (None, 0, None)),
- (mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]),
+ varlen_manager.offset_batch_A(mSFA_mkl, batch_idx),
+ cute.select(self.mma_tiler, [0, 2]),
+ (mma_tile_coord_mnl[0], None),
  )
  # (bN, bK)
  gSFB_nkl = cute.local_tile(
- mSFB_nkl,
- cute.slice_(self.mma_tiler, (0, None, None)),
- (mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]),
+ varlen_manager.offset_batch_B(mSFB_nkl, batch_idx),
+ cute.select(self.mma_tiler, [1, 2]),
+ (mma_tile_coord_mnl[1], None),
  )
+
  # Partition global tensor for TiledMMA_A/B/D
- # (MMA, MMA_M, MMA_K, RestK)
- tCgA = thr_mma.partition_A(gA_mkl)
+ # Then partition global/shared tensor for TMA load A/B
+ varlen_manager.fence_tensormap_update_AB(is_tma_warp)
+ len_k = varlen_manager.len_k(batch_idx)
+ # TMA load A partition_S/D
+ a_cta_layout = cute.make_layout(
+ cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape
+ )
+ copy_A = None
+ if const_expr(not self.gather_A):
+ # (MMA, MMA_M, MMA_K, RestK)
+ tCgA = thr_mma.partition_A(gA_mk)
+ copy_A, _, _ = copy_utils.tma_get_copy_fn(
+ tma_atom_a,
+ cta_coord=block_in_cluster_coord_vmnk[2],
+ cta_layout=a_cta_layout,
+ src_tensor=tCgA,
+ dst_tensor=sA,
+ mcast_mask=a_mcast_mask,
+ tma_desc_ptr=tma_desc_a_ptr,
+ )
  # (MMA, MMA_N, MMA_K, RestK)
- tCgB = thr_mma.partition_B(gB_nkl)
+ tCgB = thr_mma.partition_B(gB_nk)
  if const_expr(self.blockscaled):
  # (MMA, MMA_M, MMA_K)
  tCgSFA = thr_mma.partition_A(gSFA_mkl)
  # (MMA, MMA_N, MMA_K)
  tCgSFB = thr_mma_sfb.partition_B(gSFB_nkl)
- # Partition global/shared tensor for TMA load A/B
- # TMA load A partition_S/D
- a_cta_layout = cute.make_layout(
- cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape
- )
- # ((atom_v, rest_v), STAGE)
- # ((atom_v, rest_v), RestK)
- tAsA, tAgA = cpasync.tma_partition(
- tma_atom_a,
- block_in_cluster_coord_vmnk[2],
- a_cta_layout,
- cute.group_modes(sA, 0, 3),
- cute.group_modes(tCgA, 0, 3),
- )
  # TMA load B partition_S/D
- b_cta_layout = cute.make_layout(
- cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
- )
- # ((atom_v, rest_v), STAGE)
- # ((atom_v, rest_v), RestK)
- tBsB, tBgB = cpasync.tma_partition(
+ copy_B, _, _ = copy_utils.tma_get_copy_fn(
  tma_atom_b,
- block_in_cluster_coord_vmnk[1],
- b_cta_layout,
- cute.group_modes(sB, 0, 3),
- cute.group_modes(tCgB, 0, 3),
+ cta_coord=block_in_cluster_coord_vmnk[1],
+ cta_layout=cute.make_layout(
+ cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
+ ),
+ src_tensor=tCgB,
+ dst_tensor=sB,
+ mcast_mask=b_mcast_mask,
+ tma_desc_ptr=tma_desc_b_ptr,
  )
+ copy_SFA, copy_SFB = None, None
  if const_expr(self.blockscaled):
  # TMA load SFA partition_S/D
- sfa_cta_layout = a_cta_layout
- # ((atom_v, rest_v), STAGE)
- # ((atom_v, rest_v), RestK)
- tAsSFA, tAgSFA = cute.nvgpu.cpasync.tma_partition(
+ copy_SFA, _, _ = copy_utils.tma_get_copy_fn(
  tma_atom_sfa,
- block_in_cluster_coord_vmnk[2],
- sfa_cta_layout,
- cute.group_modes(sSFA, 0, 3),
- cute.group_modes(tCgSFA, 0, 3),
+ cta_coord=block_in_cluster_coord_vmnk[2],
+ cta_layout=a_cta_layout,
+ src_tensor=tCgSFA,
+ dst_tensor=sSFA,
+ filter_zeros=True,
+ mcast_mask=sfa_mcast_mask,
+ # tma_desc_ptr=tma_desc_sfa_ptr,
  )
- tAsSFA = cute.filter_zeros(tAsSFA)
- tAgSFA = cute.filter_zeros(tAgSFA)
  # TMA load SFB partition_S/D
  sfb_cta_layout = cute.make_layout(
  cute.slice_(cluster_layout_sfb_vmnk, (0, None, 0, 0)).shape
  )
- # ((atom_v, rest_v), STAGE)
- # ((atom_v, rest_v), RestK)
- tBsSFB, tBgSFB = cute.nvgpu.cpasync.tma_partition(
+ copy_SFB, _, _ = copy_utils.tma_get_copy_fn(
  tma_atom_sfb,
- block_in_cluster_coord_sfb_vmnk[1],
- sfb_cta_layout,
- cute.group_modes(sSFB, 0, 3),
- cute.group_modes(tCgSFB, 0, 3),
+ cta_coord=block_in_cluster_coord_sfb_vmnk[1],
+ cta_layout=sfb_cta_layout,
+ src_tensor=tCgSFB,
+ dst_tensor=sSFB,
+ filter_zeros=True,
+ mcast_mask=sfb_mcast_mask,
+ # tma_desc_ptr=tma_desc_sfa_ptr,
  )
- tBsSFB = cute.filter_zeros(tBsSFB)
- tBgSFB = cute.filter_zeros(tBgSFB)
- else:
- tAsSFA, tAgSFA = None, None
- tBsSFB, tBgSFB = None, None
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
  ab_producer_state = self.load_AB(
  ab_pipeline,
  ab_producer_state,
- tma_atom_a,
- tAgA,
- tAsA,
- a_mcast_mask,
- tma_atom_b,
- tBgB,
- tBsB,
- b_mcast_mask,
- tma_atom_sfa,
- tAgSFA,
- tAsSFA,
- sfa_mcast_mask,
- tma_atom_sfb,
- tBgSFB,
- tBsSFB,
- sfb_mcast_mask,
+ copy_A,
+ copy_B,
+ k_tile_cnt,
+ copy_SFA,
+ copy_SFB,
  )
  if const_expr(epi_load_barrier is not None):
  # In the first work tile, the epi load warp will wait for the signal
@@ -1060,58 +1069,209 @@ class PersistentDenseGemmKernel:
1060
1069
  # with loading A and B.
1061
1070
  if do_epi_load_barrier_arrive:
1062
1071
  epi_load_barrier.arrive()
1063
- do_epi_load_barrier_arrive = cutlass.Boolean(False)
1072
+ do_epi_load_barrier_arrive = Boolean(False)
1064
1073
  # Advance to next tile
1065
1074
  tile_scheduler.advance_to_next_work()
1066
1075
  work_tile = tile_scheduler.get_current_work()
1067
1076
  # Wait A/B buffer empty
1068
1077
  ab_pipeline.producer_tail(ab_producer_state)
1069
1078
 
1079
+ if const_expr(self.gather_A):
1080
+ if (
1081
+ warp_idx >= self.ab_load_warp_id + 1
1082
+ and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
1083
+ ):
1084
+ # Persistent tile scheduling loop
1085
+ tile_scheduler = TileSchedulerCls()
1086
+ work_tile = tile_scheduler.initial_work_tile_info()
1087
+ ab_producer_state = pipeline.make_pipeline_state(
1088
+ pipeline.PipelineUserType.Producer, self.ab_stage
1089
+ )
1090
+ a_prefetch_consumer_state = pipeline.make_pipeline_state(
1091
+ pipeline.PipelineUserType.Consumer, self.a_prefetch_stage
1092
+ )
1093
+ while work_tile.is_valid_tile:
1094
+ tile_coord_mnkl = work_tile.tile_idx
1095
+ batch_idx = tile_coord_mnkl[3]
1096
+ # ///////////////////////////////////////////////////////////////////////////
1097
+ # Local_tile partition global tensors
1098
+ # ///////////////////////////////////////////////////////////////////////////
1099
+ mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
1100
+ if const_expr(varlen_m):
1101
+ # (M, K)
1102
+ mA_mk = mA_mkl
1103
+ else:
1104
+ assert varlen_k
1105
+ # (tile_M, K)
1106
+ mA_mk = cute.local_tile(
1107
+ mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
1108
+ )
1109
+ # Partition global tensor for TiledMMA_A/B/D
1110
+ len_m = varlen_manager.len_m(batch_idx)
1111
+ len_k = varlen_manager.len_k(batch_idx)
1112
+ # TMA load A partition_S/D
1113
+ tiled_copy_A = self._make_gmem_tiled_copy_A(
1114
+ mA_mkl.element_type, self.a_layout, (self.num_ab_load_warps - 1) * 32
1115
+ )
1116
+ tidx = cute.arch.thread_idx()[0] - (self.ab_load_warp_id + 1) * 32
1117
+ thr_copy_A = tiled_copy_A.get_slice(tidx)
1118
+ copy_A, prefetch_A = None, None
1119
+ if const_expr(varlen_m):
1120
+ a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
1121
+ copy_A = copy_utils.gather_m_get_copy_fn(
1122
+ thr_copy_A,
1123
+ mA_mk,
1124
+ sA,
1125
+ sAIdx[None, a_prefetch_consumer_state.index],
1126
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
1127
+ limit_k=len_k,
1128
+ )
1129
+ cute.arch.sync_warp()
1130
+ with cute.arch.elect_one():
1131
+ a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
1132
+ a_prefetch_consumer_state.advance()
1133
+ else:
1134
+ copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
1135
+ thr_copy_A,
1136
+ mA_mk,
1137
+ sA,
1138
+ sAIdx,
1139
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
1140
+ limit_k=len_k,
1141
+ )
1142
+ prefetch_A = partial(prefetch_A, a_prefetch_pipeline)
1143
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
1144
+ ab_producer_state, a_prefetch_consumer_state = self.load_A_gather_A(
1145
+ ab_pipeline,
1146
+ ab_producer_state,
1147
+ a_prefetch_consumer_state,
1148
+ copy_A,
1149
+ prefetch_A,
1150
+ k_tile_cnt,
1151
+ )
1152
+ # Advance to next tile
1153
+ tile_scheduler.advance_to_next_work()
1154
+ work_tile = tile_scheduler.get_current_work()
1155
+
1156
+ #
1157
+ # Specialized scheduler warp. Will also prefetch A indices if gather_A
1158
+ #
1159
+ if const_expr(tile_sched_params.tile_count_semaphore is not None or self.gather_A):
1160
+ if warp_idx == self.scheduler_warp_id:
1161
+ is_scheduler_warp = True
1162
+ if const_expr(cute.size(cluster_layout_vmnk) > 1):
1163
+ is_scheduler_warp = cute.arch.block_idx_in_cluster() == 0
1164
+ tile_M = self.cta_tile_shape_mnk[0]
1165
+ tile_K = self.cta_tile_shape_mnk[2]
1166
+ thr_copy_AIdx, tAsAIdx, tAcAIdx = None, None, None
1167
+ if const_expr(self.gather_A):
1168
+ tiled_copy_AIdx = copy_utils.tiled_copy_1d(Int32, num_threads=32, is_async=True)
1169
+ thr_copy_AIdx = tiled_copy_AIdx.get_slice(cute.arch.lane_idx())
1170
+ tAsAIdx = thr_copy_AIdx.partition_D(sAIdx)
1171
+ tAcAIdx = thr_copy_AIdx.partition_S(
1172
+ cute.make_identity_tensor(tile_M if varlen_m else tile_K)
1173
+ )
1174
+ # Persistent tile scheduling loop
1175
+ tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
1176
+ work_tile = tile_scheduler.initial_work_tile_info()
1177
+ a_prefetch_producer_state = None
1178
+ if const_expr(self.gather_A):
1179
+ a_prefetch_producer_state = pipeline.make_pipeline_state(
1180
+ pipeline.PipelineUserType.Producer, self.a_prefetch_stage
1181
+ )
1182
+ while work_tile.is_valid_tile:
1183
+ if const_expr(self.gather_A):
1184
+ tile_coord_mnkl = work_tile.tile_idx
1185
+ batch_idx = tile_coord_mnkl[3]
1186
+ mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
1187
+ if const_expr(varlen_m):
1188
+ # (tile_M,)
1189
+ gAIdx = cute.local_tile(mAIdx_mk, (tile_M,), (tile_coord_mnkl[0],))
1190
+ tAgAIdx = thr_copy_AIdx.partition_S(gAIdx)
1191
+ len_m = varlen_manager.len_m(batch_idx)
1192
+ m_limit = len_m - tile_coord_mnkl[0] * tile_M
1193
+ tApAIdx_m = cute.make_fragment((1, tAsAIdx.shape[1]), Boolean)
1194
+ for m in cutlass.range(tAsAIdx.shape[1], unroll_full=True):
1195
+ tApAIdx_m[0, m] = tAcAIdx[0, m] < m_limit
1196
+ a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
1197
+ cute.copy(
1198
+ thr_copy_AIdx,
1199
+ tAgAIdx,
1200
+ tAsAIdx[None, None, a_prefetch_producer_state.index],
1201
+ pred=tApAIdx_m,
1202
+ )
1203
+ a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
1204
+ a_prefetch_producer_state.advance()
1205
+ else:
1206
+ # (tile_K, RestK)
1207
+ gAIdx = cute.flat_divide(mAIdx_mk, (tile_K,))
1208
+ tAgAIdx = thr_copy_AIdx.partition_S(gAIdx)
1209
+ len_k = varlen_manager.len_k(batch_idx)
1210
+ k_tile_cnt = cute.ceil_div(len_k, tile_K)
1211
+ for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
1212
+ a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
1213
+ cute.copy(
1214
+ thr_copy_AIdx,
1215
+ tAgAIdx[None, None, k_tile],
1216
+ tAsAIdx[None, None, a_prefetch_producer_state.index],
1217
+ )
1218
+ a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
1219
+ a_prefetch_producer_state.advance()
1220
+ if 0 < k_tile_cnt:
1221
+ k_tile = k_tile_cnt - 1
1222
+ k_limit = len_k - k_tile * tile_K
1223
+ tApAIdx_k = cute.make_fragment((1, tAsAIdx.shape[1]), Boolean)
1224
+ for m in cutlass.range(tAsAIdx.shape[1], unroll_full=True):
1225
+ tApAIdx_k[0, m] = tAcAIdx[0, m] < k_limit
1226
+ a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
1227
+ cute.copy(
1228
+ tiled_copy_AIdx,
1229
+ tAgAIdx[None, None, k_tile],
1230
+ tAsAIdx[None, None, a_prefetch_producer_state.index],
1231
+ pred=tApAIdx_k,
1232
+ )
1233
+ a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
1234
+ a_prefetch_producer_state.advance()
1235
+ # Advance to next tile
1236
+ tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
1237
+ tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
1238
+ work_tile = tile_scheduler.get_current_work()
1239
+ # End of persistent scheduler loop
1240
+ if is_scheduler_warp:
1241
+ tile_scheduler.producer_tail()
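The scheduler warp above prefetches the A gather indices one tile at a time, predicating the copy on the final tile so that only indices below the valid length are read. A rough stand-alone sketch of that bounds-masked tail copy (plain Python, hypothetical names):

def prefetch_index_tiles(a_idx, len_valid, tile, stage_bufs):
    n_tiles = -(-len_valid // tile)                 # ceil_div
    for t in range(n_tiles):
        base = t * tile
        limit = len_valid - base                    # only the tail tile is partial
        pred = [i < limit for i in range(tile)]     # per-element predicate
        stage = t % len(stage_bufs)                 # producer_acquire would gate this
        stage_bufs[stage] = [a_idx[base + i] if pred[i] else 0 for i in range(tile)]
        # producer_commit / state.advance() would follow here
    return n_tiles

if __name__ == "__main__":
    stages = [None] * 3                             # prefetch stage count is made up
    print(prefetch_index_tiles(list(range(300)), len_valid=300, tile=128,
                               stage_bufs=stages))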
1242
+
1070
1243
  #
1071
1244
  # Specialized TMA epi load warp
1072
1245
  #
1073
1246
  if const_expr(mC_mnl is not None):
1074
- if warp_idx == self.tma_epi_warp_id:
1247
+ if warp_idx == self.epi_load_warp_id:
1075
1248
  epi_producer_state = pipeline.make_pipeline_state(
1076
- pipeline.PipelineUserType.Producer, self.num_c_stage
1249
+ pipeline.PipelineUserType.Producer, self.epi_c_stage
1077
1250
  )
1078
- do_epi_load_barrier_wait = cutlass.Boolean(True)
1251
+ do_epi_load_barrier_wait = Boolean(True)
1079
1252
  # Persistent tile scheduling loop
1080
1253
  tile_scheduler = TileSchedulerCls()
1081
1254
  work_tile = tile_scheduler.initial_work_tile_info()
1082
1255
  while work_tile.is_valid_tile:
1083
1256
  # Get tile coord from tile scheduler
1084
1257
  tile_coord_mnkl = work_tile.tile_idx
1085
- mma_tile_coord_mnl = (
1086
- tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
1087
- tile_coord_mnkl[1],
1088
- tile_coord_mnkl[3],
1089
- )
1090
- # Local_tile partition global tensors
1091
- # (bM, bN)
1092
- gC_mnl = cute.local_tile(
1093
- mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), mma_tile_coord_mnl
1258
+ batch_idx = tile_coord_mnkl[3]
1259
+ copy_C_fn, _, bGS_gC = self.epilog_gmem_copy_and_partition(
1260
+ tma_atom_c,
1261
+ varlen_manager.offset_batch_epi(mC_mnl, batch_idx),
1262
+ self.cta_tile_shape_mnk[:2],
1263
+ epi_tile,
1264
+ sC,
1265
+ tile_coord_mnkl,
1094
1266
  )
1095
- # Partition global tensor for TiledMMA_A/B/D
1096
- # (MMA, MMA_M, MMA_N)
1097
- tCgC = thr_mma.partition_C(gC_mnl)
1098
- # bGS_gC has shape ((ATOM_V, REST_V), EPI_M, EPI_N)
1099
- bGS_sC, bGS_gC = self.epilog_gmem_copy_and_partition(
1100
- tma_atom_c, tCgC, epi_tile, sC
1101
- )
1102
- bGS_gC = cute.group_modes(bGS_gC, 1, cute.rank(bGS_gC))
1267
+ copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)
1103
1268
  if do_epi_load_barrier_wait:
1104
1269
  epi_load_barrier.arrive_and_wait()
1105
- do_epi_load_barrier_wait = cutlass.Boolean(False)
1270
+ do_epi_load_barrier_wait = Boolean(False)
1106
1271
  epi_tile_num = const_expr(cute.size(bGS_gC, mode=[1]))
1107
- for subtile_idx in cutlass.range(epi_tile_num, unroll=1):
1272
+ for epi_idx in cutlass.range(epi_tile_num, unroll=1):
1108
1273
  epi_pipeline.producer_acquire(epi_producer_state)
1109
- cute.copy(
1110
- tma_atom_c,
1111
- bGS_gC[None, subtile_idx],
1112
- bGS_sC[None, epi_producer_state.index],
1113
- tma_bar_ptr=epi_pipeline.producer_get_barrier(epi_producer_state),
1114
- )
1274
+ copy_C(src_idx=epi_idx, producer_state=epi_producer_state)
1115
1275
  # Epi pipeline's producer commit is a NOP
1116
1276
  epi_pipeline.producer_commit(epi_producer_state)
1117
1277
  epi_producer_state.advance()
@@ -1132,7 +1292,7 @@ class PersistentDenseGemmKernel:
1132
1292
  )
1133
1293
  # Partition shared/tensor memory tensor for TiledMMA_A/B/D
1134
1294
  # (MMA, MMA_M, MMA_K, STAGE)
1135
- tCrA = tiled_mma.make_fragment_A(sA)
1295
+ tCrA = tiled_mma.make_fragment_A(sA_mma)
1136
1296
  # (MMA, MMA_N, MMA_K, STAGE)
1137
1297
  tCrB = tiled_mma.make_fragment_B(sB)
1138
1298
  # (MMA, MMA_M, MMA_N, STAGE)
@@ -1149,10 +1309,9 @@ class PersistentDenseGemmKernel:
1149
1309
  tiled_mma,
1150
1310
  self.mma_tiler,
1151
1311
  self.sf_vec_size,
1152
- cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
1312
+ cute.slice_(sfa_smem_layout, (None, None, None, 0)),
1153
1313
  )
1154
1314
  tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
1155
-
1156
1315
  # Make SFB tmem tensor
1157
1316
  sfb_tmem_ptr = cute.recast_ptr(
1158
1317
  acc_tmem_ptr
@@ -1165,7 +1324,7 @@ class PersistentDenseGemmKernel:
1165
1324
  tiled_mma,
1166
1325
  self.mma_tiler,
1167
1326
  self.sf_vec_size,
1168
- cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
1327
+ cute.slice_(sfb_smem_layout, (None, None, None, 0)),
1169
1328
  )
1170
1329
  tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout)
1171
1330
  # Partition for S2T copy of SFA/SFB
@@ -1180,6 +1339,7 @@ class PersistentDenseGemmKernel:
1180
1339
  tCtSFB_compact_s2t,
1181
1340
  ) = self.mainloop_s2t_copy_and_partition(sSFB, tCtSFB)
1182
1341
  else:
1342
+ tCtSFA, tCtSFB = None, None
1183
1343
  tiled_copy_s2t_sfa, tCsSFA_compact_s2t, tCtSFA_compact_s2t = None, None, None
1184
1344
  tiled_copy_s2t_sfb, tCsSFB_compact_s2t, tCtSFB_compact_s2t = None, None, None
1185
1345
 
@@ -1187,7 +1347,7 @@ class PersistentDenseGemmKernel:
1187
1347
  tile_scheduler = TileSchedulerCls()
1188
1348
  work_tile = tile_scheduler.initial_work_tile_info()
1189
1349
  ab_consumer_state = pipeline.make_pipeline_state(
1190
- pipeline.PipelineUserType.Consumer, self.num_ab_stage
1350
+ pipeline.PipelineUserType.Consumer, self.ab_stage
1191
1351
  )
1192
1352
  acc_producer_state = pipeline.make_pipeline_state(
1193
1353
  pipeline.PipelineUserType.Producer, self.num_acc_stage
@@ -1195,6 +1355,9 @@ class PersistentDenseGemmKernel:
1195
1355
  while work_tile.is_valid_tile:
1196
1356
  # Get tile coord from tile scheduler
1197
1357
  tile_coord_mnkl = work_tile.tile_idx
1358
+ batch_idx = tile_coord_mnkl[3]
1359
+ k_len = varlen_manager.len_k(batch_idx)
1360
+ k_tile_cnt = cute.ceil_div(k_len, self.mma_tiler[2])
1198
1361
  # Set tensor memory buffer for current tile
1199
1362
  # (MMA, MMA_M, MMA_N)
1200
1363
  tCtAcc = tCtAcc_base[None, None, None, acc_producer_state.index]
@@ -1209,6 +1372,9 @@ class PersistentDenseGemmKernel:
1209
1372
  tCtAcc,
1210
1373
  k_tile_cnt,
1211
1374
  is_leader_cta,
1375
+ cta_rank_in_cluster,
1376
+ tCtSFA,
1377
+ tCtSFB,
1212
1378
  tiled_copy_s2t_sfa,
1213
1379
  tiled_copy_s2t_sfb,
1214
1380
  tCsSFA_compact_s2t,
@@ -1234,6 +1400,14 @@ class PersistentDenseGemmKernel:
1234
1400
  )
1235
1401
  # Bar sync for retrieve tensor memory ptr from shared memory
1236
1402
  tmem_alloc_barrier.arrive_and_wait()
1403
+
1404
+ is_tma_warp = Boolean(warp_idx == self.epilog_warp_id[0])
1405
+ varlen_manager.init_tensormap_epi(
1406
+ tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
1407
+ )
1408
+ tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
1409
+ tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
1410
+
1237
1411
  # Retrieving tensor memory ptr and make accumulator tensor
1238
1412
  acc_tmem_ptr = cute.arch.retrieve_tmem_ptr(
1239
1413
  self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
@@ -1241,9 +1415,9 @@ class PersistentDenseGemmKernel:
1241
1415
  # (MMA, MMA_M, MMA_N, STAGE)
1242
1416
  tCtAcc_base = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
1243
1417
 
1244
- epilog_threads = cute.arch.WARP_SIZE * len(self.epilog_warp_id)
1245
1418
  epilogue_barrier = pipeline.NamedBarrier(
1246
- barrier_id=self.epilog_sync_bar_id, num_threads=epilog_threads
1419
+ barrier_id=int(NamedBarrierGemm.Epilogue),
1420
+ num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
1247
1421
  )
1248
1422
 
1249
1423
  # Partition for epilogue
@@ -1252,19 +1426,16 @@ class PersistentDenseGemmKernel:
1252
1426
  epi_tidx, tCtAcc_base, epi_tile, use_2cta_instrs
1253
1427
  )
1254
1428
 
1255
- tTR_rD = cute.make_fragment(tTR_rAcc.shape, self.d_dtype)
1256
- tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_copy_and_partition(
1257
- tiled_copy_t2r, tTR_rD, epi_tidx, sD
1429
+ tTR_rD = cute.make_fragment(tTR_rAcc.shape, self.acc_dtype)
1430
+ tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
1431
+ tiled_copy_t2r, self.d_layout, self.d_dtype, tTR_rD, sD, epi_tidx
1258
1432
  )
1433
+ tRS_rC, tSR_rC, tSR_sC = None, None, None
1434
+ tiled_copy_s2r = None
1259
1435
  if const_expr(mC_mnl is not None):
1260
- tTR_rC = cute.make_fragment_like(tTR_rD, self.c_dtype)
1261
- tiled_copy_s2r, tSR_rC, tSR_sC = self.epilog_smem_copy_and_partition(
1262
- tiled_copy_t2r, tTR_rC, epi_tidx, sC
1436
+ tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
1437
+ tiled_copy_t2r, self.c_layout, self.c_dtype, sC, tRS_rD.layout, epi_tidx
1263
1438
  )
1264
- # TODO: for m major, D is being stored w STSM so we'd need LDSM here
1265
- # tRS_rC = tSR_rC # TODO: retile?
1266
- tRS_rC = cute.make_fragment(tRS_rD.layout, self.c_dtype)
1267
- tSR_rC = tiled_copy_s2r.get_slice(epi_tidx).retile(tRS_rC)
1268
1439
 
1269
1440
  # Persistent tile scheduling loop
1270
1441
  tile_scheduler = TileSchedulerCls()
@@ -1272,37 +1443,27 @@ class PersistentDenseGemmKernel:
1272
1443
  acc_consumer_state = pipeline.make_pipeline_state(
1273
1444
  pipeline.PipelineUserType.Consumer, self.num_acc_stage
1274
1445
  )
1275
- # Threads/warps participating in tma store pipeline
1276
- d_producer_group = pipeline.CooperativeGroup(
1277
- pipeline.Agent.Thread,
1278
- 32 * len(self.epilog_warp_id),
1279
- 32 * len(self.epilog_warp_id),
1280
- )
1281
- d_pipeline = pipeline.PipelineTmaStore.create(
1282
- num_stages=self.num_d_stage, producer_group=d_producer_group
1283
- )
1446
+ epi_store_pipeline = self.make_epi_store_pipeline()
1284
1447
  epi_read_state = pipeline.make_pipeline_state(
1285
- pipeline.PipelineUserType.Consumer, self.num_c_stage
1448
+ pipeline.PipelineUserType.Consumer, self.epi_c_stage
1286
1449
  )
1287
-
1450
+ if const_expr(varlen_m):
1451
+ # wait tensormap initialization complete before update
1452
+ varlen_manager.fence_tensormap_init()
1288
1453
  while work_tile.is_valid_tile:
1289
1454
  # Get tile coord from tile scheduler
1290
1455
  tile_coord_mnkl = work_tile.tile_idx
1291
- mma_tile_coord_mnl = (
1292
- tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
1293
- tile_coord_mnkl[1],
1294
- tile_coord_mnkl[3],
1456
+ batch_idx = tile_coord_mnkl[3]
1457
+ epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
1458
+ epilogue_params, varlen_params.cu_seqlens_m, batch_idx
1295
1459
  )
1296
- # Local_tile partition global tensors
1297
- # (bM, bN)
1298
- gD_mnl = cute.local_tile(
1299
- mD_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), mma_tile_coord_mnl
1460
+ varlen_manager.update_tensormap_epi(
1461
+ batch_idx,
1462
+ self.d_layout,
1463
+ epi_shapes,
1464
+ epi_orders,
1465
+ is_tma_warp,
1300
1466
  )
1301
- # Partition global tensor for TiledMMA_A/B/D
1302
- # (MMA, MMA_M, MMA_N)
1303
- tDgD = thr_mma.partition_C(gD_mnl)
1304
- # bSG_gD has shape ((ATOM_V, REST_V), EPI_M, EPI_N)
1305
- bSG_sD, bSG_gD = self.epilog_gmem_copy_and_partition(tma_atom_d, tDgD, epi_tile, sD)
1306
1467
 
1307
1468
  # Set tensor memory buffer for current tile
1308
1469
  # (T2R, T2R_M, T2R_N, EPI_M, EPI_M)
@@ -1311,49 +1472,59 @@ class PersistentDenseGemmKernel:
1311
1472
  # Wait for accumulator buffer full
1312
1473
  acc_pipeline.consumer_wait(acc_consumer_state)
1313
1474
 
1314
- tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc))
1315
- bSG_gD = cute.group_modes(bSG_gD, 1, cute.rank(bSG_gD))
1316
-
1317
- # Store accumulator to global memory in subtiles
1318
- subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])
1319
- num_prev_subtiles = tile_scheduler.num_tiles_executed * subtile_cnt
1320
- for subtile_idx in cutlass.range(subtile_cnt):
1321
- # Load accumulator from tensor memory buffer to register
1322
- tTR_tAcc_mn = tTR_tAcc[None, None, None, subtile_idx]
1323
- cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc)
1324
- # Convert to D type
1325
- acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load()
1326
- acc_vec = epilogue_op(acc_vec)
1327
- if const_expr(mC_mnl is not None):
1328
- epi_pipeline.consumer_wait(epi_read_state)
1329
- cute.copy(
1330
- tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC
1331
- )
1332
- # Fence to make sure shared memory read is visible to TMA load
1333
- cute.arch.fence_proxy(
1334
- cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
1335
- )
1336
- cute.arch.sync_warp()
1337
- with cute.arch.elect_one():
1338
- epi_pipeline.consumer_release(epi_read_state)
1339
- epi_read_state.advance()
1340
- acc_vec = acc_vec + tRS_rC.load().to(self.acc_dtype)
1341
- tRS_rD.store(acc_vec.to(self.d_dtype))
1342
- # Store D to shared memory
1343
- d_buffer = (num_prev_subtiles + subtile_idx) % self.num_d_stage
1344
- cute.copy(tiled_copy_r2s, tRS_rD, tRS_sD[(None, None, None, d_buffer)])
1345
- # Fence and barrier to make sure shared memory store is visible to TMA store
1346
- cute.arch.fence_proxy(
1347
- cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
1475
+ varlen_manager.fence_tensormap_update_epi(is_tma_warp)
1476
+
1477
+ copy_D = None
1478
+ if const_expr(has_D):
1479
+ copy_D, _, _ = self.epilog_gmem_copy_and_partition(
1480
+ tma_atom_d,
1481
+ varlen_manager.offset_batch_epi(mD_mnl, batch_idx),
1482
+ self.cta_tile_shape_mnk[:2],
1483
+ epi_tile,
1484
+ sD,
1485
+ tile_coord_mnkl,
1486
+ tma_desc_ptr=tma_desc_d_ptr,
1348
1487
  )
1349
- epilogue_barrier.arrive_and_wait()
1350
- # TMA store D to global memory
1351
- if warp_idx == self.epilog_warp_id[0]:
1352
- cute.copy(tma_atom_d, bSG_sD[None, d_buffer], bSG_gD[None, subtile_idx])
1353
- # Fence and barrier to make sure shared memory store is visible to TMA store
1354
- d_pipeline.producer_commit()
1355
- d_pipeline.producer_acquire()
1356
- epilogue_barrier.arrive_and_wait()
1488
+ copy_C = None # We're using a separate warp to load C
1489
+
1490
+ tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc))
1491
+ k_len = varlen_manager.len_k(batch_idx)
1492
+ load_acc_subtile = partial(
1493
+ self.epi_load_acc_subtile,
1494
+ tiled_copy_t2r,
1495
+ tiled_copy_r2s,
1496
+ tTR_tAcc,
1497
+ tTR_rAcc,
1498
+ clear_acc=varlen_k and k_len == 0,
1499
+ )
1500
+
1501
+ epi_read_state, _ = self.epilogue(
1502
+ epilogue_params,
1503
+ epi_smem_tensors,
1504
+ tma_desc_epi_ptrs,
1505
+ epi_pipeline,
1506
+ epi_store_pipeline,
1507
+ epi_read_state,
1508
+ None, # epi_producer_state
1509
+ epi_tile,
1510
+ load_acc_subtile,
1511
+ tRS_rD,
1512
+ tRS_rC,
1513
+ tiled_copy_t2r,
1514
+ tiled_copy_r2s,
1515
+ tRS_sD,
1516
+ tiled_copy_s2r,
1517
+ tSR_rC,
1518
+ tSR_sC,
1519
+ copy_D,
1520
+ copy_C,
1521
+ tile_coord_mnkl,
1522
+ varlen_manager,
1523
+ epilogue_barrier,
1524
+ tile_scheduler,
1525
+ epi_tidx,
1526
+ is_tma_warp,
1527
+ )
1357
1528
 
1358
1529
  # Async arrive accumulator buffer empty
1359
1530
  with cute.arch.elect_one():
@@ -1369,7 +1540,7 @@ class PersistentDenseGemmKernel:
1369
1540
  cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs)
1370
1541
  epilogue_barrier.arrive_and_wait()
1371
1542
  if warp_idx == self.epilog_warp_id[0]:
1372
- if use_2cta_instrs:
1543
+ if const_expr(use_2cta_instrs):
1373
1544
  cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1)
1374
1545
  cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0)
1375
1546
  cute.arch.dealloc_tmem(
@@ -1377,82 +1548,54 @@ class PersistentDenseGemmKernel:
1377
1548
  )
1378
1549
 
1379
1550
  # Wait for D store complete
1380
- d_pipeline.producer_tail()
1551
+ if is_tma_warp:
1552
+ epi_store_pipeline.producer_tail()
1381
1553
 
1382
1554
  @cute.jit
1383
- def load_AB(
1555
+ def load_A_gather_A(
1384
1556
  self,
1385
- ab_pipeline: cutlass.pipeline.PipelineAsync,
1386
- ab_producer_state: cutlass.pipeline.PipelineState,
1387
- tma_atom_a: cute.CopyAtom,
1388
- tAgA: cute.Tensor,
1389
- tAsA: cute.Tensor,
1390
- a_mcast_mask: cutlass.Int16,
1391
- tma_atom_b: cute.CopyAtom,
1392
- tBgB: cute.Tensor,
1393
- tBsB: cute.Tensor,
1394
- b_mcast_mask: cutlass.Int16,
1395
- tma_atom_sfa: Optional[cute.CopyAtom] = None,
1396
- tAgSFA: Optional[cute.Tensor] = None,
1397
- tAsSFA: Optional[cute.Tensor] = None,
1398
- sfa_mcast_mask: Optional[cutlass.Int16] = None,
1399
- tma_atom_sfb: Optional[cute.CopyAtom] = None,
1400
- tBgSFB: Optional[cute.Tensor] = None,
1401
- tBsSFB: Optional[cute.Tensor] = None,
1402
- sfb_mcast_mask: Optional[cutlass.Int16] = None,
1403
- ) -> cutlass.pipeline.PipelineState:
1404
- blockscaled = const_expr(tma_atom_sfa is not None)
1405
- if const_expr(blockscaled):
1406
- assert all(x is not None for x in (tma_atom_sfa, tAgSFA, tAsSFA))
1407
- assert all(x is not None for x in (tma_atom_sfb, tBgSFB, tBsSFB))
1408
- k_tile_cnt = cute.size(tAgA, mode=[1])
1557
+ a_pipeline: cutlass.pipeline.PipelineAsync,
1558
+ a_producer_state: cutlass.pipeline.PipelineState,
1559
+ a_prefetch_consumer_state: Optional[cutlass.pipeline.PipelineState],
1560
+ copy_A: Callable,
1561
+ prefetch_A: Optional[Callable],
1562
+ k_tile_cnt: Int32,
1563
+ ) -> Tuple[cutlass.pipeline.PipelineState, Optional[cutlass.pipeline.PipelineState]]:
1409
1564
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
1410
- peek_ab_empty_status = cutlass.Boolean(True)
1565
+ peek_a_empty_status = Boolean(True)
1411
1566
  if 0 < k_tile_cnt:
1412
- peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
1567
+ peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state)
1413
1568
  # /////////////////////////////////////////////////////////////////////////
1414
- # TMA load
1569
+ # cp.async on A
1415
1570
  # /////////////////////////////////////////////////////////////////////////
1416
- for k_tile in cutlass.range(k_tile_cnt, unroll=1):
1417
- # Wait for A/B buffers to be empty before loading into them
1418
- # Also sets the transaction barrier for the A/B buffers
1419
- ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status)
1420
- cute.copy(
1421
- tma_atom_a,
1422
- tAgA[None, k_tile],
1423
- tAsA[None, ab_producer_state.index],
1424
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1425
- mcast_mask=a_mcast_mask,
1426
- )
1427
- cute.copy(
1428
- tma_atom_b,
1429
- tBgB[None, k_tile],
1430
- tBsB[None, ab_producer_state.index],
1431
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1432
- mcast_mask=b_mcast_mask,
1433
- )
1434
- if const_expr(blockscaled):
1435
- cute.copy(
1436
- tma_atom_sfa,
1437
- tAgSFA[None, ab_producer_state.count],
1438
- tAsSFA[None, ab_producer_state.index],
1439
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1440
- mcast_mask=sfa_mcast_mask,
1441
- )
1442
- cute.copy(
1443
- tma_atom_sfb,
1444
- tBgSFB[None, ab_producer_state.count],
1445
- tBsSFB[None, ab_producer_state.index],
1446
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1447
- mcast_mask=sfb_mcast_mask,
1448
- )
1449
- # Mainloop pipeline's producer commit is a NOP
1450
- ab_pipeline.producer_commit(ab_producer_state)
1451
- ab_producer_state.advance()
1452
- peek_ab_empty_status = cutlass.Boolean(True)
1571
+ is_tma_warp = False
1572
+ for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
1573
+ smem_idx = a_producer_state.index
1574
+ prefetch_out = ()
1575
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
1576
+ prefetch_out = (prefetch_A(k_tile, smem_idx, a_prefetch_consumer_state),)
1577
+ a_prefetch_consumer_state.advance()
1578
+ a_pipeline.producer_acquire(a_producer_state, peek_a_empty_status, is_tma_warp)
1579
+ copy_A(k_tile, smem_idx, *prefetch_out)
1580
+ # This tells mbarrier to track the completion of cp.async
1581
+ a_pipeline.producer_cpasync_commit(a_producer_state)
1582
+ a_producer_state.advance()
1583
+ peek_a_empty_status = Boolean(True)
1453
1584
  if k_tile + 1 < k_tile_cnt:
1454
- peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
1455
- return ab_producer_state
1585
+ peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state)
1586
+ # Bounds checking in the K dimension on the last k_tile
1587
+ if 0 < k_tile_cnt:
1588
+ k_tile = k_tile_cnt - 1
1589
+ smem_idx = a_producer_state.index
1590
+ prefetch_out = ()
1591
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
1592
+ prefetch_out = (prefetch_A(k_tile, smem_idx, a_prefetch_consumer_state, pred=True),)
1593
+ a_prefetch_consumer_state.advance()
1594
+ a_pipeline.producer_acquire(a_producer_state, peek_a_empty_status, is_tma_warp)
1595
+ copy_A(k_tile, smem_idx, *prefetch_out, pred=True)
1596
+ a_pipeline.producer_cpasync_commit(a_producer_state)
1597
+ a_producer_state.advance()
1598
+ return a_producer_state, a_prefetch_consumer_state
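load_A_gather_A follows the usual producer pattern: peek the next buffer with a non-blocking try-acquire while the current tile is still in flight, then acquire, copy, commit, and advance, with the last k-tile split out so the copy can be predicated. A toy model of that loop (plain Python stand-ins, not the cutlass.pipeline API):

class ToyPipeline:
    def __init__(self, num_stages):
        self.empty = [True] * num_stages
    def producer_try_acquire(self, idx):
        return self.empty[idx]                      # non-blocking peek
    def producer_acquire(self, idx, peeked):
        # the real code blocks on an mbarrier only when the peek failed
        self.empty[idx] = False
    def producer_commit(self, idx):
        self.empty[idx] = True                      # consumer release folded in for brevity

def produce(pipe, k_tile_cnt, num_stages, copy_fn):
    peek = pipe.producer_try_acquire(0) if k_tile_cnt > 0 else True
    for k in range(k_tile_cnt):
        idx = k % num_stages
        pipe.producer_acquire(idx, peek)
        copy_fn(k, idx, pred=(k == k_tile_cnt - 1))  # predicate only the tail tile
        pipe.producer_commit(idx)
        if k + 1 < k_tile_cnt:
            peek = pipe.producer_try_acquire((k + 1) % num_stages)

if __name__ == "__main__":
    produce(ToyPipeline(4), k_tile_cnt=6, num_stages=4,
            copy_fn=lambda k, idx, pred: print(f"copy k={k} -> stage {idx}, pred={pred}"))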
1456
1599
 
1457
1600
  @cute.jit
1458
1601
  def mma(
@@ -1466,7 +1609,10 @@ class PersistentDenseGemmKernel:
1466
1609
  tCrB: cute.Tensor,
1467
1610
  acc: cute.Tensor,
1468
1611
  k_tile_cnt: Int32,
1469
- is_leader_cta: cutlass.Boolean,
1612
+ is_leader_cta: Boolean,
1613
+ cta_rank_in_cluster: Int32,
1614
+ tCtSFA: Optional[cute.Tensor] = None,
1615
+ tCtSFB: Optional[cute.Tensor] = None,
1470
1616
  tiled_copy_s2t_sfa: Optional[cute.TiledCopy] = None,
1471
1617
  tiled_copy_s2t_sfb: Optional[cute.TiledCopy] = None,
1472
1618
  tCsSFA_compact_s2t: Optional[cute.Tensor] = None,
@@ -1476,12 +1622,17 @@ class PersistentDenseGemmKernel:
1476
1622
  ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState, cute.TiledMma]:
1477
1623
  blockscaled = const_expr(tiled_copy_s2t_sfa is not None)
1478
1624
  if const_expr(blockscaled):
1625
+ assert all(x is not None for x in (tCtSFA, tCtSFB))
1479
1626
  assert all(x is not None for x in (tiled_copy_s2t_sfa, tiled_copy_s2t_sfb))
1480
1627
  assert all(x is not None for x in (tCsSFA_compact_s2t, tCsSFB_compact_s2t))
1481
1628
  assert all(x is not None for x in (tCtSFA_compact_s2t, tCtSFB_compact_s2t))
1629
+ # If gather_A and use_2cta_instrs, the cp.async for the non-leader CTA will
1630
+ # arrive at an mbarrier on the non-leader CTA side, then the mma warp of the non-leader
1631
+ # CTA will wait for that and then arrive at the mbarrier on the leader CTA.
1632
+ need_nonleader_cta = const_expr(self.gather_A and self.use_2cta_instrs)
1482
1633
  # Peek (try_wait) AB buffer full for k_tile = 0
1483
- peek_ab_full_status = cutlass.Boolean(True)
1484
- if 0 < k_tile_cnt and is_leader_cta:
1634
+ peek_ab_full_status = Boolean(True)
1635
+ if 0 < k_tile_cnt and (is_leader_cta or need_nonleader_cta):
1485
1636
  peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
1486
1637
  # Wait for accumulator buffer empty
1487
1638
  if is_leader_cta:
@@ -1491,6 +1642,14 @@ class PersistentDenseGemmKernel:
1491
1642
  # Mma mainloop
1492
1643
  num_k_blocks = cute.size(tCrA, mode=[2])
1493
1644
  for k_tile in cutlass.range(k_tile_cnt, unroll=1):
1645
+ if const_expr(need_nonleader_cta):
1646
+ if not is_leader_cta:
1647
+ ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status)
1648
+ with cute.arch.elect_one():
1649
+ # The odd CTA signals the even CTA
1650
+ ab_pipeline.sync_object_full.arrive_mbarrier(
1651
+ ab_consumer_state.index, dst_rank=cta_rank_in_cluster & 0xFE
1652
+ )
1494
1653
  if is_leader_cta:
1495
1654
  # Conditionally wait for AB buffer full
1496
1655
  ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status)
@@ -1503,14 +1662,19 @@ class PersistentDenseGemmKernel:
1503
1662
  cute.copy(tiled_copy_s2t_sfb, tCsSFB_compact_s2t_staged, tCtSFB_compact_s2t)
1504
1663
  for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
1505
1664
  k_blk_coord = (None, None, k_blk_idx, ab_consumer_state.index)
1665
+ if const_expr(blockscaled):
1666
+ # Set SFA/SFB tensor to tiled_mma
1667
+ sf_kblock_coord = (None, None, k_blk_idx)
1668
+ tiled_mma.set(tcgen05.Field.SFA, tCtSFA[sf_kblock_coord].iterator)
1669
+ tiled_mma.set(tcgen05.Field.SFB, tCtSFB[sf_kblock_coord].iterator)
1506
1670
  cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
1507
1671
  tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
1508
1672
  # Async arrive AB buffer empty
1509
1673
  ab_pipeline.consumer_release(ab_consumer_state)
1510
1674
  ab_consumer_state.advance()
1511
1675
  # Peek (try_wait) AB buffer full for k_tile = k_tile + 1
1512
- peek_ab_full_status = cutlass.Boolean(True)
1513
- if k_tile + 1 < k_tile_cnt and is_leader_cta:
1676
+ peek_ab_full_status = Boolean(True)
1677
+ if k_tile + 1 < k_tile_cnt and (is_leader_cta or need_nonleader_cta):
1514
1678
  peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
1515
1679
  # Async arrive accumulator buffer full
1516
1680
  if is_leader_cta:
@@ -1520,6 +1684,25 @@ class PersistentDenseGemmKernel:
1520
1684
  # "operand #0 does not dominate this use"
1521
1685
  return ab_consumer_state, acc_producer_state, tiled_mma
1522
1686
 
1687
+ @cute.jit
1688
+ def epi_load_acc_subtile(
1689
+ self,
1690
+ tiled_copy_t2r: cute.TiledCopy,
1691
+ tiled_copy_r2s: cute.TiledCopy,
1692
+ tTR_tAcc: cute.Tensor,
1693
+ tTR_rAcc: cute.Tensor,
1694
+ tRS_rD: cute.Tensor,
1695
+ epi_idx: int,
1696
+ clear_acc: Boolean = False,
1697
+ ):
1698
+ if not clear_acc:
1699
+ # Load accumulator from tensor memory buffer to register
1700
+ cute.copy(tiled_copy_t2r, tTR_tAcc[None, None, None, epi_idx], tTR_rAcc)
1701
+ tRS_rAcc = tiled_copy_r2s.retile(tTR_rAcc)
1702
+ tRS_rD.store(tRS_rAcc.load())
1703
+ else:
1704
+ tRS_rD.fill(0.0)
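epi_load_acc_subtile either pulls one accumulator subtile out of tensor memory or, for a varlen batch with k_len == 0 where no MMAs ran, zero-fills the destination fragment so the epilogue still writes a well-defined output. A tiny illustration with lists standing in for fragments:

def load_acc_subtile(acc_subtiles, epi_idx, out_frag, clear_acc=False):
    if not clear_acc:
        out_frag[:] = acc_subtiles[epi_idx]         # T2R copy + retile in the real kernel
    else:
        out_frag[:] = [0.0] * len(out_frag)         # k_len == 0: nothing was accumulated

if __name__ == "__main__":
    frag = [None] * 4
    load_acc_subtile([[1, 2, 3, 4], [5, 6, 7, 8]], epi_idx=1, out_frag=frag)
    print(frag)                                     # [5, 6, 7, 8]
    load_acc_subtile([], epi_idx=0, out_frag=frag, clear_acc=True)
    print(frag)                                     # [0.0, 0.0, 0.0, 0.0]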
1705
+
1523
1706
  def mainloop_s2t_copy_and_partition(
1524
1707
  self,
1525
1708
  sSF: cute.Tensor,
@@ -1560,7 +1743,7 @@ class PersistentDenseGemmKernel:
1560
1743
  tidx: Int32,
1561
1744
  tAcc: cute.Tensor,
1562
1745
  epi_tile: cute.Tile,
1563
- use_2cta_instrs: Union[cutlass.Boolean, bool],
1746
+ use_2cta_instrs: Union[Boolean, bool],
1564
1747
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1565
1748
  """
1566
1749
  Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).
@@ -1583,8 +1766,8 @@ class PersistentDenseGemmKernel:
1583
1766
  # Make tiledCopy for tensor memory load
1584
1767
  copy_atom_t2r = sm100_utils.get_tmem_load_op(
1585
1768
  self.cta_tile_shape_mnk,
1586
- self.d_layout,
1587
- self.d_dtype,
1769
+ self.d_layout if self.d_layout is not None else LayoutEnum.ROW_MAJOR,
1770
+ self.d_dtype if self.d_dtype is not None else cutlass.BFloat16,
1588
1771
  self.acc_dtype,
1589
1772
  epi_tile,
1590
1773
  use_2cta_instrs,
@@ -1607,12 +1790,14 @@ class PersistentDenseGemmKernel:
1607
1790
  tTR_rAcc = cute.make_fragment(tTR_cAcc[None, None, None, 0, 0].shape, self.acc_dtype)
1608
1791
  return tiled_copy_t2r, tTR_tAcc, tTR_rAcc
1609
1792
 
1610
- def epilog_smem_copy_and_partition(
1793
+ def epilog_smem_store_and_partition(
1611
1794
  self,
1612
1795
  tiled_copy_t2r: cute.TiledCopy,
1796
+ d_layout: Optional[LayoutEnum],
1797
+ dtype: Optional[Type[cutlass.Numeric]],
1613
1798
  tTR_rD: cute.Tensor,
1614
- tidx: Int32,
1615
1799
  sD: cute.Tensor,
1800
+ tidx: Int32,
1616
1801
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1617
1802
  """
1618
1803
  Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).
@@ -1634,93 +1819,183 @@ class PersistentDenseGemmKernel:
1634
1819
  :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
1635
1820
  """
1636
1821
  copy_atom_r2s = sm100_utils.get_smem_store_op(
1637
- self.d_layout, self.d_dtype, self.acc_dtype, tiled_copy_t2r
1822
+ d_layout if d_layout is not None else LayoutEnum.ROW_MAJOR,
1823
+ dtype if dtype is not None else cutlass.BFloat16,
1824
+ self.acc_dtype,
1825
+ tiled_copy_t2r,
1638
1826
  )
1639
1827
  tiled_copy_r2s = cute.make_tiled_copy_D(copy_atom_r2s, tiled_copy_t2r)
1640
1828
  # (R2S, R2S_M, R2S_N, PIPE_D)
1641
1829
  thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
1642
- tRS_sD = thr_copy_r2s.partition_D(sD)
1830
+ tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
1643
1831
  # (R2S, R2S_M, R2S_N)
1644
1832
  tRS_rD = tiled_copy_r2s.retile(tTR_rD)
1645
1833
  return tiled_copy_r2s, tRS_rD, tRS_sD
1646
1834
 
1647
- # def epilog_smem_load_copy_and_partition(
1648
- # self,
1649
- # tiled_copy_t2r: cute.TiledCopy,
1650
- # tTR_rC: cute.Tensor,
1651
- # tidx: Int32,
1652
- # sC: cute.Tensor,
1653
- # ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1654
- # copy_atom_s2r = cute.make_copy_atom(
1655
- # warp.LdMatrix8x8x16bOp(self.c_layout.is_m_major_c(), num_matrices=4),
1656
- # self.c_dtype, # TODO: this probably only works for f16 for now?
1657
- # )
1658
- # # copy_atom_s2r = utils.sm90_get_smem_load_op(self.c_layout, self.c_dtype)
1659
- # tiled_copy_s2r = cute.make_tiled_copy_D(copy_atom_s2r, tiled_copy_t2r)
1660
- # # (R2S, R2S_M, R2S_N, PIPE_D)
1661
- # thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
1662
- # # (R2S, R2S_M, R2S_N)
1663
- # tSR_sC = thr_copy_s2r.partition_S(sC)
1664
- # return tiled_copy_s2r, tSR_sC
1665
-
1666
- def epilog_gmem_copy_and_partition(
1835
+ def epilog_smem_load_and_partition(
1667
1836
  self,
1668
- atom: Union[cute.CopyAtom, cute.TiledCopy],
1669
- gD_mnl: cute.Tensor,
1670
- epi_tile: cute.Tile,
1671
- sD: cute.Tensor,
1672
- ) -> Tuple[cute.Tensor, cute.Tensor]:
1673
- """Make tiledCopy for global memory store, then use it to:
1674
- - partition register array (source) and global memory (destination) for none TMA store version;
1675
- - partition shared memory (source) and global memory (destination) for TMA store version.
1676
-
1677
- :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version
1678
- :type atom: cute.CopyAtom or cute.TiledCopy
1679
- :param gD_mnl: The global tensor C
1680
- :type gD_mnl: cute.Tensor
1681
- :param epi_tile: The epilogue tiler
1682
- :type epi_tile: cute.Tile
1683
- :param sD: The shared memory tensor to be copied and partitioned
1684
- :type sD: cute.Tensor
1837
+ tiled_copy_t2r: cute.TiledCopy,
1838
+ c_layout: LayoutEnum,
1839
+ dtype: Type[cutlass.Numeric],
1840
+ # tTR_rC: cute.Tensor,
1841
+ sC: cute.Tensor,
1842
+ tRS_rD_layout: cutlass.Layout,
1843
+ tidx: Int32,
1844
+ ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1845
+ copy_atom_r2s = sm100_utils.get_smem_store_op(
1846
+ c_layout, dtype, self.acc_dtype, tiled_copy_t2r
1847
+ )
1848
+ store_op = copy_atom_r2s.op
1849
+ # m8n8 16-bit path
1850
+ if isinstance(store_op, StMatrix8x8x16bOp):
1851
+ op = LdMatrix8x8x16bOp(num_matrices=store_op.num_matrices, transpose=store_op.transpose)
1852
+ # m16n8 8-bit store -> m16n16 8-bit load
1853
+ elif isinstance(store_op, StMatrix16x8x8bOp) and store_op.num_matrices in [2, 4]:
1854
+ # transpose=True is enforced by the class
1855
+ op = LdMatrix16x16x8bOp(num_matrices=store_op.num_matrices // 2)
1856
+ else:
1857
+ op = cute.nvgpu.CopyUniversalOp()
1858
+ copy_atom_s2r = cute.make_copy_atom(op, dtype)
1859
+ tiled_copy_s2r = cute.make_tiled_copy_D(copy_atom_s2r, tiled_copy_t2r)
1860
+ thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
1861
+ # (R2S, R2S_M, R2S_N, PIPE_D)
1862
+ tSR_sC = thr_copy_s2r.partition_S(sC)
1863
+ tRS_rC = cute.make_fragment(tRS_rD_layout, dtype)
1864
+ # (R2S, R2S_M, R2S_N)
1865
+ tSR_rC = tiled_copy_s2r.retile(tRS_rC)
1866
+ return tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC
1685
1867
 
1686
- :return: A tuple containing either:
1687
- - For TMA store: (tma_atom_d, bSG_sD, bSG_gD) where:
1688
- - tma_atom_d: The TMA copy atom
1689
- - bSG_sD: The partitioned shared memory tensor C
1690
- - bSG_gD: The partitioned global tensor C
1691
- - For non-TMA store: (simt_atom, tTR_rD, tTR_gD) where:
1692
- - simt_atom: The SIMT copy atom
1693
- - tTR_rD: The register tensor C
1694
- - tTR_gD: The partitioned global tensor C
1695
- :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
1696
- """
1697
- # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N)
1698
- gD_epi = cute.flat_divide(gD_mnl[((None, None), 0, 0)], epi_tile)
1699
- sD_for_tma_partition = cute.group_modes(sD, 0, 2)
1700
- gD_for_tma_partition = cute.group_modes(gD_epi, 0, 2)
1701
- # ((ATOM_V, REST_V), EPI_M, EPI_N)
1702
- bSG_sD, bSG_gD = cpasync.tma_partition(
1703
- atom,
1704
- 0,
1705
- cute.make_layout(1),
1706
- sD_for_tma_partition,
1707
- gD_for_tma_partition,
1868
+ @cute.jit
1869
+ def make_ab_pipeline(
1870
+ self,
1871
+ tiled_mma: cute.TiledMma,
1872
+ cluster_layout_vmnk: cute.Layout,
1873
+ ab_pipeline_mbar_ptr: cute.Pointer,
1874
+ is_leader_cta: Boolean,
1875
+ ) -> pipeline.PipelineAsync:
1876
+ # If gather_A and use_2cta_instrs, the cp.async for the non-leader CTA will
1877
+ # arrive at an mbarrier on the non-leader CTA side, then the mma warp of the non-leader
1878
+ # CTA will wait for that and then arrive at the mbarrier on the leader CTA.
1879
+ # The producer count for the leader CTA is 1 (TMA) + num_cpasync_threads
1880
+ # + 1 (from non-leader CTA).
1881
+ # The producer count for the non-leader CTA is num_cpasync_threads
1882
+ # (TMA doesn't arrive there).
1883
+ if const_expr(not self.gather_A):
1884
+ producer_cnt = 1
1885
+ else:
1886
+ producer_cnt = (self.num_ab_load_warps - 1) * 32 + (
1887
+ 1 if const_expr(not self.use_2cta_instrs) else 2
1888
+ )
1889
+ ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, producer_cnt)
1890
+ # Each warp will contribute the mcast size to the arrive count
1891
+ mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
1892
+ consumer_arrive_cnt = mcast_size
1893
+ ab_pipeline_consumer_group = pipeline.CooperativeGroup(
1894
+ pipeline.Agent.Thread, consumer_arrive_cnt
1708
1895
  )
1709
- return bSG_sD, bSG_gD
1896
+ if const_expr(not self.gather_A):
1897
+ pipeline_ab = pipeline.PipelineTmaUmma.create(
1898
+ barrier_storage=ab_pipeline_mbar_ptr,
1899
+ num_stages=self.ab_stage,
1900
+ producer_group=ab_pipeline_producer_group,
1901
+ consumer_group=ab_pipeline_consumer_group,
1902
+ tx_count=self.num_tma_load_bytes,
1903
+ cta_layout_vmnk=cluster_layout_vmnk,
1904
+ )
1905
+ else:
1906
+ pipeline_ab = PipelineTmaCpAsyncUmma.create(
1907
+ barrier_storage=ab_pipeline_mbar_ptr,
1908
+ num_stages=self.ab_stage,
1909
+ producer_group=ab_pipeline_producer_group,
1910
+ consumer_group=ab_pipeline_consumer_group,
1911
+ tx_count=self.num_tma_load_bytes,
1912
+ cta_layout_vmnk=cluster_layout_vmnk,
1913
+ producer_drop_count=None
1914
+ if not self.use_2cta_instrs
1915
+ else (2 if not is_leader_cta else 0),
1916
+ )
1917
+ return pipeline_ab
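The producer-count arithmetic spelled out in the comment above can be sanity-checked in isolation: TMA contributes a single arrival per stage, each cp.async thread contributes one, and with 2-CTA MMA the non-leader CTA forwards one extra arrival to the leader. A back-of-the-envelope sketch (assuming num_ab_load_warps includes the TMA warp and the remaining warps issue cp.async with 32 threads each):

def ab_producer_count(gather_A, use_2cta_instrs, num_ab_load_warps):
    if not gather_A:
        return 1                                     # one TMA arrival per stage
    cpasync_threads = (num_ab_load_warps - 1) * 32   # remaining warps issue cp.async
    # the leader also sees the TMA arrival and, with 2-CTA MMA, one forwarded arrival
    return cpasync_threads + (1 if not use_2cta_instrs else 2)

if __name__ == "__main__":
    print(ab_producer_count(False, False, 4))        # 1
    print(ab_producer_count(True, False, 4))         # 96 + 1 = 97
    print(ab_producer_count(True, True, 4))          # 96 + 2 = 98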
1710
1918
 
1711
- @staticmethod
1919
+ def make_acc_pipeline(
1920
+ self, cluster_layout_vmnk: cute.Layout, acc_pipeline_mbar_ptr: cute.Pointer
1921
+ ) -> pipeline.PipelineAsync:
1922
+ acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
1923
+ num_acc_consumer_threads = self.num_epi_warps * (2 if self.use_2cta_instrs else 1)
1924
+ acc_pipeline_consumer_group = pipeline.CooperativeGroup(
1925
+ pipeline.Agent.Thread, num_acc_consumer_threads
1926
+ )
1927
+ return pipeline.PipelineUmmaAsync.create(
1928
+ barrier_storage=acc_pipeline_mbar_ptr,
1929
+ num_stages=self.num_acc_stage,
1930
+ producer_group=acc_pipeline_producer_group,
1931
+ consumer_group=acc_pipeline_consumer_group,
1932
+ cta_layout_vmnk=cluster_layout_vmnk,
1933
+ )
1934
+
1935
+ def make_sched_pipeline(
1936
+ self,
1937
+ cluster_layout_mnk: cute.Layout,
1938
+ sched_pipeline_mbar_ptr: cute.Pointer,
1939
+ has_C: bool = False,
1940
+ ) -> pipeline.PipelineAsync:
1941
+ # Threads/warps participating in this pipeline
1942
+ sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
1943
+ cluster_size = cute.size(cluster_layout_mnk)
1944
+ # Each warp that is not the scheduler warp will contribute 1 to the arrive count
1945
+ warps_per_cta = self.num_ab_load_warps + len(
1946
+ (self.mma_warp_id, *self.epilog_warp_id, self.scheduler_warp_id)
1947
+ )
1948
+ if has_C:
1949
+ warps_per_cta += 1
1950
+ consumer_arrive_cnt = warps_per_cta * cluster_size - 1
1951
+ sched_pipeline_consumer_group = pipeline.CooperativeGroup(
1952
+ pipeline.Agent.Thread, consumer_arrive_cnt
1953
+ )
1954
+ return pipeline.PipelineAsync.create(
1955
+ barrier_storage=sched_pipeline_mbar_ptr,
1956
+ num_stages=self.sched_stage,
1957
+ producer_group=sched_pipeline_producer_group,
1958
+ consumer_group=sched_pipeline_consumer_group,
1959
+ # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
1960
+ consumer_mask=None if const_expr(cluster_size == 1) else 0,
1961
+ )
1962
+
1963
+ @cute.jit
1964
+ def make_a_prefetch_pipeline(
1965
+ self, a_prefetch_pipeline_mbar_ptr: cute.Pointer
1966
+ ) -> pipeline.PipelineAsync:
1967
+ producer_cnt = 32
1968
+ a_prefetch_producer_group = pipeline.CooperativeGroup(
1969
+ pipeline.Agent.Thread, producer_cnt, alignment=producer_cnt
1970
+ )
1971
+ consumer_arrive_cnt = self.num_ab_load_warps - 1
1972
+ a_prefetch_consumer_group = pipeline.CooperativeGroup(
1973
+ pipeline.Agent.Thread, consumer_arrive_cnt
1974
+ )
1975
+ return pipeline.PipelineCpAsync.create(
1976
+ barrier_storage=a_prefetch_pipeline_mbar_ptr,
1977
+ num_stages=self.a_prefetch_stage,
1978
+ producer_group=a_prefetch_producer_group,
1979
+ consumer_group=a_prefetch_consumer_group,
1980
+ )
1981
+
1982
+ @classmethod
1712
1983
  def _compute_stages(
1984
+ cls,
1713
1985
  tiled_mma: cute.TiledMma,
1714
1986
  mma_tiler_mnk: Tuple[int, int, int],
1987
+ cta_tile_shape_mnk: Tuple[int, int, int],
1988
+ epi_tile: cute.Tile,
1715
1989
  a_dtype: Type[cutlass.Numeric],
1716
1990
  b_dtype: Type[cutlass.Numeric],
1717
- epi_tile: cute.Tile,
1718
- d_dtype: Type[cutlass.Numeric],
1719
- c_dtype: Optional[Type[cutlass.Numeric]],
1720
- d_layout: cutlass.utils.LayoutEnum,
1721
- c_layout: Optional[cutlass.utils.LayoutEnum],
1722
1991
  sf_dtype: Optional[Type[cutlass.Numeric]],
1723
1992
  sf_vec_size: Optional[int],
1993
+ d_dtype: Optional[Type[cutlass.Numeric]],
1994
+ c_dtype: Optional[Type[cutlass.Numeric]],
1995
+ d_layout: Optional[LayoutEnum],
1996
+ c_layout: Optional[LayoutEnum],
1997
+ epilogue_args: EpilogueArguments,
1998
+ prefetch_A_idx: Literal[None, "varlen_m", "varlen_k"],
1724
1999
  smem_capacity: int,
1725
2000
  occupancy: int,
1726
2001
  ) -> Tuple[int, int, int]:
@@ -1738,8 +2013,8 @@ class PersistentDenseGemmKernel:
1738
2013
  :type epi_tile: cute.Tile
1739
2014
  :param d_dtype: Data type of operand C (output).
1740
2015
  :type d_dtype: type[cutlass.Numeric]
1741
- :param d_layout: Layout enum of operand C.
1742
- :type d_layout: cutlass.utils.LayoutEnum
2016
+ :param d_layout: Layout enum of operand D.
2017
+ :type d_layout: LayoutEnum
1743
2018
  :param smem_capacity: Total available shared memory capacity in bytes.
1744
2019
  :type smem_capacity: int
1745
2020
  :param occupancy: Target number of CTAs per SM (occupancy).
@@ -1757,8 +2032,8 @@ class PersistentDenseGemmKernel:
1757
2032
  num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2
1758
2033
 
1759
2034
  # Default D stages
1760
- num_d_stage = 2
1761
- num_c_stage = 2 if c_dtype is not None else 0
2035
+ epi_stage = 4 if cute.size(epi_tile[1]) <= 16 else 2
2036
+ epi_c_stage = 0 if c_dtype is None else (4 if cute.size(epi_tile[1]) <= 16 else 2)
1762
2037
 
1763
2038
  # Calculate smem layout and size for one stage of A, B, and C
1764
2039
  a_smem_layout_staged_one = sm100_utils.make_smem_layout_a(
@@ -1773,7 +2048,11 @@ class PersistentDenseGemmKernel:
1773
2048
  b_dtype,
1774
2049
  1, # a tmp 1 stage is provided
1775
2050
  )
1776
- d_smem_layout_staged_one = sm100_utils.make_smem_layout_epi(d_dtype, d_layout, epi_tile, 1)
2051
+ d_smem_layout_staged_one = (
2052
+ sm100_utils.make_smem_layout_epi(d_dtype, d_layout, epi_tile, 1)
2053
+ if d_dtype is not None
2054
+ else None
2055
+ )
1777
2056
  c_smem_layout_staged_one = (
1778
2057
  sm100_utils.make_smem_layout_epi(c_dtype, c_layout, epi_tile, 1)
1779
2058
  if c_dtype is not None
@@ -1796,34 +2075,38 @@ class PersistentDenseGemmKernel:
1796
2075
  ab_bytes_per_stage = cute.size_in_bytes(
1797
2076
  a_dtype, a_smem_layout_staged_one
1798
2077
  ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one)
2078
+ if const_expr(prefetch_A_idx == "varlen_k"): # Need smem to prefetch A indices
2079
+ ab_bytes_per_stage += Int32.width // 8 * cta_tile_shape_mnk[2]
1799
2080
  if const_expr(blockscaled):
1800
2081
  ab_bytes_per_stage += cute.size_in_bytes(
1801
2082
  sf_dtype, sfa_smem_layout_staged_one
1802
2083
  ) + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
1803
2084
  mbar_helpers_bytes = 1024
1804
- d_bytes_per_stage = cute.size_in_bytes(d_dtype, d_smem_layout_staged_one)
1805
- epi_bytes = d_bytes_per_stage * num_d_stage
2085
+ if const_expr(prefetch_A_idx == "varlen_m"):
2086
+ mbar_helpers_bytes += Int32.width // 8 * cta_tile_shape_mnk[0] * 2
2087
+ d_bytes_per_stage = (
2088
+ cute.size_in_bytes(d_dtype, d_smem_layout_staged_one) if d_dtype is not None else 0
2089
+ )
2090
+ epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
2091
+ epilogue_args, cta_tile_shape_mnk, epi_tile
2092
+ )
2093
+ epi_bytes = epi_bytes_per_stage * epi_stage
1806
2094
  if const_expr(c_dtype is not None):
1807
2095
  c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
1808
- epi_bytes += c_bytes_per_stage * num_c_stage
2096
+ epi_bytes += c_bytes_per_stage * epi_c_stage
1809
2097
 
1810
2098
  # Calculate A/B/SFA/SFB stages:
1811
2099
  # Start with total smem per CTA (capacity / occupancy)
1812
2100
  # Subtract reserved bytes and initial C stages bytes
1813
2101
  # Divide remaining by bytes needed per A/B/SFA/SFB stage
1814
- num_ab_stage = (
1815
- smem_capacity // occupancy - (mbar_helpers_bytes + epi_bytes)
1816
- ) // ab_bytes_per_stage
2102
+ remaining_bytes = smem_capacity // occupancy - mbar_helpers_bytes - epi_bytes
2103
+ ab_stage = remaining_bytes // ab_bytes_per_stage
1817
2104
 
1818
2105
  # Refine epilogue stages:
1819
2106
  # Calculate remaining smem after allocating for A/B stages and reserved bytes
1820
2107
  # Add remaining unused smem to epilogue
1821
- num_d_stage += (
1822
- smem_capacity
1823
- - occupancy * ab_bytes_per_stage * num_ab_stage
1824
- - occupancy * (mbar_helpers_bytes + epi_bytes)
1825
- ) // (occupancy * d_bytes_per_stage)
1826
- return num_acc_stage, num_ab_stage, num_d_stage, num_c_stage
2108
+ epi_stage += (remaining_bytes - ab_bytes_per_stage * ab_stage) // (epi_bytes_per_stage)
2109
+ return num_acc_stage, ab_stage, epi_stage, epi_c_stage
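The stage computation above amounts to a simple byte budget: reserve mbarrier/epilogue space, fill what is left with A/B stages, then hand any leftover back to the epilogue as extra stages. A worked example with made-up byte counts:

def compute_stages(smem_capacity, occupancy, ab_bytes_per_stage,
                   mbar_helpers_bytes, epi_bytes_per_stage, epi_stage_init):
    epi_bytes = epi_bytes_per_stage * epi_stage_init
    remaining = smem_capacity // occupancy - mbar_helpers_bytes - epi_bytes
    ab_stage = remaining // ab_bytes_per_stage
    epi_stage = epi_stage_init + (remaining - ab_bytes_per_stage * ab_stage) // epi_bytes_per_stage
    return ab_stage, epi_stage

if __name__ == "__main__":
    # e.g. 228 KiB of smem per CTA, 32 KiB per A+B stage, 16 KiB per epilogue stage
    print(compute_stages(228 * 1024, 1, 32 * 1024, 1024, 16 * 1024, 2))   # (6, 2)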
1827
2110
 
1828
2111
  @staticmethod
1829
2112
  def _compute_num_tmem_alloc_cols(
@@ -1851,9 +2134,12 @@ class PersistentDenseGemmKernel:
1851
2134
 
1852
2135
  @staticmethod
1853
2136
  def is_valid_dtypes(
1854
- ab_dtype: Type[cutlass.Numeric],
2137
+ a_dtype: Type[cutlass.Numeric],
2138
+ b_dtype: Type[cutlass.Numeric],
1855
2139
  acc_dtype: Type[cutlass.Numeric],
1856
- d_dtype: Type[cutlass.Numeric],
2140
+ d_dtype: Optional[Type[cutlass.Numeric]],
2141
+ a_major: str,
2142
+ b_major: str,
1857
2143
  ) -> bool:
1858
2144
  """
1859
2145
  Check if the dtypes are valid
@@ -1869,6 +2155,9 @@ class PersistentDenseGemmKernel:
1869
2155
  :rtype: bool
1870
2156
  """
1871
2157
  is_valid = True
2158
+ if b_dtype != a_dtype:
2159
+ is_valid = False
2160
+ ab_dtype = a_dtype
1872
2161
  if ab_dtype not in {
1873
2162
  cutlass.Float16,
1874
2163
  cutlass.BFloat16,
@@ -1880,18 +2169,18 @@ class PersistentDenseGemmKernel:
1880
2169
  }:
1881
2170
  is_valid = False
1882
2171
  if (
1883
- acc_dtype not in {cutlass.Float32, cutlass.Float16, Int32}
2172
+ acc_dtype not in {Float32, cutlass.Float16, Int32}
1884
2173
  or acc_dtype == cutlass.Float16
1885
2174
  and ab_dtype not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2}
1886
2175
  or acc_dtype == Int32
1887
2176
  and ab_dtype not in {cutlass.Uint8, cutlass.Int8}
1888
2177
  ):
1889
2178
  is_valid = False
1890
- if (
1891
- acc_dtype == cutlass.Float32
2179
+ if d_dtype is not None and (
2180
+ acc_dtype == Float32
1892
2181
  and d_dtype
1893
2182
  not in {
1894
- cutlass.Float32,
2183
+ Float32,
1895
2184
  cutlass.Float16,
1896
2185
  cutlass.BFloat16,
1897
2186
  cutlass.Float8E4M3FN,
@@ -1911,13 +2200,15 @@ class PersistentDenseGemmKernel:
1911
2200
  not in {
1912
2201
  cutlass.BFloat16,
1913
2202
  cutlass.Float16,
1914
- cutlass.Float32,
2203
+ Float32,
1915
2204
  Int32,
1916
2205
  cutlass.Int8,
1917
2206
  cutlass.Uint8,
1918
2207
  }
1919
2208
  ):
1920
2209
  is_valid = False
2210
+ if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
2211
+ is_valid = False
1921
2212
  return is_valid
1922
2213
 
1923
2214
  @staticmethod
@@ -1964,7 +2255,7 @@ class PersistentDenseGemmKernel:
1964
2255
 
1965
2256
  # Check valid d_dtype
1966
2257
  if d_dtype not in {
1967
- cutlass.Float32,
2258
+ Float32,
1968
2259
  cutlass.Float16,
1969
2260
  cutlass.BFloat16,
1970
2261
  cutlass.Float8E5M2,
@@ -1974,37 +2265,8 @@ class PersistentDenseGemmKernel:
1974
2265
 
1975
2266
  return is_valid
1976
2267
 
1977
- @staticmethod
1978
- def is_valid_layouts(
1979
- ab_dtype: Type[cutlass.Numeric],
1980
- a_major: str,
1981
- b_major: str,
1982
- ) -> bool:
1983
- """
1984
- Check if the dtypes and sf_vec_size are valid combinations
1985
-
1986
- :param ab_dtype: The data type of the A and B operands
1987
- :type ab_dtype: Type[cutlass.Numeric]
1988
- :param d_dtype: The data type of the output tensor
1989
- :type d_dtype: Type[cutlass.Numeric]
1990
- :param a_major: The major dimension of the A tensor
1991
- :type a_major: str
1992
- :param b_major: The major dimension of the B tensor
1993
- :type b_major: str
1994
- :param d_major: The major dimension of the C tensor
1995
- :type d_major: str
1996
-
1997
- :return: True if the layouts are valid, False otherwise
1998
- :rtype: bool
1999
- """
2000
- is_valid = True
2001
- if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
2002
- is_valid = False
2003
- return is_valid
2004
-
2005
2268
  @staticmethod
2006
2269
  def is_valid_mma_tiler_and_cluster_shape(
2007
- use_2cta_instrs: bool,
2008
2270
  mma_tiler_mn: Tuple[int, int],
2009
2271
  cluster_shape_mn: Tuple[int, int],
2010
2272
  blockscaled: bool,
@@ -2012,8 +2274,6 @@ class PersistentDenseGemmKernel:
2012
2274
  """
2013
2275
  Check if the mma tiler and cluster shape are valid
2014
2276
 
2015
- :param use_2cta_instrs: Whether to use 2 CTA groups
2016
- :type use_2cta_instrs: bool
2017
2277
  :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
2018
2278
  :type mma_tiler_mn: Tuple[int, int]
2019
2279
  :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
@@ -2024,10 +2284,7 @@ class PersistentDenseGemmKernel:
2024
2284
  """
2025
2285
  is_valid = True
2026
2286
  # Skip invalid mma tile shape
2027
- if not (
2028
- (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128])
2029
- or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256])
2030
- ):
2287
+ if mma_tiler_mn[0] not in [64, 128, 256]:
2031
2288
  is_valid = False
2032
2289
  if not blockscaled:
2033
2290
  if mma_tiler_mn[1] not in range(32, 257, 32):
@@ -2035,9 +2292,6 @@ class PersistentDenseGemmKernel:
2035
2292
  else:
2036
2293
  if mma_tiler_mn[1] not in [128, 256]:
2037
2294
  is_valid = False
2038
- # Skip illegal cluster shape
2039
- if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0:
2040
- is_valid = False
2041
2295
  # Skip invalid cluster shape
2042
2296
  is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0
2043
2297
  if (
@@ -2113,7 +2367,6 @@ class PersistentDenseGemmKernel:
2113
2367
  ab_dtype: Type[cutlass.Numeric],
2114
2368
  acc_dtype: Type[cutlass.Numeric],
2115
2369
  d_dtype: Type[cutlass.Numeric],
2116
- use_2cta_instrs: bool,
2117
2370
  mma_tiler_mn: Tuple[int, int],
2118
2371
  cluster_shape_mn: Tuple[int, int],
2119
2372
  m: int,
@@ -2133,8 +2386,6 @@ class PersistentDenseGemmKernel:
2133
2386
  :type acc_dtype: Type[cutlass.Numeric]
2134
2387
  :param d_dtype: The data type of the output tensor
2135
2388
  :type d_dtype: Type[cutlass.Numeric]
2136
- :param use_2cta_instrs: Whether to use 2 CTA groups
2137
- :type use_2cta_instrs: bool
2138
2389
  :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
2139
2390
  :type mma_tiler_mn: Tuple[int, int]
2140
2391
  :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
@@ -2159,15 +2410,15 @@ class PersistentDenseGemmKernel:
2159
2410
  """
2160
2411
  can_implement = True
2161
2412
  # Skip unsupported types
2162
- if not PersistentDenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, d_dtype):
2413
+ if not GemmSm100.is_valid_dtypes(ab_dtype, ab_dtype, acc_dtype, d_dtype, a_major, b_major):
2163
2414
  can_implement = False
2164
2415
  # Skip invalid mma tile shape and cluster shape
2165
- if not PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape(
2166
- use_2cta_instrs, mma_tiler_mn, cluster_shape_mn, blockscaled=False
2416
+ if not GemmSm100.is_valid_mma_tiler_and_cluster_shape(
2417
+ mma_tiler_mn, cluster_shape_mn, blockscaled=False
2167
2418
  ):
2168
2419
  can_implement = False
2169
2420
  # Skip illegal problem shape for load/store alignment
2170
- if not PersistentDenseGemmKernel.is_valid_tensor_alignment(
2421
+ if not GemmSm100.is_valid_tensor_alignment(
2171
2422
  m, n, k, l, ab_dtype, d_dtype, a_major, b_major, d_major
2172
2423
  ):
2173
2424
  can_implement = False
@@ -2186,7 +2437,6 @@ def run(
2186
2437
  c_major: str,
2187
2438
  mma_tiler_mn: Tuple[int, int] = (256, 256),
2188
2439
  cluster_shape_mn: Tuple[int, int] = (2, 1),
2189
- use_2cta_instrs: bool = True,
2190
2440
  tolerance: float = 1e-01,
2191
2441
  warmup_iterations: int = 0,
2192
2442
  iterations: int = 1,
@@ -2215,9 +2465,6 @@ def run(
     :param cluster_shape_mn: Cluster shape. If not specified in the decorator parameters, the autotuner will use the
         default value of (2, 1). Otherwise, the autotuner will use the value specified in the decorator parameters.
     :type cluster_shape_mn: Tuple[int, int], optional
-    :param use_2cta_instrs: Whether to use 2CTA instructions. If not specified in the decorator parameters, the autotuner
-        will use the default value of True. Otherwise, the autotuner will use the value specified in the decorator parameters.
-    :type use_2cta_instrs: bool, optional
     :param tolerance: Tolerance value for reference validation comparison, defaults to 1e-01
     :type tolerance: float, optional
     :param warmup_iterations: Number of warmup iterations before benchmarking, defaults to 0
@@ -2236,7 +2483,6 @@ def run(
     print(f"AB dtype: {ab_dtype}, C dtype: {d_dtype}, Acc dtype: {acc_dtype}")
     print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {d_major}")
     print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}")
-    print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}")
     print(f"Tolerance: {tolerance}")
     print(f"Warmup iterations: {warmup_iterations}")
     print(f"Iterations: {iterations}")
@@ -2248,11 +2494,10 @@ def run(
     m, n, k, l = mnkl

     # Skip unsupported testcase
-    if not PersistentDenseGemmKernel.can_implement(
+    if not GemmSm100.can_implement(
         ab_dtype,
         acc_dtype,
         d_dtype,
-        use_2cta_instrs,
         mma_tiler_mn,
         cluster_shape_mn,
         m,
@@ -2264,7 +2509,7 @@ def run(
         d_major,
     ):
         raise TypeError(
-            f"Unsupported testcase {ab_dtype}, {acc_dtype}, {d_dtype}, {use_2cta_instrs}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {d_major}"
+            f"Unsupported testcase {ab_dtype}, {acc_dtype}, {d_dtype}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {d_major}"
         )

     if not torch.cuda.is_available():
@@ -2339,12 +2584,8 @@ def run(
         c, mC, c_torch = None, None, None

     # Configure gemm kernel
-    gemm = PersistentDenseGemmKernel(
-        acc_dtype,
-        use_2cta_instrs,
-        mma_tiler_mn,
-        cluster_shape_mn,
-    )
+    cluster_shape_mnk = (*cluster_shape_mn, 1)
+    gemm = GemmSm100(acc_dtype, ab_dtype, mma_tiler_mn, cluster_shape_mnk)

     # Compute max active clusters on current device
     hardware_info = cutlass.utils.HardwareInfo()
@@ -2356,6 +2597,17 @@ def run(
     else:
         tile_count_semaphore = None

+    scheduler_args = TileSchedulerOptions(
+        Int32(max_active_clusters),
+        tile_count_semaphore=make_ptr(
+            Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
+        )
+        if tile_count_semaphore is not None
+        else None,
+    )
+    epi_args = gemm.EpilogueArguments()
+    varlen_args = VarlenArguments()
+
     # Get current CUDA stream from PyTorch
     torch_stream = torch.cuda.current_stream()
     # Get the raw stream pointer as a CUstream
@@ -2367,15 +2619,14 @@ def run(
         mB,
         mD,
         mC,
-        make_ptr(Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
-        if tile_count_semaphore is not None
-        else None,
-        max_active_clusters,
+        epi_args,
+        scheduler_args,
+        varlen_args,
         current_stream,
     )

     if not skip_ref_check:
-        compiled_gemm(mA, mB, mD, mC, tile_count_semaphore, current_stream)
+        compiled_gemm(mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream)
         if ab_dtype in {
             cutlass.Int8,
             cutlass.Uint8,
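Taken together, the last few hunks change the host-side flow: the kernel object is now `GemmSm100(acc_dtype, ab_dtype, mma_tiler_mn, cluster_shape_mnk)` with a 3-D cluster shape, and both the compile call and the compiled kernel take explicit `epi_args`, `scheduler_args`, and `varlen_args` objects instead of a raw tile-count semaphore plus cluster count. A condensed sketch of that flow follows; it assumes the compile step uses `cute.compile` (whose opening line falls outside these hunks), that `GemmSm100`, `TileSchedulerOptions`, `VarlenArguments`, `make_ptr`, and `Int32` are imported as elsewhere in this file, and that the tensors and dtypes come from the surrounding `run()` body:

```python
# Condensed, hedged reconstruction of the new launch sequence from this diff.
cluster_shape_mnk = (*cluster_shape_mn, 1)
gemm = GemmSm100(acc_dtype, ab_dtype, mma_tiler_mn, cluster_shape_mnk)

scheduler_args = TileSchedulerOptions(
    Int32(max_active_clusters),
    # Wrap the optional persistent-scheduler semaphore as a gmem pointer.
    tile_count_semaphore=make_ptr(
        Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
    )
    if tile_count_semaphore is not None
    else None,
)
epi_args = gemm.EpilogueArguments()  # default epilogue arguments
varlen_args = VarlenArguments()      # no variable-length batching in this benchmark

compiled_gemm = cute.compile(
    gemm, mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream
)
compiled_gemm(mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream)
```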
@@ -2393,7 +2644,7 @@ def run(
         gpu_d = d_torch.cpu()

         # Convert ref to c_type
-        if d_dtype == cutlass.Float32:
+        if d_dtype == Float32:
             ref_d = ref
         elif d_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}:
             # m major: (l, n, m) -> (m, n, l)
@@ -2463,7 +2714,9 @@ def run(
         print(f"CuBLAS Average time: {timing_cublas:.3f} ms, TFLOPS: {tflops_cublas:.1f}")

     time.sleep(0.5)
-    fn = lambda: compiled_gemm(mA, mB, mD, mC, tile_count_semaphore, current_stream)
+    fn = lambda: compiled_gemm(
+        mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream
+    )
     timing = do_bench(fn, warmup=warmup, rep=repeats)
     tflops = flops / (timing * 1e9)  # Convert to TFlops
     print(f"Cute-DSL Average time: {timing:.3f} ms, TFLOPS: {tflops:.1f}")
@@ -2505,12 +2758,7 @@ if __name__ == "__main__":
     parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
     parser.add_argument("--d_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
     parser.add_argument("--c_dtype", type=cutlass.dtype, default=None)
-    parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32)
-    parser.add_argument(
-        "--use_2cta_instrs",
-        action="store_true",
-        help="Enable 2CTA MMA instructions feature",
-    )
+    parser.add_argument("--acc_dtype", type=cutlass.dtype, default=Float32)
     parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k")
     parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k")
     parser.add_argument("--d_major", choices=["n", "m"], type=str, default="n")
@@ -2552,7 +2800,6 @@ if __name__ == "__main__":
         args.c_major,
         args.mma_tiler_mn,
         args.cluster_shape_mn,
-        args.use_2cta_instrs,
         args.tolerance,
         args.warmup_iterations,
         args.iterations,
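With the flag removed from both the argument parser and this positional call, invoking the benchmark entry point directly would look roughly as follows. The keyword names are inferred from the signature fragments, prints, and docstring visible in this diff, so treat them as illustrative rather than the definitive `run()` signature:

```python
# Hypothetical direct call of run() after use_2cta_instrs was removed; keyword
# names are inferred from this diff and may not match the real signature exactly.
run(
    mnkl=(8192, 8192, 8192, 1),
    ab_dtype=cutlass.BFloat16,
    d_dtype=cutlass.BFloat16,
    c_dtype=None,
    acc_dtype=cutlass.Float32,
    a_major="k",
    b_major="k",
    d_major="n",
    c_major="n",
    mma_tiler_mn=(256, 256),
    cluster_shape_mn=(2, 1),
    tolerance=1e-01,
    warmup_iterations=0,
    iterations=1,
)
```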