quack-kernels 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,30 +1,5 @@
1
- # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: BSD-3-Clause
3
-
4
- # Redistribution and use in source and binary forms, with or without
5
- # modification, are permitted provided that the following conditions are met:
6
-
7
- # 1. Redistributions of source code must retain the above copyright notice, this
8
- # list of conditions and the following disclaimer.
9
-
10
- # 2. Redistributions in binary form must reproduce the above copyright notice,
11
- # this list of conditions and the following disclaimer in the documentation
12
- # and/or other materials provided with the distribution.
13
-
14
- # 3. Neither the name of the copyright holder nor the names of its
15
- # contributors may be used to endorse or promote products derived from
16
- # this software without specific prior written permission.
17
-
18
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
- # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
- # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
- # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
- # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1
+ # Based on the cute-dsl example:
2
+ # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py
28
3
 
29
4
  import argparse
30
5
  from typing import Optional, Type, Tuple, Union, Callable
@@ -40,15 +15,16 @@ import cutlass.torch as cutlass_torch
40
15
  import cutlass.pipeline as pipeline
41
16
  import cutlass.utils.blackwell_helpers as sm100_utils
42
17
  import cutlass.utils.blockscaled_layout as blockscaled_utils
18
+ from cutlass import Int32, Float32, Boolean, const_expr
19
+ from cutlass.utils import LayoutEnum
43
20
  from cutlass.cute.runtime import from_dlpack, make_ptr
44
- from cutlass import Int32, const_expr
45
21
 
46
- from quack.cute_dsl_utils import ParamsBase
47
- from quack.tile_scheduler import (
48
- TileSchedulerArguments,
49
- TileScheduler,
50
- RasterOrderOption,
51
- )
22
+ from quack.cute_dsl_utils import ParamsBase, ArgumentsBase
23
+ from quack.tile_scheduler import TileSchedulerOptions
24
+ from quack.varlen_utils import VarlenArguments
25
+ from quack.dense_gemm_sm90 import GemmSm90, NamedBarrierGemm
26
+
27
+ # return PipelineStateWAdvance instead of PipelineState
52
28
 
53
29
  """
54
30
  A high-performance persistent batched dense GEMM example for the NVIDIA Blackwell SM100 architecture
@@ -72,8 +48,6 @@ This GEMM works as follows:
72
48
  - Type convert C matrix to output type.
73
49
  - Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations,
74
50
  or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations.
75
- - Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor:
76
- e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0))
77
51
 
78
52
  SM100 tcgen05.mma instructions operate as follows:
79
53
  - Read matrix A from SMEM
@@ -105,7 +79,7 @@ To collect performance with NCU profiler:
105
79
 
106
80
  Constraints are same as dense_gemm.py:
107
81
  * Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2),
108
- see detailed valid dtype combinations in below PersistentDenseGemmKernel class documentation
82
+ see detailed valid dtype combinations in below GemmSm100 class documentation
109
83
  * A/B tensor must have the same data type
110
84
  * Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
111
85
  * Mma tiler N must be 32-256, step 32
@@ -118,14 +92,12 @@ Constraints are same as dense_gemm.py:
118
92
  """
119
93
 
120
94
 
121
- class PersistentDenseGemmKernel:
95
+ class GemmSm100(GemmSm90):
122
96
  """This class implements batched matrix multiplication (C = A x B) with support for various data types
123
97
  and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.
124
98
 
125
99
  :param acc_dtype: Data type for accumulation during computation
126
100
  :type acc_dtype: type[cutlass.Numeric]
127
- :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
128
- :type use_2cta_instrs: bool
129
101
  :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
130
102
  :type mma_tiler_mn: Tuple[int, int]
131
103
  :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
@@ -159,22 +131,27 @@ class PersistentDenseGemmKernel:
159
131
  - Cluster shape M/N must be positive and power of 2, total cluster size <= 16
160
132
 
161
133
  Example:
162
- >>> gemm = PersistentDenseGemmKernel(
163
- ... acc_dtype=cutlass.Float32,
164
- ... use_2cta_instrs=True,
134
+ >>> gemm = GemmSm100(
135
+ ... acc_dtype=Float32,
165
136
  ... mma_tiler_mn=(128, 128),
166
137
  ... cluster_shape_mn=(2, 2)
167
138
  ... )
168
139
  >>> gemm(mA, mB, mD, max_active_clusters, stream)
169
140
  """
170
141
 
142
+ arch = 100
143
+ num_epi_tensormaps = GemmSm90.num_epi_tensormaps
144
+
145
+ EpilogueArguments = GemmSm90.EpilogueArguments
146
+ EpilogueParams = GemmSm90.EpilogueParams
147
+
171
148
  def __init__(
172
149
  self,
173
150
  acc_dtype: Type[cutlass.Numeric],
174
- use_2cta_instrs: bool,
175
151
  mma_tiler_mn: Tuple[int, int],
176
- cluster_shape_mn: Tuple[int, int],
152
+ cluster_shape_mnk: Tuple[int, int, int],
177
153
  sf_vec_size: Optional[int] = None,
154
+ gather_A: bool = False,
178
155
  ):
179
156
  """Initializes the configuration for a Blackwell dense GEMM kernel.
180
157
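The hunks above rename PersistentDenseGemmKernel to GemmSm100 (now a subclass of GemmSm90) and change the constructor: the explicit use_2cta_instrs flag and the 2-element cluster_shape_mn are gone, replaced by a 3-element cluster_shape_mnk and an optional gather_A flag, with use_2cta_instrs now inferred internally. A minimal call-site migration sketch follows; the tile and cluster values are illustrative, not taken from the diff:

    # quack-kernels 0.2.0
    gemm = PersistentDenseGemmKernel(
        acc_dtype=cutlass.Float32,
        use_2cta_instrs=True,
        mma_tiler_mn=(256, 128),
        cluster_shape_mn=(2, 1),
    )

    # quack-kernels 0.2.2: use_2cta_instrs is inferred inside __init__ as
    # cluster_shape_mnk[0] == 2 and mma_tiler_mn[0] in (128, 256)
    gemm = GemmSm100(
        acc_dtype=Float32,
        mma_tiler_mn=(256, 128),
        cluster_shape_mnk=(2, 1, 1),  # cluster K must be 1 (asserted in __init__)
        gather_A=False,               # new flag; requires cluster N == 1 when True
    )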
 
@@ -187,47 +164,42 @@ class PersistentDenseGemmKernel:
187
164
  with cta_group=2 should be used.
188
165
 
189
166
  2. Cluster Shape:
190
- - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.
167
+ - cluster_shape_mnk: The (ClusterM, ClusterN) shape of the CTA cluster.
191
168
 
192
169
  :param acc_dtype: Data type of the accumulator.
193
170
  :type acc_dtype: type[cutlass.Numeric]
194
171
  :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
195
172
  :type mma_tiler_mn: Tuple[int, int]
196
- :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
197
- :type use_2cta_instrs: bool
198
- :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
199
- :type cluster_shape_mn: Tuple[int, int]
173
+ :param cluster_shape_mnk: Tuple (ClusterM, ClusterN, ClusterK) shape of the cluster.
174
+ :type cluster_shape_mnk: Tuple[int, int, int]
200
175
  """
201
176
 
202
177
  self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
203
- self.use_2cta_instrs = use_2cta_instrs
204
- self.cluster_shape_mn = cluster_shape_mn
178
+ self.use_2cta_instrs = cluster_shape_mnk[0] == 2 and mma_tiler_mn[0] in (128, 256)
179
+ self.cluster_shape_mnk = cluster_shape_mnk
180
+ assert cluster_shape_mnk[2] == 1, "Cluster shape K must be 1"
205
181
  # K dimension is deferred in _setup_attributes
206
182
  self.mma_tiler = (*mma_tiler_mn, 1)
207
183
  self.sf_vec_size = sf_vec_size
208
184
  self.blockscaled = sf_vec_size is not None
185
+ self.is_persistent = True
186
+ self.pingpong = False # for compatibility with GemmSm90
187
+ self.gather_A = gather_A
188
+ if gather_A:
189
+ assert cluster_shape_mnk[1] == 1, "Cluster shape N must be 1 for gather A "
209
190
 
210
- self.cta_group = tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE
191
+ self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
211
192
 
212
193
  self.occupancy = 1
213
194
  # Set specialized warp ids
214
- self.epilog_warp_id = (
215
- 0,
216
- 1,
217
- 2,
218
- 3,
219
- )
195
+ self.epilog_warp_id = (0, 1, 2, 3)
220
196
  self.mma_warp_id = 4
221
197
  self.tma_warp_id = 5
222
198
  self.tma_epi_warp_id = 6
199
+ self.num_epi_warps = len(self.epilog_warp_id)
223
200
  self.threads_per_cta = 32 * len(
224
201
  (self.mma_warp_id, self.tma_warp_id, self.tma_epi_warp_id, *self.epilog_warp_id)
225
202
  )
226
- # Set barrier id for cta sync, epilogue sync and tmem ptr sync
227
- self.cta_sync_bar_id = 0
228
- self.epilog_sync_bar_id = 1
229
- self.tmem_ptr_sync_bar_id = 2
230
- self.epilog_load_bar_id = 3
231
203
  self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_100")
232
204
 
233
205
  def _setup_attributes(self):
@@ -261,7 +233,7 @@ class PersistentDenseGemmKernel:
261
233
 
262
234
  # Configure tiled mma
263
235
  if const_expr(not self.blockscaled):
264
- tiled_mma = sm100_utils.make_trivial_tiled_mma(
236
+ self.tiled_mma = sm100_utils.make_trivial_tiled_mma(
265
237
  self.a_dtype,
266
238
  self.a_major_mode,
267
239
  self.b_major_mode,
@@ -269,9 +241,9 @@ class PersistentDenseGemmKernel:
269
241
  self.cta_group,
270
242
  self.mma_tiler[:2],
271
243
  )
272
- tiled_mma_sfb = None
244
+ self.tiled_mma_sfb = None
273
245
  else:
274
- tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
246
+ self.tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
275
247
  self.a_dtype,
276
248
  self.a_major_mode,
277
249
  self.b_major_mode,
@@ -280,13 +252,13 @@ class PersistentDenseGemmKernel:
280
252
  self.cta_group,
281
253
  self.mma_inst_shape_mnk[:2],
282
254
  )
283
- tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
255
+ self.tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
284
256
  self.a_dtype,
285
257
  self.a_major_mode,
286
258
  self.b_major_mode,
287
259
  self.sf_dtype,
288
260
  self.sf_vec_size,
289
- cute.nvgpu.tcgen05.CtaGroup.ONE,
261
+ tcgen05.CtaGroup.ONE,
290
262
  self.mma_inst_shape_mnk_sfb[:2],
291
263
  )
292
264
 
@@ -306,20 +278,20 @@ class PersistentDenseGemmKernel:
306
278
  else:
307
279
  self.mma_tiler_sfb = None
308
280
  self.cta_tile_shape_mnk = (
309
- self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
281
+ self.mma_tiler[0] // cute.size(self.tiled_mma.thr_id.shape),
310
282
  self.mma_tiler[1],
311
283
  self.mma_tiler[2],
312
284
  )
313
285
 
314
286
  # Compute cluster layout
315
287
  self.cluster_layout_vmnk = cute.tiled_divide(
316
- cute.make_layout((*self.cluster_shape_mn, 1)),
317
- (tiled_mma.thr_id.shape,),
288
+ cute.make_layout(self.cluster_shape_mnk),
289
+ (self.tiled_mma.thr_id.shape,),
318
290
  )
319
291
  if const_expr(self.blockscaled):
320
292
  self.cluster_layout_sfb_vmnk = cute.tiled_divide(
321
- cute.make_layout((*self.cluster_shape_mn, 1)),
322
- (tiled_mma_sfb.thr_id.shape,),
293
+ cute.make_layout(self.cluster_shape_mnk),
294
+ (self.tiled_mma_sfb.thr_id.shape,),
323
295
  )
324
296
  else:
325
297
  self.cluster_layout_sfb_vmnk = None
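As a quick sanity check on the arithmetic in the hunk above: the per-CTA tile M is just the MMA tile M divided by the number of CTAs cooperating on one tcgen05 MMA atom (cute.size(self.tiled_mma.thr_id.shape), which is 2 for the 2-CTA variant and 1 otherwise). A plain-Python illustration with assumed tile sizes:

    mma_tiler = (256, 128, 64)   # assumed (M, N, K); in the class, K starts as 1 and is filled in later
    atom_thr_size = 2            # 2-CTA tcgen05 MMA; 1 for the single-CTA variant
    cta_tile_shape_mnk = (mma_tiler[0] // atom_thr_size, mma_tiler[1], mma_tiler[2])
    assert cta_tile_shape_mnk == (128, 128, 64)  # each CTA of the pair owns half of the M extent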
@@ -344,11 +316,11 @@ class PersistentDenseGemmKernel:
344
316
  # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory
345
317
  (
346
318
  self.num_acc_stage,
347
- self.num_ab_stage,
348
- self.num_d_stage,
349
- self.num_c_stage,
319
+ self.ab_stage,
320
+ self.epi_stage,
321
+ self.epi_c_stage,
350
322
  ) = self._compute_stages(
351
- tiled_mma,
323
+ self.tiled_mma,
352
324
  self.mma_tiler,
353
325
  self.a_dtype,
354
326
  self.b_dtype,
@@ -362,35 +334,36 @@ class PersistentDenseGemmKernel:
362
334
  self.smem_capacity,
363
335
  self.occupancy,
364
336
  )
337
+ self.sched_stage = 1 # For compatibility with GemmSm90
365
338
 
366
339
  # Compute A/B/SFA/SFB/C shared memory layout
367
340
  self.a_smem_layout_staged = sm100_utils.make_smem_layout_a(
368
- tiled_mma, self.mma_tiler, self.a_dtype, self.num_ab_stage
341
+ self.tiled_mma, self.mma_tiler, self.a_dtype, self.ab_stage
369
342
  )
370
343
  self.b_smem_layout_staged = sm100_utils.make_smem_layout_b(
371
- tiled_mma, self.mma_tiler, self.b_dtype, self.num_ab_stage
344
+ self.tiled_mma, self.mma_tiler, self.b_dtype, self.ab_stage
372
345
  )
373
- self.d_smem_layout_staged = sm100_utils.make_smem_layout_epi(
374
- self.d_dtype, self.d_layout, self.epi_tile, self.num_d_stage
346
+ self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi(
347
+ self.d_dtype, self.d_layout, self.epi_tile, self.epi_stage
375
348
  )
376
349
  if const_expr(self.c_dtype is not None):
377
350
  self.epi_c_smem_layout_staged = sm100_utils.make_smem_layout_epi(
378
- self.c_dtype, self.c_layout, self.epi_tile, self.num_c_stage
351
+ self.c_dtype, self.c_layout, self.epi_tile, self.epi_c_stage
379
352
  )
380
353
  else:
381
354
  self.epi_c_smem_layout_staged = None
382
355
  if const_expr(self.blockscaled):
383
356
  self.sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
384
- tiled_mma,
357
+ self.tiled_mma,
385
358
  self.mma_tiler,
386
359
  self.sf_vec_size,
387
- self.num_ab_stage,
360
+ self.ab_stage,
388
361
  )
389
362
  self.sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
390
- tiled_mma,
363
+ self.tiled_mma,
391
364
  self.mma_tiler,
392
365
  self.sf_vec_size,
393
- self.num_ab_stage,
366
+ self.ab_stage,
394
367
  )
395
368
  else:
396
369
  self.sfa_smem_layout_staged, self.sfb_smem_layout_staged = None, None
@@ -398,7 +371,7 @@ class PersistentDenseGemmKernel:
398
371
  # Compute the number of tensor memory allocation columns
399
372
  if const_expr(not self.blockscaled):
400
373
  self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols(
401
- tiled_mma, self.mma_tiler, self.num_acc_stage
374
+ self.tiled_mma, self.mma_tiler, self.num_acc_stage
402
375
  )
403
376
  else:
404
377
  SM100_TMEM_CAPACITY_COLUMNS = 512
@@ -409,14 +382,14 @@ class PersistentDenseGemmKernel:
409
382
  self,
410
383
  mA: cute.Tensor,
411
384
  mB: cute.Tensor,
412
- mD: cute.Tensor,
385
+ mD: Optional[cute.Tensor],
413
386
  mC: Optional[cute.Tensor],
414
- tile_count_semaphore: Optional[cute.Pointer],
415
- max_active_clusters: cutlass.Constexpr,
387
+ epilogue_args: ArgumentsBase,
388
+ scheduler_args: TileSchedulerOptions,
389
+ varlen_args: Optional[VarlenArguments],
416
390
  stream: cuda.CUstream,
417
391
  mSFA: Optional[cute.Tensor] = None,
418
392
  mSFB: Optional[cute.Tensor] = None,
419
- epilogue_op: cutlass.Constexpr = lambda x: x,
420
393
  ):
421
394
  """Execute the GEMM operation in steps:
422
395
  - Setup static attributes before smem/grid/tma computation
@@ -435,30 +408,46 @@ class PersistentDenseGemmKernel:
435
408
  :type max_active_clusters: cutlass.Constexpr
436
409
  :param stream: CUDA stream for asynchronous execution
437
410
  :type stream: cuda.CUstream
438
- :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
439
- :type epilogue_op: cutlass.Constexpr
440
411
  :raises TypeError: If input data types are incompatible with the MMA instruction.
441
412
  :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
442
413
  """
443
414
  if const_expr(self.blockscaled):
444
415
  assert mSFA is not None and mSFB is not None
445
416
  # Setup static attributes before smem/grid/tma computation
446
- self.a_dtype: Type[cutlass.Numeric] = mA.element_type
447
- self.b_dtype: Type[cutlass.Numeric] = mB.element_type
448
- self.d_dtype: Type[cutlass.Numeric] = mD.element_type
417
+ self.a_dtype = mA.element_type
418
+ self.b_dtype = mB.element_type
419
+ self.d_dtype = mD.element_type if mD is not None else None
449
420
  self.c_dtype = mC.element_type if mC is not None else None
450
421
  self.sf_dtype: Optional[Type[cutlass.Numeric]] = (
451
422
  mSFA.element_type if mSFA is not None else None
452
423
  )
453
- self.a_major_mode = cutlass.utils.LayoutEnum.from_tensor(mA).mma_major_mode()
454
- self.b_major_mode = cutlass.utils.LayoutEnum.from_tensor(mB).mma_major_mode()
455
- self.d_layout = cutlass.utils.LayoutEnum.from_tensor(mD)
456
- self.c_layout = cutlass.utils.LayoutEnum.from_tensor(mC) if mC is not None else None
424
+ self.a_layout = LayoutEnum.from_tensor(mA)
425
+ self.b_layout = LayoutEnum.from_tensor(mB)
426
+ self.d_layout = LayoutEnum.from_tensor(mD) if mD is not None else None
427
+ self.c_layout = LayoutEnum.from_tensor(mC) if mC is not None else None
428
+ self.a_major_mode = LayoutEnum.from_tensor(mA).mma_major_mode()
429
+ self.b_major_mode = LayoutEnum.from_tensor(mB).mma_major_mode()
457
430
 
458
431
  # Check if input data types are compatible with MMA instruction
459
432
  if const_expr(self.a_dtype != self.b_dtype):
460
433
  raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}")
461
434
 
435
+ if const_expr(varlen_args is None):
436
+ varlen_args = VarlenArguments()
437
+ assert (varlen_args.mAIdx is not None) == self.gather_A
438
+
439
+ # Assume all strides are divisible by 128 bits except the last stride
440
+ new_stride = lambda t: tuple(
441
+ cute.assume(s, divby=128 // t.element_type.width) if not cute.is_static(s) else s
442
+ for s in t.stride
443
+ )
444
+ mA, mD = [
445
+ cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
446
+ if t is not None
447
+ else None
448
+ for t in (mA, mD)
449
+ ]
450
+
462
451
  # Setup attributes that dependent on gemm inputs
463
452
  self._setup_attributes()
464
453
 
@@ -471,67 +460,44 @@ class PersistentDenseGemmKernel:
471
460
  sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(mB.shape, self.sf_vec_size)
472
461
  mSFB = cute.make_tensor(mSFB.iterator, sfb_layout)
473
462
 
474
- if const_expr(not self.blockscaled):
475
- tiled_mma = sm100_utils.make_trivial_tiled_mma(
476
- self.a_dtype,
477
- self.a_major_mode,
478
- self.b_major_mode,
479
- self.acc_dtype,
480
- self.cta_group,
481
- self.mma_tiler[:2],
482
- )
483
- tiled_mma_sfb = None
484
- else:
485
- tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
486
- self.a_dtype,
487
- self.a_major_mode,
488
- self.b_major_mode,
489
- self.sf_dtype,
490
- self.sf_vec_size,
491
- self.cta_group,
492
- self.mma_inst_shape_mnk[:2],
493
- )
494
- tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
495
- self.a_dtype,
496
- self.a_major_mode,
497
- self.b_major_mode,
498
- self.sf_dtype,
499
- self.sf_vec_size,
500
- cute.nvgpu.tcgen05.CtaGroup.ONE,
501
- self.mma_inst_shape_mnk_sfb[:2],
502
- )
503
- atom_thr_size = cute.size(tiled_mma.thr_id.shape)
463
+ atom_thr_size = cute.size(self.tiled_mma.thr_id.shape)
504
464
 
505
- # Setup TMA load for A
506
- a_op = sm100_utils.cluster_shape_to_tma_atom_A(self.cluster_shape_mn, tiled_mma.thr_id)
465
+ # Setup TMA load for A & B
507
466
  a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0))
508
- tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
509
- a_op,
510
- mA,
511
- a_smem_layout,
512
- self.mma_tiler,
513
- tiled_mma,
514
- self.cluster_layout_vmnk.shape,
515
- internal_type=(cutlass.TFloat32 if mA.element_type is cutlass.Float32 else None),
516
- )
517
-
518
- # Setup TMA load for B
519
- b_op = sm100_utils.cluster_shape_to_tma_atom_B(self.cluster_shape_mn, tiled_mma.thr_id)
520
467
  b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0))
468
+ tma_atom_a, tma_tensor_a = None, None
469
+ if const_expr(not self.gather_A):
470
+ a_op = sm100_utils.cluster_shape_to_tma_atom_A(
471
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
472
+ )
473
+ tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
474
+ a_op,
475
+ mA,
476
+ a_smem_layout,
477
+ self.mma_tiler,
478
+ self.tiled_mma,
479
+ self.cluster_layout_vmnk.shape,
480
+ internal_type=(cutlass.TFloat32 if mA.element_type is Float32 else None),
481
+ )
482
+ b_op = sm100_utils.cluster_shape_to_tma_atom_B(
483
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
484
+ )
521
485
  tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
522
486
  b_op,
523
487
  mB,
524
488
  b_smem_layout,
525
489
  self.mma_tiler,
526
- tiled_mma,
490
+ self.tiled_mma,
527
491
  self.cluster_layout_vmnk.shape,
528
- internal_type=(cutlass.TFloat32 if mB.element_type is cutlass.Float32 else None),
492
+ internal_type=(cutlass.TFloat32 if mB.element_type is Float32 else None),
529
493
  )
530
494
 
495
+ tma_atom_sfa, tma_tensor_sfa = None, None
496
+ tma_atom_sfb, tma_tensor_sfb = None, None
531
497
  if const_expr(self.blockscaled):
532
498
  # Setup TMA load for SFA
533
499
  sfa_op = sm100_utils.cluster_shape_to_tma_atom_A(
534
- self.cluster_shape_mn, tiled_mma.thr_id
500
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
535
501
  )
536
502
  sfa_smem_layout = cute.slice_(self.sfa_smem_layout_staged, (None, None, None, 0))
537
503
  tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
@@ -539,13 +505,13 @@ class PersistentDenseGemmKernel:
539
505
  mSFA,
540
506
  sfa_smem_layout,
541
507
  self.mma_tiler,
542
- tiled_mma,
508
+ self.tiled_mma,
543
509
  self.cluster_layout_vmnk.shape,
544
510
  internal_type=cutlass.Int16,
545
511
  )
546
512
  # Setup TMA load for SFB
547
513
  sfb_op = sm100_utils.cluster_shape_to_tma_atom_SFB(
548
- self.cluster_shape_mn, tiled_mma.thr_id
514
+ self.cluster_shape_mnk, self.tiled_mma.thr_id
549
515
  )
550
516
  sfb_smem_layout = cute.slice_(self.sfb_smem_layout_staged, (None, None, None, 0))
551
517
  tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B(
@@ -553,30 +519,31 @@ class PersistentDenseGemmKernel:
553
519
  mSFB,
554
520
  sfb_smem_layout,
555
521
  self.mma_tiler_sfb,
556
- tiled_mma_sfb,
522
+ self.tiled_mma_sfb,
557
523
  self.cluster_layout_sfb_vmnk.shape,
558
524
  internal_type=cutlass.Int16,
559
525
  )
560
- else:
561
- tma_atom_sfa, tma_tensor_sfa = None, None
562
- tma_atom_sfb, tma_tensor_sfb = None, None
563
526
 
564
- a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout)
565
- b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout)
566
- self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size
527
+ self.num_tma_load_bytes = cute.size_in_bytes(self.b_dtype, b_smem_layout)
528
+ if const_expr(not self.gather_A):
529
+ self.num_tma_load_bytes += cute.size_in_bytes(self.a_dtype, a_smem_layout)
567
530
  if const_expr(self.blockscaled):
568
531
  sfa_copy_size = cute.size_in_bytes(self.sf_dtype, sfa_smem_layout)
569
532
  sfb_copy_size = cute.size_in_bytes(self.sf_dtype, sfb_smem_layout)
570
- self.num_tma_load_bytes += (sfa_copy_size + sfb_copy_size) * atom_thr_size
533
+ self.num_tma_load_bytes += sfa_copy_size + sfb_copy_size
534
+ self.num_tma_load_bytes *= atom_thr_size
571
535
 
572
536
  # Setup TMA store for D
573
- epi_smem_layout = cute.slice_(self.d_smem_layout_staged, (None, None, 0))
574
- tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
575
- cpasync.CopyBulkTensorTileS2GOp(),
576
- mD,
577
- epi_smem_layout,
578
- self.epi_tile,
579
- )
537
+ tma_atom_d, tma_tensor_d = None, None
538
+ if const_expr(mD is not None):
539
+ epi_smem_layout = cute.slice_(self.epi_smem_layout_staged, (None, None, 0))
540
+ tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
541
+ cpasync.CopyBulkTensorTileS2GOp(),
542
+ mD,
543
+ epi_smem_layout,
544
+ self.epi_tile,
545
+ )
546
+ tma_atom_c, tma_tensor_c = None, None
580
547
  if const_expr(mC is not None):
581
548
  epi_c_smem_layout = cute.slice_(self.epi_c_smem_layout_staged, (None, None, 0))
582
549
  tma_atom_c, tma_tensor_c = cpasync.make_tiled_tma_atom(
@@ -585,26 +552,19 @@ class PersistentDenseGemmKernel:
585
552
  epi_c_smem_layout,
586
553
  self.epi_tile,
587
554
  )
588
- else:
589
- tma_atom_c, tma_tensor_c = None, None
590
555
 
591
- problem_shape_ntile_mnl = cute.ceil_div(mD.shape[:2], self.cta_tile_shape_mnk[:2]) + (
592
- mD.shape[2],
593
- )
594
- TileSchedulerCls = TileScheduler
595
- tile_sched_args = TileSchedulerArguments(
596
- problem_shape_ntile_mnl=problem_shape_ntile_mnl,
597
- raster_order=RasterOrderOption.Heuristic,
598
- group_size=8,
599
- cluster_shape_mnk=(*self.cluster_shape_mn, 1),
600
- tile_count_semaphore=tile_count_semaphore,
601
- is_persistent=True,
602
- )
556
+ epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
557
+
558
+ TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
559
+ tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
603
560
  tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
604
- grid = TileSchedulerCls.get_grid_shape(tile_sched_params, max_active_clusters)
561
+ grid = TileSchedulerCls.get_grid_shape(
562
+ tile_sched_params, scheduler_args.max_active_clusters
563
+ )
605
564
 
606
565
  self.buffer_align_bytes = 1024
607
566
 
567
+ epi_smem_size = cute.cosize(self.epi_smem_layout_staged) if mD is not None else 0
608
568
  epi_c_smem_size = cute.cosize(self.epi_c_smem_layout_staged) if mC is not None else 0
609
569
  sf_dtype = self.sf_dtype if const_expr(self.blockscaled) else cutlass.Float8E8M0FNU
610
570
  sfa_smem_size = (
@@ -617,18 +577,18 @@ class PersistentDenseGemmKernel:
617
577
  # Define shared storage for kernel
618
578
  @cute.struct
619
579
  class SharedStorage:
620
- ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
621
- ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
622
- epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.num_c_stage * 2]
623
- acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
624
- acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
580
+ ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
581
+ epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
582
+ acc_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
583
+ sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
584
+ tile_count: cute.struct.MemRange[cutlass.Int32, self.sched_stage]
625
585
  tmem_dealloc_mbar_ptr: cutlass.Int64
626
586
  tmem_holding_buf: Int32
627
- sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, 2]
628
- tile_count: cute.struct.MemRange[cutlass.Int32, 1]
629
587
  # (EPI_TILE_M, EPI_TILE_N, STAGE)
630
588
  sD: cute.struct.Align[
631
- cute.struct.MemRange[self.d_dtype, cute.cosize(self.d_smem_layout_staged.outer)],
589
+ cute.struct.MemRange[
590
+ self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
591
+ ],
632
592
  self.buffer_align_bytes,
633
593
  ]
634
594
  sC: cute.struct.Align[
@@ -637,6 +597,7 @@ class PersistentDenseGemmKernel:
637
597
  ],
638
598
  self.buffer_align_bytes,
639
599
  ]
600
+ epi: self.epi_get_smem_struct(epilogue_params)
640
601
  # (MMA, MMA_M, MMA_K, STAGE)
641
602
  sA: cute.struct.Align[
642
603
  cute.struct.MemRange[self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer)],
@@ -662,10 +623,10 @@ class PersistentDenseGemmKernel:
662
623
 
663
624
  # Launch the kernel synchronously
664
625
  self.kernel(
665
- tiled_mma,
666
- tiled_mma_sfb,
626
+ self.tiled_mma,
627
+ self.tiled_mma_sfb,
667
628
  tma_atom_a,
668
- tma_tensor_a,
629
+ tma_tensor_a if const_expr(not self.gather_A) else mA,
669
630
  tma_atom_b,
670
631
  tma_tensor_b,
671
632
  tma_atom_sfa,
@@ -676,24 +637,29 @@ class PersistentDenseGemmKernel:
676
637
  tma_tensor_d,
677
638
  tma_atom_c,
678
639
  tma_tensor_c,
640
+ epilogue_params,
641
+ varlen_args.mCuSeqlensM,
642
+ varlen_args.mCuSeqlensK,
643
+ varlen_args.mTensormaps,
644
+ varlen_args.mAIdx,
679
645
  self.cluster_layout_vmnk,
680
646
  self.cluster_layout_sfb_vmnk,
681
647
  self.a_smem_layout_staged,
682
648
  self.b_smem_layout_staged,
683
649
  self.sfa_smem_layout_staged,
684
650
  self.sfb_smem_layout_staged,
685
- self.d_smem_layout_staged,
651
+ self.epi_smem_layout_staged,
686
652
  self.epi_c_smem_layout_staged,
687
653
  self.epi_tile,
688
654
  tile_sched_params,
689
655
  TileSchedulerCls,
690
- epilogue_op,
691
656
  ).launch(
692
657
  grid=grid,
693
658
  block=[self.threads_per_cta, 1, 1],
694
- cluster=(*self.cluster_shape_mn, 1),
659
+ cluster=self.cluster_shape_mnk,
695
660
  smem=self.shared_storage.size_in_bytes(),
696
661
  stream=stream,
662
+ min_blocks_per_mp=1,
697
663
  )
698
664
  return
699
665
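Putting the __call__ changes from the hunks above together, a call site roughly migrates as sketched below. Only the parameter order comes from the diff; how the EpilogueArguments and TileSchedulerOptions objects are constructed is not shown in this diff, so those lines are placeholders:

    # quack-kernels 0.2.0
    gemm(mA, mB, mD, mC, tile_count_semaphore, max_active_clusters, stream)

    # quack-kernels 0.2.2 (sketch)
    gemm(
        mA, mB, mD, mC,      # mD may now be None (no D output)
        epilogue_args,       # e.g. a GemmSm100.EpilogueArguments instance (construction not shown here)
        scheduler_args,      # a TileSchedulerOptions; carries max_active_clusters
        None,                # varlen_args -> defaults to VarlenArguments() inside __call__
        stream,
        # mSFA=..., mSFB=...  still keyword-optional for the blockscaled path
    )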
 
@@ -703,7 +669,7 @@ class PersistentDenseGemmKernel:
703
669
  self,
704
670
  tiled_mma: cute.TiledMma,
705
671
  tiled_mma_sfb: Optional[cute.TiledMma],
706
- tma_atom_a: cute.CopyAtom,
672
+ tma_atom_a: Optional[cute.CopyAtom],
707
673
  mA_mkl: cute.Tensor,
708
674
  tma_atom_b: cute.CopyAtom,
709
675
  mB_nkl: cute.Tensor,
@@ -712,37 +678,54 @@ class PersistentDenseGemmKernel:
712
678
  tma_atom_sfb: Optional[cute.CopyAtom],
713
679
  mSFB_nkl: Optional[cute.Tensor],
714
680
  tma_atom_d: Optional[cute.CopyAtom],
715
- mD_mnl: cute.Tensor,
681
+ mD_mnl: Optional[cute.Tensor],
716
682
  tma_atom_c: Optional[cute.CopyAtom],
717
683
  mC_mnl: Optional[cute.Tensor],
684
+ epilogue_params: ParamsBase,
685
+ cu_seqlens_m: Optional[cute.Tensor],
686
+ cu_seqlens_k: Optional[cute.Tensor],
687
+ tensormaps: Optional[cute.Tensor],
688
+ mAIdx: Optional[cute.Tensor],
718
689
  cluster_layout_vmnk: cute.Layout,
719
690
  cluster_layout_sfb_vmnk: Optional[cute.Layout],
720
- a_smem_layout_staged: cute.ComposedLayout,
721
- b_smem_layout_staged: cute.ComposedLayout,
722
- sfa_smem_layout_staged: Optional[cute.Layout],
723
- sfb_smem_layout_staged: Optional[cute.Layout],
724
- d_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
725
- epi_c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
691
+ a_smem_layout: cute.ComposedLayout,
692
+ b_smem_layout: cute.ComposedLayout,
693
+ sfa_smem_layout: Optional[cute.Layout],
694
+ sfb_smem_layout: Optional[cute.Layout],
695
+ epi_smem_layout: Union[cute.Layout, cute.ComposedLayout, None],
696
+ epi_c_smem_layout: Union[cute.Layout, cute.ComposedLayout, None],
726
697
  epi_tile: cute.Tile,
727
698
  tile_sched_params: ParamsBase,
728
699
  TileSchedulerCls: cutlass.Constexpr[Callable],
729
- epilogue_op: cutlass.Constexpr[Callable],
730
700
  ):
731
701
  """
732
702
  GPU device kernel performing the Persistent batched GEMM computation.
733
703
  """
704
+
705
+ varlen_m = const_expr(cu_seqlens_m is not None)
706
+ varlen_k = const_expr(cu_seqlens_k is not None)
707
+ assert not (varlen_m and varlen_k)
708
+ if const_expr(self.gather_A):
709
+ assert varlen_m or varlen_k
710
+ has_D = const_expr(mD_mnl is not None)
711
+ has_C = const_expr(mC_mnl is not None)
712
+
734
713
  warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
735
714
 
736
- #
737
- # Prefetch tma desc
738
- #
715
+ # /////////////////////////////////////////////////////////////////////////////
716
+ # Prefetch Tma desc
717
+ # /////////////////////////////////////////////////////////////////////////////
739
718
  if warp_idx == self.tma_warp_id:
740
- cpasync.prefetch_descriptor(tma_atom_a)
741
- cpasync.prefetch_descriptor(tma_atom_b)
742
- if const_expr(self.blockscaled):
743
- cpasync.prefetch_descriptor(tma_atom_sfa)
744
- cpasync.prefetch_descriptor(tma_atom_sfb)
745
- cpasync.prefetch_descriptor(tma_atom_d)
719
+ for tma_atom in (
720
+ tma_atom_a,
721
+ tma_atom_b,
722
+ tma_atom_sfa,
723
+ tma_atom_sfb,
724
+ tma_atom_d,
725
+ tma_atom_c,
726
+ ):
727
+ if const_expr(tma_atom is not None):
728
+ cpasync.prefetch_descriptor(tma_atom)
746
729
 
747
730
  use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2
748
731
 
@@ -754,13 +737,6 @@ class PersistentDenseGemmKernel:
754
737
  mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
755
738
  is_leader_cta = mma_tile_coord_v == 0
756
739
  cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
757
- block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
758
- if const_expr(self.blockscaled):
759
- block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord(
760
- cta_rank_in_cluster
761
- )
762
- else:
763
- block_in_cluster_coord_sfb_vmnk = None
764
740
  # Coord inside cta
765
741
  tidx, _, _ = cute.arch.thread_idx()
766
742
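As a side note on the unchanged context lines here: with the 2-CTA MMA variant, each pair of CTAs elects a leader purely from the block index (mma_tile_coord_v = bidx % 2, leader when it equals 0); later in the diff the is_leader_cta guards show that only the leader drives the AB pipeline waits and MMA issue. A tiny illustration, with the thr_id size assumed:

    thr_id_size = 2                     # 2-CTA tcgen05 MMA; would be 1 for the single-CTA variant
    for bidx in range(4):
        is_leader_cta = (bidx % thr_id_size) == 0
        print(bidx, is_leader_cta)      # blocks 0 and 2 lead their pairs, 1 and 3 follow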
 
@@ -779,100 +755,53 @@ class PersistentDenseGemmKernel:
779
755
  num_tmem_dealloc_threads = 32
780
756
  cute.arch.mbarrier_init(tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads)
781
757
 
782
- # Initialize mainloop ab_pipeline (barrier) and states
783
- ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
784
- num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
785
- ab_pipeline_consumer_group = pipeline.CooperativeGroup(
786
- pipeline.Agent.Thread, num_tma_producer
787
- )
788
- ab_pipeline = pipeline.PipelineTmaUmma.create(
789
- barrier_storage=storage.ab_full_mbar_ptr.data_ptr(),
790
- num_stages=self.num_ab_stage,
791
- producer_group=ab_pipeline_producer_group,
792
- consumer_group=ab_pipeline_consumer_group,
793
- tx_count=self.num_tma_load_bytes,
794
- cta_layout_vmnk=cluster_layout_vmnk,
758
+ # Initialize pipelines and states
759
+ ab_pipeline = self.make_ab_pipeline(
760
+ tiled_mma=tiled_mma,
761
+ cluster_layout_vmnk=cluster_layout_vmnk,
762
+ ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
795
763
  )
796
-
797
- if const_expr(mC_mnl is not None):
798
- # Threads/warps participating in this pipeline
799
- epi_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
800
- # Each warp will contribute 1 to the arrive count
801
- consumer_arrive_cnt = len(self.epilog_warp_id)
802
- epi_pipeline_consumer_group = pipeline.CooperativeGroup(
803
- pipeline.Agent.Thread, consumer_arrive_cnt
804
- )
805
- c_smem_layout = cute.slice_(epi_c_smem_layout_staged, (None, None, 0))
806
- tma_copy_c_bytes = cute.size_in_bytes(self.c_dtype, c_smem_layout)
807
- epi_pipeline = pipeline.PipelineTmaAsync.create(
808
- barrier_storage=storage.epi_pipeline_array_ptr.data_ptr(),
809
- num_stages=self.num_c_stage,
810
- producer_group=epi_pipeline_producer_group,
811
- consumer_group=epi_pipeline_consumer_group,
812
- tx_count=tma_copy_c_bytes,
764
+ epi_pipeline = None
765
+ if const_expr(has_C):
766
+ epi_pipeline = self.make_epi_pipeline(
767
+ c_smem_layout=cute.slice_(epi_c_smem_layout, (None, None, 0)),
768
+ epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
813
769
  )
814
- else:
815
- epi_pipeline = None
816
-
817
- # Initialize acc_pipeline (barrier) and states
818
- acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
819
- num_acc_consumer_threads = len(self.epilog_warp_id) * (2 if use_2cta_instrs else 1)
820
- acc_pipeline_consumer_group = pipeline.CooperativeGroup(
821
- pipeline.Agent.Thread, num_acc_consumer_threads
822
- )
823
- acc_pipeline = pipeline.PipelineUmmaAsync.create(
824
- barrier_storage=storage.acc_full_mbar_ptr.data_ptr(),
825
- num_stages=self.num_acc_stage,
826
- producer_group=acc_pipeline_producer_group,
827
- consumer_group=acc_pipeline_consumer_group,
828
- cta_layout_vmnk=cluster_layout_vmnk,
770
+ acc_pipeline = self.make_acc_pipeline(
771
+ cluster_layout_vmnk=cluster_layout_vmnk,
772
+ acc_pipeline_mbar_ptr=storage.acc_pipeline_array_ptr.data_ptr(),
829
773
  )
830
-
831
- # if const_expr(tile_sched_params.tile_count_semaphore is not None):
832
- # # Dynamic persistent scheduler
833
- # # Threads/warps participating in this pipeline
834
- # sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
835
- # cluster_size = cute.size(cluster_layout_vmnk)
836
- # # Each warp that are not the scheduler warp will contribute 1 to the arrive count
837
- # consumer_arrive_cnt = (
838
- # (self.mma_warp_groups if not self.pingpong else 1) * 4 + self.num_ab_load_warps
839
- # ) * cluster_size - 1
840
- # sched_pipeline_consumer_group = pipeline.CooperativeGroup(
841
- # pipeline.Agent.Thread, consumer_arrive_cnt
842
- # )
843
- # sched_pipeline = pipeline.PipelineAsync.create(
844
- # barrier_storage=storage.sched_pipeline_array_ptr.data_ptr(),
845
- # num_stages=self.sched_stage,
846
- # producer_group=sched_pipeline_producer_group,
847
- # consumer_group=sched_pipeline_consumer_group,
848
- # # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
849
- # consumer_mask=None if const_expr(cute.size(cluster_layout_mnk) == 1) else 0,
850
- # )
851
- # tile_count = storage.tile_count.get_tensor((self.sched_stage,))
852
- # else:
853
- # sched_pipeline = None
854
- # tile_count = None
774
+ sched_pipeline = None
775
+ tile_count = None
776
+ if const_expr(tile_sched_params.tile_count_semaphore is not None):
777
+ # TODO: Untested, not sure if this is right for Sm100
778
+ # Dynamic persistent scheduler
779
+ sched_pipeline = self.make_sched_pipeline(
780
+ self.cluster_shape_mnk,
781
+ sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
782
+ varlen_k=varlen_k,
783
+ )
784
+ tile_count = storage.tile_count.get_tensor((self.sched_stage,))
855
785
 
856
786
  # Setup smem tensor A/B/D
857
787
  # (MMA, MMA_M, MMA_K, STAGE)
858
- sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
788
+ sA = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
859
789
  # (MMA, MMA_N, MMA_K, STAGE)
860
- sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
790
+ sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
791
+ sSFA, sSFB = None, None
861
792
  if const_expr(self.blockscaled):
862
793
  # (MMA, MMA_M, MMA_K, STAGE)
863
- sSFA = storage.sSFA.get_tensor(sfa_smem_layout_staged)
794
+ sSFA = storage.sSFA.get_tensor(sfa_smem_layout)
864
795
  # (MMA, MMA_N, MMA_K, STAGE)
865
- sSFB = storage.sSFB.get_tensor(sfb_smem_layout_staged)
866
- else:
867
- sSFA, sSFB = None, None
868
- # (EPI_TILE_M, EPI_TILE_N, STAGE)
869
- sD = storage.sD.get_tensor(d_smem_layout_staged.outer, swizzle=d_smem_layout_staged.inner)
870
- if const_expr(mC_mnl is not None):
871
- sC = storage.sC.get_tensor(
872
- epi_c_smem_layout_staged.outer, swizzle=epi_c_smem_layout_staged.inner
873
- )
874
- else:
875
- sC = None
796
+ sSFB = storage.sSFB.get_tensor(sfb_smem_layout)
797
+ sD = None
798
+ if const_expr(has_D):
799
+ # (EPI_TILE_M, EPI_TILE_N, STAGE)
800
+ sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
801
+ sC = None
802
+ if const_expr(has_C):
803
+ sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
804
+ epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)
876
805
 
877
806
  thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
878
807
  thr_mma_sfb = (
@@ -884,26 +813,51 @@ class PersistentDenseGemmKernel:
884
813
  # (MMA, MMA_M, MMA_N, STAGE)
885
814
  tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, self.num_acc_stage))
886
815
 
887
- tmem_ptr_read_threads = cute.arch.WARP_SIZE * len((self.mma_warp_id, *self.epilog_warp_id))
888
- tmem_alloc_barrier = pipeline.NamedBarrier(
889
- barrier_id=self.tmem_ptr_sync_bar_id, num_threads=tmem_ptr_read_threads
816
+ # Get tensormap buffer address
817
+ tensormap_manager, tensormap_ab_ptrs, tensormap_d_ptr, tensormap_epi_ptrs = (
818
+ self.tensormap_init(tensormaps, varlen_m, varlen_k, has_D, warp_idx)
890
819
  )
891
820
 
892
- TileSchedulerCls = partial(TileSchedulerCls.create, tile_sched_params)
893
- k_tile_cnt = cute.ceil_div(cute.size(mA_mkl.shape[1]), self.mma_tiler[2])
821
+ TileSchedulerCls = partial(
822
+ TileSchedulerCls.create, tile_sched_params, tile_count, sched_pipeline
823
+ )
894
824
 
895
- if const_expr(mC_mnl is not None):
825
+ tmem_alloc_barrier = pipeline.NamedBarrier(
826
+ barrier_id=int(NamedBarrierGemm.TmemPtr),
827
+ num_threads=cute.arch.WARP_SIZE * len((self.mma_warp_id, *self.epilog_warp_id)),
828
+ )
829
+ epi_load_barrier = None
830
+ if const_expr(has_C):
896
831
  epi_load_barrier = pipeline.NamedBarrier(
897
- barrier_id=int(self.epilog_load_bar_id), num_threads=2 * cute.arch.WARP_SIZE
832
+ barrier_id=int(NamedBarrierGemm.EpilogueLoad), num_threads=2 * cute.arch.WARP_SIZE
898
833
  )
899
- else:
900
- epi_load_barrier = None
901
834
 
902
835
  #
903
836
  # Specialized TMA load warp
904
837
  #
905
838
  if warp_idx == self.tma_warp_id:
839
+ if const_expr(varlen_k):
840
+ # initialize tensormap for A & B
841
+ if const_expr(not self.gather_A):
842
+ tensormap_manager.init_tensormap_from_atom(
843
+ tma_atom_a,
844
+ tensormap_ab_ptrs[0],
845
+ is_manager_warp=True,
846
+ )
847
+ tensormap_manager.init_tensormap_from_atom(
848
+ tma_atom_b,
849
+ tensormap_ab_ptrs[1],
850
+ is_manager_warp=True,
851
+ )
906
852
  # Compute multicast mask for A/B buffer full
853
+ block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
854
+ block_in_cluster_coord_sfb_vmnk = None
855
+ if const_expr(self.blockscaled):
856
+ block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord(
857
+ cta_rank_in_cluster
858
+ )
859
+ a_mcast_mask, b_mcast_mask = None, None
860
+ sfa_mcast_mask, sfb_mcast_mask = None, None
907
861
  if const_expr(self.is_a_mcast or self.is_b_mcast or use_2cta_instrs):
908
862
  a_mcast_mask = cpasync.create_tma_multicast_mask(
909
863
  cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2
@@ -918,28 +872,45 @@ class PersistentDenseGemmKernel:
918
872
  sfb_mcast_mask = cpasync.create_tma_multicast_mask(
919
873
  cluster_layout_sfb_vmnk, block_in_cluster_coord_sfb_vmnk, mcast_mode=1
920
874
  )
921
- else:
922
- sfa_mcast_mask, sfb_mcast_mask = None, None
923
- else:
924
- a_mcast_mask, b_mcast_mask = None, None
925
- sfa_mcast_mask, sfb_mcast_mask = None, None
926
875
 
927
876
  # Persistent tile scheduling loop
928
- tile_scheduler = TileSchedulerCls()
877
+ is_scheduler_warp = True
878
+ if const_expr(cute.size(cluster_layout_vmnk) > 1):
879
+ is_scheduler_warp = cute.arch.block_idx_in_cluster() == 0
880
+ tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
929
881
  work_tile = tile_scheduler.initial_work_tile_info()
930
882
  ab_producer_state = pipeline.make_pipeline_state(
931
- pipeline.PipelineUserType.Producer, self.num_ab_stage
883
+ pipeline.PipelineUserType.Producer, self.ab_stage
932
884
  )
933
- do_epi_load_barrier_arrive = cutlass.Boolean(True)
885
+ if const_expr(varlen_k):
886
+ # wait tensormap initialization complete before update
887
+ tensormap_manager.fence_tensormap_initialization()
888
+ # batch index of last tile
889
+ last_batch_idx = cutlass.Int32(-1)
890
+ do_epi_load_barrier_arrive = Boolean(True)
934
891
  while work_tile.is_valid_tile:
935
- # Get tile coord from tile scheduler
936
892
  tile_coord_mnkl = work_tile.tile_idx
893
+ batch_idx = tile_coord_mnkl[3]
894
+ if const_expr(varlen_k):
895
+ is_group_changed = batch_idx != last_batch_idx
896
+ last_batch_idx = batch_idx
897
+ if is_group_changed:
898
+ self.tensormap_update_AB(
899
+ tensormap_manager,
900
+ tensormap_ab_ptrs,
901
+ cu_seqlens_k,
902
+ batch_idx,
903
+ is_manager_warp=True,
904
+ )
905
+ # ///////////////////////////////////////////////////////////////////////////
906
+ # Local_tile partition global tensors
907
+ # ///////////////////////////////////////////////////////////////////////////
937
908
  mma_tile_coord_mnl = (
938
909
  tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
939
910
  tile_coord_mnkl[1],
940
911
  tile_coord_mnkl[3],
941
912
  )
942
- # Local_tile partition global tensors
913
+ # TODO: varlen_m
943
914
  # (bM, bK, RestK)
944
915
  gA_mkl = cute.local_tile(
945
916
  mA_mkl,
@@ -1007,7 +978,7 @@ class PersistentDenseGemmKernel:
1007
978
  sfa_cta_layout = a_cta_layout
1008
979
  # ((atom_v, rest_v), STAGE)
1009
980
  # ((atom_v, rest_v), RestK)
1010
- tAsSFA, tAgSFA = cute.nvgpu.cpasync.tma_partition(
981
+ tAsSFA, tAgSFA = cpasync.tma_partition(
1011
982
  tma_atom_sfa,
1012
983
  block_in_cluster_coord_vmnk[2],
1013
984
  sfa_cta_layout,
@@ -1022,7 +993,7 @@ class PersistentDenseGemmKernel:
1022
993
  )
1023
994
  # ((atom_v, rest_v), STAGE)
1024
995
  # ((atom_v, rest_v), RestK)
1025
- tBsSFB, tBgSFB = cute.nvgpu.cpasync.tma_partition(
996
+ tBsSFB, tBgSFB = cpasync.tma_partition(
1026
997
  tma_atom_sfb,
1027
998
  block_in_cluster_coord_sfb_vmnk[1],
1028
999
  sfb_cta_layout,
@@ -1060,12 +1031,15 @@ class PersistentDenseGemmKernel:
1060
1031
  # with loading A and B.
1061
1032
  if do_epi_load_barrier_arrive:
1062
1033
  epi_load_barrier.arrive()
1063
- do_epi_load_barrier_arrive = cutlass.Boolean(False)
1034
+ do_epi_load_barrier_arrive = Boolean(False)
1064
1035
  # Advance to next tile
1036
+ tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
1065
1037
  tile_scheduler.advance_to_next_work()
1066
1038
  work_tile = tile_scheduler.get_current_work()
1067
1039
  # Wait A/B buffer empty
1068
1040
  ab_pipeline.producer_tail(ab_producer_state)
1041
+ if is_scheduler_warp:
1042
+ tile_scheduler.producer_tail()
1069
1043
 
1070
1044
  #
1071
1045
  # Specialized TMA epi load warp
@@ -1073,15 +1047,16 @@ class PersistentDenseGemmKernel:
1073
1047
  if const_expr(mC_mnl is not None):
1074
1048
  if warp_idx == self.tma_epi_warp_id:
1075
1049
  epi_producer_state = pipeline.make_pipeline_state(
1076
- pipeline.PipelineUserType.Producer, self.num_c_stage
1050
+ pipeline.PipelineUserType.Producer, self.epi_c_stage
1077
1051
  )
1078
- do_epi_load_barrier_wait = cutlass.Boolean(True)
1052
+ do_epi_load_barrier_wait = Boolean(True)
1079
1053
  # Persistent tile scheduling loop
1080
1054
  tile_scheduler = TileSchedulerCls()
1081
1055
  work_tile = tile_scheduler.initial_work_tile_info()
1082
1056
  while work_tile.is_valid_tile:
1083
1057
  # Get tile coord from tile scheduler
1084
1058
  tile_coord_mnkl = work_tile.tile_idx
1059
+ # TODO: varlen_m
1085
1060
  mma_tile_coord_mnl = (
1086
1061
  tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
1087
1062
  tile_coord_mnkl[1],
@@ -1102,7 +1077,7 @@ class PersistentDenseGemmKernel:
1102
1077
  bGS_gC = cute.group_modes(bGS_gC, 1, cute.rank(bGS_gC))
1103
1078
  if do_epi_load_barrier_wait:
1104
1079
  epi_load_barrier.arrive_and_wait()
1105
- do_epi_load_barrier_wait = cutlass.Boolean(False)
1080
+ do_epi_load_barrier_wait = Boolean(False)
1106
1081
  epi_tile_num = const_expr(cute.size(bGS_gC, mode=[1]))
1107
1082
  for subtile_idx in cutlass.range(epi_tile_num, unroll=1):
1108
1083
  epi_pipeline.producer_acquire(epi_producer_state)
@@ -1149,10 +1124,9 @@ class PersistentDenseGemmKernel:
1149
1124
  tiled_mma,
1150
1125
  self.mma_tiler,
1151
1126
  self.sf_vec_size,
1152
- cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
1127
+ cute.slice_(sfa_smem_layout, (None, None, None, 0)),
1153
1128
  )
1154
1129
  tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
1155
-
1156
1130
  # Make SFB tmem tensor
1157
1131
  sfb_tmem_ptr = cute.recast_ptr(
1158
1132
  acc_tmem_ptr
@@ -1165,7 +1139,7 @@ class PersistentDenseGemmKernel:
1165
1139
  tiled_mma,
1166
1140
  self.mma_tiler,
1167
1141
  self.sf_vec_size,
1168
- cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
1142
+ cute.slice_(sfb_smem_layout, (None, None, None, 0)),
1169
1143
  )
1170
1144
  tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout)
1171
1145
  # Partition for S2T copy of SFA/SFB
@@ -1183,11 +1157,12 @@ class PersistentDenseGemmKernel:
1183
1157
  tiled_copy_s2t_sfa, tCsSFA_compact_s2t, tCtSFA_compact_s2t = None, None, None
1184
1158
  tiled_copy_s2t_sfb, tCsSFB_compact_s2t, tCtSFB_compact_s2t = None, None, None
1185
1159
 
1160
+ k_tile_cnt = cute.ceil_div(cute.size(mA_mkl.shape[1]), self.mma_tiler[2])
1186
1161
  # Persistent tile scheduling loop
1187
1162
  tile_scheduler = TileSchedulerCls()
1188
1163
  work_tile = tile_scheduler.initial_work_tile_info()
1189
1164
  ab_consumer_state = pipeline.make_pipeline_state(
1190
- pipeline.PipelineUserType.Consumer, self.num_ab_stage
1165
+ pipeline.PipelineUserType.Consumer, self.ab_stage
1191
1166
  )
1192
1167
  acc_producer_state = pipeline.make_pipeline_state(
1193
1168
  pipeline.PipelineUserType.Producer, self.num_acc_stage
@@ -1241,11 +1216,29 @@ class PersistentDenseGemmKernel:
1241
1216
  # (MMA, MMA_M, MMA_N, STAGE)
1242
1217
  tCtAcc_base = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
1243
1218
 
1244
- epilog_threads = cute.arch.WARP_SIZE * len(self.epilog_warp_id)
1245
1219
  epilogue_barrier = pipeline.NamedBarrier(
1246
- barrier_id=self.epilog_sync_bar_id, num_threads=epilog_threads
1220
+ barrier_id=int(NamedBarrierGemm.Epilogue),
1221
+ num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
1247
1222
  )
1248
1223
 
1224
+ is_tma_warp = Boolean(warp_idx == self.epilog_warp_id[0])
1225
+ if const_expr(varlen_m):
1226
+ # initialize tensormap for D
1227
+ if const_expr(has_D):
1228
+ tensormap_manager.init_tensormap_from_atom(
1229
+ tma_atom_d,
1230
+ tensormap_d_ptr,
1231
+ is_manager_warp=is_tma_warp,
1232
+ )
1233
+ for tma_atom, tensormap_epi_ptr in zip(
1234
+ self.epi_get_tma_atoms(epilogue_params), tensormap_epi_ptrs
1235
+ ):
1236
+ tensormap_manager.init_tensormap_from_atom(
1237
+ tma_atom,
1238
+ tensormap_epi_ptr,
1239
+ is_manager_warp=is_tma_warp,
1240
+ )
1241
+
1249
1242
  # Partition for epilogue
1250
1243
  epi_tidx = tidx
1251
1244
  tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = self.epilog_tmem_copy_and_partition(
@@ -1256,6 +1249,7 @@ class PersistentDenseGemmKernel:
1256
1249
  tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_copy_and_partition(
1257
1250
  tiled_copy_t2r, tTR_rD, epi_tidx, sD
1258
1251
  )
1252
+ tRS_rC, tSR_rC = None, None
1259
1253
  if const_expr(mC_mnl is not None):
1260
1254
  tTR_rC = cute.make_fragment_like(tTR_rD, self.c_dtype)
1261
1255
  tiled_copy_s2r, tSR_rC, tSR_sC = self.epilog_smem_copy_and_partition(
@@ -1272,22 +1266,33 @@ class PersistentDenseGemmKernel:
1272
1266
  acc_consumer_state = pipeline.make_pipeline_state(
1273
1267
  pipeline.PipelineUserType.Consumer, self.num_acc_stage
1274
1268
  )
1275
- # Threads/warps participating in tma store pipeline
1276
- d_producer_group = pipeline.CooperativeGroup(
1277
- pipeline.Agent.Thread,
1278
- 32 * len(self.epilog_warp_id),
1279
- 32 * len(self.epilog_warp_id),
1280
- )
1281
- d_pipeline = pipeline.PipelineTmaStore.create(
1282
- num_stages=self.num_d_stage, producer_group=d_producer_group
1283
- )
1269
+ epi_store_pipeline = self.make_epi_store_pipeline()
1284
1270
  epi_read_state = pipeline.make_pipeline_state(
1285
- pipeline.PipelineUserType.Consumer, self.num_c_stage
1271
+ pipeline.PipelineUserType.Consumer, self.epi_c_stage
1286
1272
  )
1287
-
1273
+ if const_expr(varlen_m):
1274
+ # wait tensormap initialization complete before update
1275
+ tensormap_manager.fence_tensormap_initialization()
1276
+ # batch index of last tile
1277
+ last_batch_idx = cutlass.Int32(-1)
1288
1278
  while work_tile.is_valid_tile:
1289
1279
  # Get tile coord from tile scheduler
1290
1280
  tile_coord_mnkl = work_tile.tile_idx
1281
+ batch_idx = tile_coord_mnkl[3]
1282
+ if const_expr(varlen_m):
1283
+ is_group_changed = batch_idx != last_batch_idx
1284
+ last_batch_idx = batch_idx
1285
+ if is_group_changed:
1286
+ self.tensormap_update_D_epi(
1287
+ tensormap_manager,
1288
+ tensormap_d_ptr,
1289
+ tensormap_epi_ptrs,
1290
+ epilogue_params,
1291
+ cu_seqlens_m,
1292
+ batch_idx,
1293
+ is_manager_warp=is_tma_warp,
1294
+ )
1295
+
1291
1296
  mma_tile_coord_mnl = (
1292
1297
  tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
1293
1298
  tile_coord_mnkl[1],
@@ -1311,6 +1316,25 @@ class PersistentDenseGemmKernel:
1311
1316
  # Wait for accumulator buffer full
1312
1317
  acc_pipeline.consumer_wait(acc_consumer_state)
1313
1318
 
1319
+ tma_desc_d_ptr, tma_desc_epi_ptrs = None, [None] * self.num_epi_tensormaps
1320
+ if const_expr(varlen_m):
1321
+ # ensure the update to tensormap has completed before using it
1322
+ if is_group_changed and is_tma_warp:
1323
+ if const_expr(has_D):
1324
+ tensormap_manager.fence_tensormap_update(tensormap_d_ptr)
1325
+ for tensormap_epi_ptr in tensormap_epi_ptrs:
1326
+ tensormap_manager.fence_tensormap_update(tensormap_epi_ptr)
1327
+ if const_expr(has_D):
1328
+ tma_desc_d_ptr = tensormap_manager.get_tensormap_ptr(
1329
+ tensormap_d_ptr, cute.AddressSpace.generic
1330
+ )
1331
+ tma_desc_epi_ptrs = [
1332
+ tensormap_manager.get_tensormap_ptr(
1333
+ tensormap_epi_ptr, cute.AddressSpace.generic
1334
+ )
1335
+ for tensormap_epi_ptr in tensormap_epi_ptrs
1336
+ ]
1337
+
1314
1338
  tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc))
1315
1339
  bSG_gD = cute.group_modes(bSG_gD, 1, cute.rank(bSG_gD))
1316
1340
 
@@ -1323,7 +1347,6 @@ class PersistentDenseGemmKernel:
1323
1347
  cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc)
1324
1348
  # Convert to D type
1325
1349
  acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load()
1326
- acc_vec = epilogue_op(acc_vec)
1327
1350
  if const_expr(mC_mnl is not None):
1328
1351
  epi_pipeline.consumer_wait(epi_read_state)
1329
1352
  cute.copy(
@@ -1340,7 +1363,7 @@ class PersistentDenseGemmKernel:
1340
1363
  acc_vec = acc_vec + tRS_rC.load().to(self.acc_dtype)
1341
1364
  tRS_rD.store(acc_vec.to(self.d_dtype))
1342
1365
  # Store D to shared memory
1343
- d_buffer = (num_prev_subtiles + subtile_idx) % self.num_d_stage
1366
+ d_buffer = (num_prev_subtiles + subtile_idx) % self.epi_stage
1344
1367
  cute.copy(tiled_copy_r2s, tRS_rD, tRS_sD[(None, None, None, d_buffer)])
1345
1368
  # Fence and barrier to make sure shared memory store is visible to TMA store
1346
1369
  cute.arch.fence_proxy(
@@ -1348,11 +1371,11 @@ class PersistentDenseGemmKernel:
1348
1371
  )
1349
1372
  epilogue_barrier.arrive_and_wait()
1350
1373
  # TMA store D to global memory
1351
- if warp_idx == self.epilog_warp_id[0]:
1374
+ if is_tma_warp:
1352
1375
  cute.copy(tma_atom_d, bSG_sD[None, d_buffer], bSG_gD[None, subtile_idx])
1353
1376
  # Fence and barrier to make sure shared memory store is visible to TMA store
1354
- d_pipeline.producer_commit()
1355
- d_pipeline.producer_acquire()
1377
+ epi_store_pipeline.producer_commit()
1378
+ epi_store_pipeline.producer_acquire()
1356
1379
  epilogue_barrier.arrive_and_wait()
1357
1380
 
1358
1381
  # Async arrive accumulator buffer empty
@@ -1369,7 +1392,7 @@ class PersistentDenseGemmKernel:
1369
1392
  cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs)
1370
1393
  epilogue_barrier.arrive_and_wait()
1371
1394
  if warp_idx == self.epilog_warp_id[0]:
1372
- if use_2cta_instrs:
1395
+ if const_expr(use_2cta_instrs):
1373
1396
  cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1)
1374
1397
  cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0)
1375
1398
  cute.arch.dealloc_tmem(
@@ -1377,7 +1400,8 @@ class PersistentDenseGemmKernel:
1377
1400
  )
1378
1401
 
1379
1402
  # Wait for D store complete
1380
- d_pipeline.producer_tail()
1403
+ if is_tma_warp:
1404
+ epi_store_pipeline.producer_tail()
1381
1405
 
1382
1406
  @cute.jit
1383
1407
  def load_AB(
@@ -1407,7 +1431,7 @@ class PersistentDenseGemmKernel:
1407
1431
  assert all(x is not None for x in (tma_atom_sfb, tBgSFB, tBsSFB))
1408
1432
  k_tile_cnt = cute.size(tAgA, mode=[1])
1409
1433
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
1410
- peek_ab_empty_status = cutlass.Boolean(True)
1434
+ peek_ab_empty_status = Boolean(True)
1411
1435
  if 0 < k_tile_cnt:
1412
1436
  peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
1413
1437
  # /////////////////////////////////////////////////////////////////////////
@@ -1449,7 +1473,7 @@ class PersistentDenseGemmKernel:
1449
1473
  # Mainloop pipeline's producer commit is a NOP
1450
1474
  ab_pipeline.producer_commit(ab_producer_state)
1451
1475
  ab_producer_state.advance()
1452
- peek_ab_empty_status = cutlass.Boolean(True)
1476
+ peek_ab_empty_status = Boolean(True)
1453
1477
  if k_tile + 1 < k_tile_cnt:
1454
1478
  peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
1455
1479
  return ab_producer_state
@@ -1466,7 +1490,7 @@ class PersistentDenseGemmKernel:
1466
1490
  tCrB: cute.Tensor,
1467
1491
  acc: cute.Tensor,
1468
1492
  k_tile_cnt: Int32,
1469
- is_leader_cta: cutlass.Boolean,
1493
+ is_leader_cta: Boolean,
1470
1494
  tiled_copy_s2t_sfa: Optional[cute.TiledCopy] = None,
1471
1495
  tiled_copy_s2t_sfb: Optional[cute.TiledCopy] = None,
1472
1496
  tCsSFA_compact_s2t: Optional[cute.Tensor] = None,
@@ -1480,7 +1504,7 @@ class PersistentDenseGemmKernel:
1480
1504
  assert all(x is not None for x in (tCsSFA_compact_s2t, tCsSFB_compact_s2t))
1481
1505
  assert all(x is not None for x in (tCtSFA_compact_s2t, tCtSFB_compact_s2t))
1482
1506
  # Peek (try_wait) AB buffer full for k_tile = 0
1483
- peek_ab_full_status = cutlass.Boolean(True)
1507
+ peek_ab_full_status = Boolean(True)
1484
1508
  if 0 < k_tile_cnt and is_leader_cta:
1485
1509
  peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
1486
1510
  # Wait for accumulator buffer empty
@@ -1509,7 +1533,7 @@ class PersistentDenseGemmKernel:
1509
1533
  ab_pipeline.consumer_release(ab_consumer_state)
1510
1534
  ab_consumer_state.advance()
1511
1535
  # Peek (try_wait) AB buffer full for k_tile = k_tile + 1
1512
- peek_ab_full_status = cutlass.Boolean(True)
1536
+ peek_ab_full_status = Boolean(True)
1513
1537
  if k_tile + 1 < k_tile_cnt and is_leader_cta:
1514
1538
  peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
1515
1539
  # Async arrive accumulator buffer full
@@ -1560,7 +1584,7 @@ class PersistentDenseGemmKernel:
1560
1584
  tidx: Int32,
1561
1585
  tAcc: cute.Tensor,
1562
1586
  epi_tile: cute.Tile,
1563
- use_2cta_instrs: Union[cutlass.Boolean, bool],
1587
+ use_2cta_instrs: Union[Boolean, bool],
1564
1588
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1565
1589
  """
1566
1590
  Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).
@@ -1708,6 +1732,22 @@ class PersistentDenseGemmKernel:
1708
1732
  )
1709
1733
  return bSG_sD, bSG_gD
1710
1734
 
1735
+ def make_acc_pipeline(
1736
+ self, cluster_layout_vmnk: cute.Layout, acc_pipeline_mbar_ptr: cute.Pointer
1737
+ ):
1738
+ acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
1739
+ num_acc_consumer_threads = self.num_epi_warps * (2 if self.use_2cta_instrs else 1)
1740
+ acc_pipeline_consumer_group = pipeline.CooperativeGroup(
1741
+ pipeline.Agent.Thread, num_acc_consumer_threads
1742
+ )
1743
+ return pipeline.PipelineUmmaAsync.create(
1744
+ barrier_storage=acc_pipeline_mbar_ptr,
1745
+ num_stages=self.num_acc_stage,
1746
+ producer_group=acc_pipeline_producer_group,
1747
+ consumer_group=acc_pipeline_consumer_group,
1748
+ cta_layout_vmnk=cluster_layout_vmnk,
1749
+ )
1750
+
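The new make_acc_pipeline helper packages the accumulator-pipeline setup: a single-thread producer agent for the MMA side, and a consumer group sized for all epilogue warps, doubled when 2-CTA UMMA instructions are used since the peer CTA's epilogue warps also drain the accumulator. A tiny sketch of the consumer-count arithmetic, with an assumed num_epi_warps of 4:

    # Mirrors the consumer-count expression above; num_epi_warps = 4 is an assumed
    # example value (the real one is a class attribute of the kernel).
    num_epi_warps = 4
    for use_2cta_instrs in (False, True):
        num_acc_consumer_threads = num_epi_warps * (2 if use_2cta_instrs else 1)
        print(f"use_2cta_instrs={use_2cta_instrs}: consumer count = {num_acc_consumer_threads}")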
1711
1751
  @staticmethod
1712
1752
  def _compute_stages(
1713
1753
  tiled_mma: cute.TiledMma,
@@ -1717,8 +1757,8 @@ class PersistentDenseGemmKernel:
1717
1757
  epi_tile: cute.Tile,
1718
1758
  d_dtype: Type[cutlass.Numeric],
1719
1759
  c_dtype: Optional[Type[cutlass.Numeric]],
1720
- d_layout: cutlass.utils.LayoutEnum,
1721
- c_layout: Optional[cutlass.utils.LayoutEnum],
1760
+ d_layout: LayoutEnum,
1761
+ c_layout: Optional[LayoutEnum],
1722
1762
  sf_dtype: Optional[Type[cutlass.Numeric]],
1723
1763
  sf_vec_size: Optional[int],
1724
1764
  smem_capacity: int,
@@ -1739,7 +1779,7 @@ class PersistentDenseGemmKernel:
1739
1779
  :param d_dtype: Data type of operand D (output).
1740
1780
  :type d_dtype: type[cutlass.Numeric]
1741
1781
  :param d_layout: Layout enum of operand D.
1742
- :type d_layout: cutlass.utils.LayoutEnum
1782
+ :type d_layout: LayoutEnum
1743
1783
  :param smem_capacity: Total available shared memory capacity in bytes.
1744
1784
  :type smem_capacity: int
1745
1785
  :param occupancy: Target number of CTAs per SM (occupancy).
@@ -1757,8 +1797,8 @@ class PersistentDenseGemmKernel:
1757
1797
  num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2
1758
1798
 
1759
1799
  # Default D stages
1760
- num_d_stage = 2
1761
- num_c_stage = 2 if c_dtype is not None else 0
1800
+ epi_stage = 2
1801
+ epi_c_stage = 2 if c_dtype is not None else 0
1762
1802
 
1763
1803
  # Calculate smem layout and size for one stage of A, B, and C
1764
1804
  a_smem_layout_staged_one = sm100_utils.make_smem_layout_a(
@@ -1802,28 +1842,28 @@ class PersistentDenseGemmKernel:
1802
1842
  ) + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
1803
1843
  mbar_helpers_bytes = 1024
1804
1844
  d_bytes_per_stage = cute.size_in_bytes(d_dtype, d_smem_layout_staged_one)
1805
- epi_bytes = d_bytes_per_stage * num_d_stage
1845
+ epi_bytes = d_bytes_per_stage * epi_stage
1806
1846
  if const_expr(c_dtype is not None):
1807
1847
  c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
1808
- epi_bytes += c_bytes_per_stage * num_c_stage
1848
+ epi_bytes += c_bytes_per_stage * epi_c_stage
1809
1849
 
1810
1850
  # Calculate A/B/SFA/SFB stages:
1811
1851
  # Start with total smem per CTA (capacity / occupancy)
1812
1852
  # Subtract reserved bytes and initial C stages bytes
1813
1853
  # Divide remaining by bytes needed per A/B/SFA/SFB stage
1814
- num_ab_stage = (
1854
+ ab_stage = (
1815
1855
  smem_capacity // occupancy - (mbar_helpers_bytes + epi_bytes)
1816
1856
  ) // ab_bytes_per_stage
1817
1857
 
1818
1858
  # Refine epilogue stages:
1819
1859
  # Calculate remaining smem after allocating for A/B stages and reserved bytes
1820
1860
  # Add remaining unused smem to epilogue
1821
- num_d_stage += (
1861
+ epi_stage += (
1822
1862
  smem_capacity
1823
- - occupancy * ab_bytes_per_stage * num_ab_stage
1863
+ - occupancy * ab_bytes_per_stage * ab_stage
1824
1864
  - occupancy * (mbar_helpers_bytes + epi_bytes)
1825
1865
  ) // (occupancy * d_bytes_per_stage)
1826
- return num_acc_stage, num_ab_stage, num_d_stage, num_c_stage
1866
+ return num_acc_stage, ab_stage, epi_stage, epi_c_stage
1827
1867
 
1828
1868
  @staticmethod
1829
1869
  def _compute_num_tmem_alloc_cols(
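The stage computation itself is unchanged apart from the renames (ab_stage, epi_stage, epi_c_stage): reserve the mbarrier helper bytes plus the initial epilogue stages, divide the remaining per-CTA smem by the per-stage A/B (plus scale-factor) footprint, then hand any leftover bytes back to extra D stages. A worked example with made-up byte counts (the real ones come from cute.size_in_bytes on the staged layouts):

    # Worked example of the smem stage budgeting; all byte counts are illustrative.
    smem_capacity = 232448          # assumed per-SM budget (~227 KiB)
    occupancy = 1                   # CTAs per SM
    ab_bytes_per_stage = 32768      # A + B (+ scale factors when blockscaled), made up
    d_bytes_per_stage = 8192        # D epilogue bytes per stage, made up
    c_bytes_per_stage = 16384       # C epilogue bytes per stage, made up
    mbar_helpers_bytes = 1024

    epi_stage = 2
    epi_c_stage = 2                 # 0 when there is no C operand
    epi_bytes = d_bytes_per_stage * epi_stage + c_bytes_per_stage * epi_c_stage

    ab_stage = (
        smem_capacity // occupancy - (mbar_helpers_bytes + epi_bytes)
    ) // ab_bytes_per_stage

    # Hand leftover smem back to extra D stages, as in the refinement step above.
    epi_stage += (
        smem_capacity
        - occupancy * ab_bytes_per_stage * ab_stage
        - occupancy * (mbar_helpers_bytes + epi_bytes)
    ) // (occupancy * d_bytes_per_stage)

    print(ab_stage, epi_stage, epi_c_stage)   # -> 5 4 2 with these numbers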
@@ -1880,7 +1920,7 @@ class PersistentDenseGemmKernel:
1880
1920
  }:
1881
1921
  is_valid = False
1882
1922
  if (
1883
- acc_dtype not in {cutlass.Float32, cutlass.Float16, Int32}
1923
+ acc_dtype not in {Float32, cutlass.Float16, Int32}
1884
1924
  or acc_dtype == cutlass.Float16
1885
1925
  and ab_dtype not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2}
1886
1926
  or acc_dtype == Int32
@@ -1888,10 +1928,10 @@ class PersistentDenseGemmKernel:
1888
1928
  ):
1889
1929
  is_valid = False
1890
1930
  if (
1891
- acc_dtype == cutlass.Float32
1931
+ acc_dtype == Float32
1892
1932
  and d_dtype
1893
1933
  not in {
1894
- cutlass.Float32,
1934
+ Float32,
1895
1935
  cutlass.Float16,
1896
1936
  cutlass.BFloat16,
1897
1937
  cutlass.Float8E4M3FN,
@@ -1911,7 +1951,7 @@ class PersistentDenseGemmKernel:
1911
1951
  not in {
1912
1952
  cutlass.BFloat16,
1913
1953
  cutlass.Float16,
1914
- cutlass.Float32,
1954
+ Float32,
1915
1955
  Int32,
1916
1956
  cutlass.Int8,
1917
1957
  cutlass.Uint8,
@@ -1964,7 +2004,7 @@ class PersistentDenseGemmKernel:
1964
2004
 
1965
2005
  # Check valid d_dtype
1966
2006
  if d_dtype not in {
1967
- cutlass.Float32,
2007
+ Float32,
1968
2008
  cutlass.Float16,
1969
2009
  cutlass.BFloat16,
1970
2010
  cutlass.Float8E5M2,
@@ -2004,7 +2044,6 @@ class PersistentDenseGemmKernel:
2004
2044
 
2005
2045
  @staticmethod
2006
2046
  def is_valid_mma_tiler_and_cluster_shape(
2007
- use_2cta_instrs: bool,
2008
2047
  mma_tiler_mn: Tuple[int, int],
2009
2048
  cluster_shape_mn: Tuple[int, int],
2010
2049
  blockscaled: bool,
@@ -2012,8 +2051,6 @@ class PersistentDenseGemmKernel:
2012
2051
  """
2013
2052
  Check if the mma tiler and cluster shape are valid
2014
2053
 
2015
- :param use_2cta_instrs: Whether to use 2 CTA groups
2016
- :type use_2cta_instrs: bool
2017
2054
  :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
2018
2055
  :type mma_tiler_mn: Tuple[int, int]
2019
2056
  :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
@@ -2024,10 +2061,7 @@ class PersistentDenseGemmKernel:
2024
2061
  """
2025
2062
  is_valid = True
2026
2063
  # Skip invalid mma tile shape
2027
- if not (
2028
- (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128])
2029
- or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256])
2030
- ):
2064
+ if mma_tiler_mn[0] not in [64, 128, 256]:
2031
2065
  is_valid = False
2032
2066
  if not blockscaled:
2033
2067
  if mma_tiler_mn[1] not in range(32, 257, 32):
@@ -2035,9 +2069,6 @@ class PersistentDenseGemmKernel:
2035
2069
  else:
2036
2070
  if mma_tiler_mn[1] not in [128, 256]:
2037
2071
  is_valid = False
2038
- # Skip illegal cluster shape
2039
- if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0:
2040
- is_valid = False
2041
2072
  # Skip invalid cluster shape
2042
2073
  is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0
2043
2074
  if (
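With use_2cta_instrs removed, the tiler check collapses to a single membership test on the MMA M extent and the old 2-CTA divisibility requirement on the cluster disappears; only the power-of-two style constraints on the cluster shape remain (their condition continues in the next hunk). A rough sketch of the surviving predicate (the cluster-size upper bound and blockscaled details are abbreviated):

    def is_valid_mma_tiler_and_cluster_shape(mma_tiler_mn, cluster_shape_mn, blockscaled):
        # Sketch only; the real method also bounds the total cluster size.
        is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0
        if mma_tiler_mn[0] not in (64, 128, 256):
            return False
        if not blockscaled:
            if mma_tiler_mn[1] not in range(32, 257, 32):
                return False
        elif mma_tiler_mn[1] not in (128, 256):
            return False
        if not (is_power_of_2(cluster_shape_mn[0]) and is_power_of_2(cluster_shape_mn[1])):
            return False
        return True

    print(is_valid_mma_tiler_and_cluster_shape((256, 256), (2, 1), blockscaled=False))  # True
    print(is_valid_mma_tiler_and_cluster_shape((96, 128), (2, 1), blockscaled=False))   # False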
@@ -2113,7 +2144,6 @@ class PersistentDenseGemmKernel:
2113
2144
  ab_dtype: Type[cutlass.Numeric],
2114
2145
  acc_dtype: Type[cutlass.Numeric],
2115
2146
  d_dtype: Type[cutlass.Numeric],
2116
- use_2cta_instrs: bool,
2117
2147
  mma_tiler_mn: Tuple[int, int],
2118
2148
  cluster_shape_mn: Tuple[int, int],
2119
2149
  m: int,
@@ -2133,8 +2163,6 @@ class PersistentDenseGemmKernel:
2133
2163
  :type acc_dtype: Type[cutlass.Numeric]
2134
2164
  :param d_dtype: The data type of the output tensor
2135
2165
  :type d_dtype: Type[cutlass.Numeric]
2136
- :param use_2cta_instrs: Whether to use 2 CTA groups
2137
- :type use_2cta_instrs: bool
2138
2166
  :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
2139
2167
  :type mma_tiler_mn: Tuple[int, int]
2140
2168
  :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
@@ -2159,15 +2187,15 @@ class PersistentDenseGemmKernel:
2159
2187
  """
2160
2188
  can_implement = True
2161
2189
  # Skip unsupported types
2162
- if not PersistentDenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, d_dtype):
2190
+ if not GemmSm100.is_valid_dtypes(ab_dtype, acc_dtype, d_dtype):
2163
2191
  can_implement = False
2164
2192
  # Skip invalid mma tile shape and cluster shape
2165
- if not PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape(
2166
- use_2cta_instrs, mma_tiler_mn, cluster_shape_mn, blockscaled=False
2193
+ if not GemmSm100.is_valid_mma_tiler_and_cluster_shape(
2194
+ mma_tiler_mn, cluster_shape_mn, blockscaled=False
2167
2195
  ):
2168
2196
  can_implement = False
2169
2197
  # Skip illegal problem shape for load/store alignment
2170
- if not PersistentDenseGemmKernel.is_valid_tensor_alignment(
2198
+ if not GemmSm100.is_valid_tensor_alignment(
2171
2199
  m, n, k, l, ab_dtype, d_dtype, a_major, b_major, d_major
2172
2200
  ):
2173
2201
  can_implement = False
@@ -2186,7 +2214,6 @@ def run(
2186
2214
  c_major: str,
2187
2215
  mma_tiler_mn: Tuple[int, int] = (256, 256),
2188
2216
  cluster_shape_mn: Tuple[int, int] = (2, 1),
2189
- use_2cta_instrs: bool = True,
2190
2217
  tolerance: float = 1e-01,
2191
2218
  warmup_iterations: int = 0,
2192
2219
  iterations: int = 1,
@@ -2215,9 +2242,6 @@ def run(
2215
2242
  :param cluster_shape_mn: Cluster shape. If not specified in the decorator parameters, the autotuner will use the
2216
2243
  default value of (2, 1). Otherwise, the autotuner will use the value specified in the decorator parameters.
2217
2244
  :type cluster_shape_mn: Tuple[int, int], optional
2218
- :param use_2cta_instrs: Whether to use 2CTA instructions. If not specified in the decorator parameters, the autotuner
2219
- will use the default value of True. Otherwise, the autotuner will use the value specified in the decorator parameters.
2220
- :type use_2cta_instrs: bool, optional
2221
2245
  :param tolerance: Tolerance value for reference validation comparison, defaults to 1e-01
2222
2246
  :type tolerance: float, optional
2223
2247
  :param warmup_iterations: Number of warmup iterations before benchmarking, defaults to 0
@@ -2236,7 +2260,6 @@ def run(
2236
2260
  print(f"AB dtype: {ab_dtype}, C dtype: {d_dtype}, Acc dtype: {acc_dtype}")
2237
2261
  print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {d_major}")
2238
2262
  print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}")
2239
- print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}")
2240
2263
  print(f"Tolerance: {tolerance}")
2241
2264
  print(f"Warmup iterations: {warmup_iterations}")
2242
2265
  print(f"Iterations: {iterations}")
@@ -2248,11 +2271,10 @@ def run(
2248
2271
  m, n, k, l = mnkl
2249
2272
 
2250
2273
  # Skip unsupported testcase
2251
- if not PersistentDenseGemmKernel.can_implement(
2274
+ if not GemmSm100.can_implement(
2252
2275
  ab_dtype,
2253
2276
  acc_dtype,
2254
2277
  d_dtype,
2255
- use_2cta_instrs,
2256
2278
  mma_tiler_mn,
2257
2279
  cluster_shape_mn,
2258
2280
  m,
@@ -2264,7 +2286,7 @@ def run(
2264
2286
  d_major,
2265
2287
  ):
2266
2288
  raise TypeError(
2267
- f"Unsupported testcase {ab_dtype}, {acc_dtype}, {d_dtype}, {use_2cta_instrs}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {d_major}"
2289
+ f"Unsupported testcase {ab_dtype}, {acc_dtype}, {d_dtype}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {d_major}"
2268
2290
  )
2269
2291
 
2270
2292
  if not torch.cuda.is_available():
@@ -2339,12 +2361,8 @@ def run(
2339
2361
  c, mC, c_torch = None, None, None
2340
2362
 
2341
2363
  # Configure gemm kernel
2342
- gemm = PersistentDenseGemmKernel(
2343
- acc_dtype,
2344
- use_2cta_instrs,
2345
- mma_tiler_mn,
2346
- cluster_shape_mn,
2347
- )
2364
+ cluster_shape_mnk = (*cluster_shape_mn, 1)
2365
+ gemm = GemmSm100(acc_dtype, mma_tiler_mn, cluster_shape_mnk)
2348
2366
 
2349
2367
  # Compute max active clusters on current device
2350
2368
  hardware_info = cutlass.utils.HardwareInfo()
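The kernel is now constructed without the explicit use_2cta_instrs flag, which presumably is derived inside the class (for instance from the MMA tiler's M extent), and the cluster shape gains a trivial K component. Restating the two changed lines with the same names that are in scope in run():

    # Restated from the hunk above; acc_dtype, mma_tiler_mn and cluster_shape_mn are
    # the values already parsed in run(), and GemmSm100 is the kernel class in this file.
    cluster_shape_mnk = (*cluster_shape_mn, 1)     # append the trivial K dimension
    gemm = GemmSm100(acc_dtype, mma_tiler_mn, cluster_shape_mnk)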
@@ -2356,6 +2374,17 @@ def run(
2356
2374
  else:
2357
2375
  tile_count_semaphore = None
2358
2376
 
2377
+ scheduler_args = TileSchedulerOptions(
2378
+ Int32(max_active_clusters),
2379
+ tile_count_semaphore=make_ptr(
2380
+ Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
2381
+ )
2382
+ if tile_count_semaphore is not None
2383
+ else None,
2384
+ )
2385
+ epi_args = gemm.EpilogueArguments()
2386
+ varlen_args = VarlenArguments()
2387
+
2359
2388
  # Get current CUDA stream from PyTorch
2360
2389
  torch_stream = torch.cuda.current_stream()
2361
2390
  # Get the raw stream pointer as a CUstream
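The loose tile_count_semaphore / max_active_clusters arguments are replaced by three bundles: scheduler options, epilogue arguments, and varlen arguments. A condensed restatement of the wiring above (same names as in run(); the empty bundles are the defaults for a plain dense GEMM):

    # Condensed restatement of the argument-bundle construction in this hunk.
    semaphore_ptr = (
        make_ptr(Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
        if tile_count_semaphore is not None
        else None
    )
    scheduler_args = TileSchedulerOptions(
        Int32(max_active_clusters), tile_count_semaphore=semaphore_ptr
    )
    epi_args = gemm.EpilogueArguments()     # defaults: no fused epilogue arguments here
    varlen_args = VarlenArguments()         # defaults: fixed-shape (non-varlen) problem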
@@ -2367,15 +2396,14 @@ def run(
2367
2396
  mB,
2368
2397
  mD,
2369
2398
  mC,
2370
- make_ptr(Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
2371
- if tile_count_semaphore is not None
2372
- else None,
2373
- max_active_clusters,
2399
+ epi_args,
2400
+ scheduler_args,
2401
+ varlen_args,
2374
2402
  current_stream,
2375
2403
  )
2376
2404
 
2377
2405
  if not skip_ref_check:
2378
- compiled_gemm(mA, mB, mD, mC, tile_count_semaphore, current_stream)
2406
+ compiled_gemm(mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream)
2379
2407
  if ab_dtype in {
2380
2408
  cutlass.Int8,
2381
2409
  cutlass.Uint8,
@@ -2393,7 +2421,7 @@ def run(
2393
2421
  gpu_d = d_torch.cpu()
2394
2422
 
2395
2423
  # Convert ref to c_type
2396
- if d_dtype == cutlass.Float32:
2424
+ if d_dtype == Float32:
2397
2425
  ref_d = ref
2398
2426
  elif d_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}:
2399
2427
  # m major: (l, n, m) -> (m, n, l)
@@ -2463,7 +2491,9 @@ def run(
2463
2491
  print(f"CuBLAS Average time: {timing_cublas:.3f} ms, TFLOPS: {tflops_cublas:.1f}")
2464
2492
 
2465
2493
  time.sleep(0.5)
2466
- fn = lambda: compiled_gemm(mA, mB, mD, mC, tile_count_semaphore, current_stream)
2494
+ fn = lambda: compiled_gemm(
2495
+ mA, mB, mD, mC, epi_args, scheduler_args, varlen_args, current_stream
2496
+ )
2467
2497
  timing = do_bench(fn, warmup=warmup, rep=repeats)
2468
2498
  tflops = flops / (timing * 1e9) # Convert to TFlops
2469
2499
  print(f"Cute-DSL Average time: {timing:.3f} ms, TFLOPS: {tflops:.1f}")
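The benchmark lambda now threads the same argument bundles through every launch; the TFLOPS figure printed here is plain arithmetic on the measured time. A small restatement of that conversion (the problem size, timing, and the 2*m*n*k*l flop count are illustrative assumptions; the real flops value is computed earlier in run()):

    # Illustrative TFLOPS arithmetic matching the conversion above.
    m, n, k, l = 8192, 8192, 8192, 1        # made-up problem size
    flops = 2 * m * n * k * l               # standard dense GEMM flop count (assumed)
    timing = 0.55                           # made-up do_bench result in milliseconds
    tflops = flops / (timing * 1e9)         # ms->s (1e-3) and FLOP->TFLOP (1e12) combine to 1e9
    print(f"{tflops:.1f} TFLOPS")           # ~1999.1 with these numbers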
@@ -2505,12 +2535,7 @@ if __name__ == "__main__":
2505
2535
  parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
2506
2536
  parser.add_argument("--d_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
2507
2537
  parser.add_argument("--c_dtype", type=cutlass.dtype, default=None)
2508
- parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32)
2509
- parser.add_argument(
2510
- "--use_2cta_instrs",
2511
- action="store_true",
2512
- help="Enable 2CTA MMA instructions feature",
2513
- )
2538
+ parser.add_argument("--acc_dtype", type=cutlass.dtype, default=Float32)
2514
2539
  parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k")
2515
2540
  parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k")
2516
2541
  parser.add_argument("--d_major", choices=["n", "m"], type=str, default="n")
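The --use_2cta_instrs flag is gone and --acc_dtype now defaults to the top-level Float32 import. The dtype and layout flags parse directly through cutlass.dtype and plain choices; a minimal standalone parser with only the flags whose definitions are visible in this hunk:

    # Minimal reproduction of the flags defined above (this hunk only; the full
    # script adds more options such as the shape, tiler, and cluster flags).
    import argparse
    import cutlass

    parser = argparse.ArgumentParser()
    parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
    parser.add_argument("--d_dtype", type=cutlass.dtype, default=cutlass.BFloat16)
    parser.add_argument("--c_dtype", type=cutlass.dtype, default=None)
    parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32)
    parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k")
    parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k")
    parser.add_argument("--d_major", choices=["n", "m"], type=str, default="n")
    args = parser.parse_args([])            # defaults only; no GPU needed to parse
    print(args.ab_dtype, args.acc_dtype)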
@@ -2552,7 +2577,6 @@ if __name__ == "__main__":
2552
2577
  args.c_major,
2553
2578
  args.mma_tiler_mn,
2554
2579
  args.cluster_shape_mn,
2555
- args.use_2cta_instrs,
2556
2580
  args.tolerance,
2557
2581
  args.warmup_iterations,
2558
2582
  args.iterations,