quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
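The diff below covers the renamed SM90 GEMM module (quack/dense_gemm_sm90.py → quack/gemm_sm90.py). As a point of reference while reading it, here is a minimal usage sketch assembled from the GemmSm90 docstring that appears in the diff; the import path is an assumption based on the new file name, and a_tensor, b_tensor, c_tensor, and stream stand for CuTe DSL tensors and a CUDA stream that the caller must construct — this is a sketch, not a verified example from the package.

    # Hypothetical sketch; mirrors the doctest in the GemmSm90 docstring shown below.
    from cutlass import Float32
    from quack.gemm_sm90 import GemmSm90  # assumed path after the 0.2.3 rename

    gemm = GemmSm90(
        acc_dtype=Float32,           # accumulate in fp32
        tile_shape_mn=(128, 256),    # CTA tile (M, N); K is filled in later by _setup_attributes
        cluster_shape_mnk=(1, 1, 1),
    )
    gemm(a_tensor, b_tensor, c_tensor, stream)  # tensors and stream supplied by the caller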
@@ -2,12 +2,10 @@
  # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/hopper/dense_gemm.py

  import enum
- from typing import Tuple, Type, Callable, Optional, Union
- from dataclasses import dataclass
+ from typing import Tuple, Type, Callable, Optional, Union, Literal
  from functools import partial
  import math

- from torch import Tensor

  import cuda.bindings.driver as cuda

@@ -16,10 +14,9 @@ import cutlass.cute as cute
  import cutlass.pipeline as pipeline
  from cutlass.cute.nvgpu import cpasync, warp, warpgroup
  import cutlass.utils.hopper_helpers as sm90_utils
- from cutlass import Int32, Float32, Boolean, const_expr
+ from cutlass import Int32, Float32, Float16, Boolean, const_expr
+ from cutlass.cutlass_dsl import if_generate
  from cutlass.utils import LayoutEnum
- import cutlass.torch as cutlass_torch
- from cutlass.cute.runtime import make_ptr


  from quack.cute_dsl_utils import ParamsBase, ArgumentsBase
@@ -30,14 +27,12 @@ from quack.tile_scheduler import (
  VarlenMTileSchedulerArguments,
  VarlenMTileScheduler,
  )
- from quack.varlen_utils import VarlenArguments
- from quack.tensormap_manager import TensorMapManagerSm90
+ from quack.varlen_utils import VarlenArguments, VarlenManager

  # return PipelineStateWAdvance instead of PipelineState
  from quack.pipeline import make_pipeline_state, PipelineTmaCpAsync
- import quack.utils as utils
- from quack.cute_dsl_utils import get_max_active_clusters
- from quack.gemm_wrapper_utils import GemmWrapperBase
+ import quack.copy_utils as copy_utils
+ import quack.sm90_utils as quack_sm90_utils

  """
  A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Hopper architecture
@@ -86,12 +81,13 @@ class NamedBarrierGemm(enum.IntEnum):
  MmaWG1 = enum.auto()
  EpiWG0 = enum.auto()
  EpiWG1 = enum.auto()
+ TmemPtr = enum.auto()


  class GemmSm90:
  """
  This class implements batched matrix multiplication (C = A x B) with support for various data types
- and architectural features specific to Hopper GPUs.
+ and architectural features specific to Hopper GPUs with persistent tile scheduling and warp specialization.

  :param acc_dtype: Data type for accumulation during computation
  :type acc_dtype: type[cutlass.Numeric]
@@ -118,24 +114,18 @@ class GemmSm90:

  Example:
  >>> gemm = GemmSm90(
- ... acc_dtype=cutlass.Float32,
+ ... acc_dtype=Float32,
  ... tile_shape_mn=(128, 256),
  ... cluster_shape_mnk=(1, 1, 1)
  ... )
  >>> gemm(a_tensor, b_tensor, c_tensor, stream)
  """

- bytes_per_tensormap = 128
+ arch = 90
+ num_epi_tensormaps: int = 0

- @dataclass
- class EpilogueArguments(ArgumentsBase):
- alpha: Optional[Float32 | cute.Tensor] = None
- beta: Optional[Float32 | cute.Tensor] = None
-
- @dataclass
- class EpilogueParams(ParamsBase):
- alpha: Optional[Float32 | cute.Tensor] = None
- beta: Optional[Float32 | cute.Tensor] = None
+ EpilogueArguments = ArgumentsBase
+ EpilogueParams = ParamsBase

  def __init__(
  self,
@@ -174,8 +164,8 @@ class GemmSm90:

  self.cluster_shape_mnk = cluster_shape_mnk
  # K dimension is deferred in _setup_attributes
- self.tile_shape_mnk = (*tile_shape_mn, 1)
- tile_M, tile_N = self.tile_shape_mnk[0], self.tile_shape_mnk[1]
+ self.cta_tile_shape_mnk = (*tile_shape_mn, 1)
+ tile_M, tile_N = self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[1]
  # check the cta tile shape
  if not self.pingpong:
  if tile_M not in [64, 128, 192, 256, 320]:
@@ -209,14 +199,18 @@ class GemmSm90:
  else:
  atom_layout_m, atom_layout_n = 1, 2
  else:
- atom_layout_m = self.tile_shape_mnk[0] // 64 if self.tile_shape_mnk[0] < 256 else 2
+ atom_layout_m = (
+ self.cta_tile_shape_mnk[0] // 64 if self.cta_tile_shape_mnk[0] < 256 else 2
+ )
  atom_layout_n = 1
  assert atom_layout_m in [1, 2, 3] and atom_layout_n in [1, 2]
  else:
  atom_layout_m, atom_layout_n = 1, 1
  self.atom_layout_mnk = (atom_layout_m, atom_layout_n, 1)

- self.num_mcast_ctas_a = self.cluster_shape_mnk[1] if not self.gather_A else 1
+ self.num_mcast_ctas_a = self.cluster_shape_mnk[1]
+ if self.gather_A:
+ assert self.num_mcast_ctas_a == 1
  self.num_mcast_ctas_b = self.cluster_shape_mnk[0]
  self.is_a_mcast = self.num_mcast_ctas_a > 1
  self.is_b_mcast = self.num_mcast_ctas_b > 1
@@ -229,16 +223,13 @@ class GemmSm90:
  self.num_threads_per_warp_group = 128
  self.threads_per_cta = (self.mma_warp_groups + 1) * self.num_threads_per_warp_group
  self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_90")
- self.num_epi_threads = (
- self.mma_warp_groups if not self.pingpong else 1
- ) * self.num_threads_per_warp_group
+ self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
  self.num_ab_load_warps = 1 if not self.gather_A else 4
- self.num_ab_load_threads = cute.arch.WARP_SIZE * self.num_ab_load_warps
- self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
  self.ab_load_warp_id = self.mma_warp_groups * 4
- self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
+ # self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
+ # self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps

- regs_per_thread = math.prod(self.tile_shape_mnk[:2]) // (
+ regs_per_thread = math.prod(self.cta_tile_shape_mnk[:2]) // (
  math.prod(self.atom_layout_mnk) * self.num_threads_per_warp_group
  )
  if self.fp8_slow_accum:
@@ -268,7 +259,7 @@ class GemmSm90:
  self.shared_storage = None
  self.buffer_align_bytes = 1024

- def _setup_attributes(self, epilogue_args: Optional[EpilogueArguments]):
+ def _setup_attributes(self, epilogue_args: EpilogueArguments):
  """Set up configurations that are dependent on GEMM inputs

  This method configures various attributes based on the input tensor properties
@@ -289,7 +280,7 @@ class GemmSm90:
  self.b_layout.sm90_mma_major_mode(),
  self.acc_dtype,
  self.atom_layout_mnk,
- tiler_mn=(64, self.tile_shape_mnk[1] // self.atom_layout_mnk[1]),
+ tiler_mn=(64, self.cta_tile_shape_mnk[1] // self.atom_layout_mnk[1]),
  )
  if const_expr(self.atom_layout_mnk[1] > 1):
  # If N dimension is split among 2 WGs, we need to permute the N dimension so
@@ -299,7 +290,7 @@ class GemmSm90:
  # WG1 would write to a separate epi smem of size (64, 16) that's far away.
  atom_n = self.atom_layout_mnk[1]
  permutation_n = cute.make_ordered_layout(
- (8, self.tile_shape_mnk[1] // atom_n // 8, atom_n), order=(0, 2, 1)
+ (8, self.cta_tile_shape_mnk[1] // atom_n // 8, atom_n), order=(0, 2, 1)
  )
  self.tiled_mma = cute.make_tiled_mma(
  cute.make_mma_atom(self.tiled_mma.op),
@@ -308,30 +299,30 @@ class GemmSm90:
  )
  mma_inst_shape_k = cute.size(self.tiled_mma.shape_mnk, mode=[2])
  mma_inst_tile_k = 4
- self.tile_shape_mnk = (
- self.tile_shape_mnk[0],
- self.tile_shape_mnk[1],
+ self.cta_tile_shape_mnk = (
+ self.cta_tile_shape_mnk[0],
+ self.cta_tile_shape_mnk[1],
  mma_inst_shape_k * mma_inst_tile_k,
  )

  self.cluster_layout_mnk = cute.make_layout(self.cluster_shape_mnk)

  self.epi_tile = self._sm90_compute_tile_shape_or_override(
- self.tile_shape_mnk,
+ self.cta_tile_shape_mnk,
  self.atom_layout_mnk,
  self.d_dtype,
  )

  # Compute stage before compute smem layout
  self.ab_stage, self.epi_stage, self.epi_c_stage = self._compute_stages(
- self.tile_shape_mnk,
+ self.cta_tile_shape_mnk,
  self.epi_tile,
  self.a_dtype,
  self.b_dtype,
  self.d_dtype,
  self.c_dtype,
  epilogue_args,
- self.smem_capacity,
+ cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"), # smem_capacity
  self.occupancy,
  # epi_smem will reuse smem ab if not persistent.
  overlap_sD_sA=not self.is_persistent,
@@ -344,7 +335,7 @@ class GemmSm90:
  self.epi_smem_layout_staged,
  self.epi_c_smem_layout_staged,
  ) = self._make_smem_layouts(
- self.tile_shape_mnk,
+ self.cta_tile_shape_mnk,
  self.epi_tile,
  self.a_dtype,
  self.a_layout,
@@ -366,10 +357,9 @@ class GemmSm90:
  mB: cute.Tensor,
  mD: Optional[cute.Tensor],
  mC: Optional[cute.Tensor],
- epilogue_args: Optional[ArgumentsBase],
+ epilogue_args: ArgumentsBase,
  scheduler_args: TileSchedulerOptions,
  varlen_args: Optional[VarlenArguments],
- mAIdx: Optional[cute.Tensor],
  stream: cuda.CUstream,
  ):
  """Execute the GEMM operation in steps:
@@ -405,7 +395,10 @@ class GemmSm90:
  raise TypeError(f"Type width mismatch: {self.a_dtype.width} != {self.b_dtype.width}")
  if const_expr(self.a_dtype.width != 16 and self.a_dtype.width != 8):
  raise TypeError("a_dtype should be float16 or float8")
- assert (mAIdx is not None) == self.gather_A
+
+ if const_expr(varlen_args is None):
+ varlen_args = VarlenArguments()
+ assert (varlen_args.mAIdx is not None) == self.gather_A

  # Assume all strides are divisible by 128 bits except the last stride
  new_stride = lambda t: tuple(
@@ -421,77 +414,48 @@ class GemmSm90:

  self._setup_attributes(epilogue_args)

+ a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, 0))
+ b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, 0))
+ tma_atom_a, tma_tensor_a = None, None
  if const_expr(not self.gather_A):
  tma_atom_a, tma_tensor_a = self._make_tma_atoms_and_tensors(
  mA,
- self.a_smem_layout_staged,
- (self.tile_shape_mnk[0], self.tile_shape_mnk[2]),
+ a_smem_layout,
+ (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
  self.cluster_shape_mnk[1],
  )
- else:
- tma_atom_a, tma_tensor_a = None, None
-
  tma_atom_b, tma_tensor_b = self._make_tma_atoms_and_tensors(
  mB,
- self.b_smem_layout_staged,
- (self.tile_shape_mnk[1], self.tile_shape_mnk[2]),
+ b_smem_layout,
+ (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
  self.cluster_shape_mnk[0],
  )

+ self.num_tma_load_bytes = cute.size_in_bytes(self.b_dtype, b_smem_layout)
+ if const_expr(not self.gather_A):
+ self.num_tma_load_bytes += cute.size_in_bytes(self.a_dtype, a_smem_layout)
+
+ tma_atom_d, tma_tensor_d = None, None
  if const_expr(mD is not None):
  tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
- mD, self.epi_smem_layout_staged, self.epi_tile, store_or_load="store"
+ mD,
+ self.epi_smem_layout_staged,
+ self.epi_tile,
+ op_type="store"
+ if not (hasattr(epilogue_args, "add_to_output") and epilogue_args.add_to_output)
+ else "add",
  )
- else:
- tma_atom_d, tma_tensor_d = None, None
-
+ tma_atom_c, tma_tensor_c = None, None
  if const_expr(mC is not None):
  tma_atom_c, tma_tensor_c = self._make_tma_epi_atoms_and_tensors(
- mC, self.epi_c_smem_layout_staged, self.epi_tile, store_or_load="load"
+ mC, self.epi_c_smem_layout_staged, self.epi_tile, op_type="load"
  )
- else:
- tma_atom_c, tma_tensor_c = None, None

  epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
+ varlen_params = VarlenManager.to_underlying_arguments(varlen_args)

- if const_expr(varlen_args is None):
- varlen_args = VarlenArguments()
- if const_expr(varlen_args.mCuSeqlensM is None):
- num_problems = (
- mD.shape[2]
- if mD is not None
- else (
- mB.shape[2]
- if varlen_args.mCuSeqlensK is None
- else varlen_args.mCuSeqlensK.shape[0] - 1
- )
- )
- problem_shape_ntile_mnl = (
- cute.ceil_div(mA.shape[0], self.tile_shape_mnk[0]),
- cute.ceil_div(mB.shape[0], self.tile_shape_mnk[1]),
- num_problems,
- )
- TileSchedulerCls = self.get_scheduler_class()
- tile_sched_args = self.get_scheduler_arguments(problem_shape_ntile_mnl, scheduler_args)
- else:
- assert mD is not None or not self.gather_A
- problem_shape_ntile_mnl = (
- None,
- cute.ceil_div(mB.shape[0], self.tile_shape_mnk[1]),
- varlen_args.mCuSeqlensM.shape[0] - 1,
- )
- TileSchedulerCls = VarlenMTileScheduler
- tile_sched_args = VarlenMTileSchedulerArguments(
- problem_shape_ntile_mnl=problem_shape_ntile_mnl,
- total_m=mD.shape[0] if mD is not None else mAIdx.shape[0],
- cu_seqlens_m=varlen_args.mCuSeqlensM,
- raster_order=scheduler_args.raster_order,
- group_size=scheduler_args.max_swizzle_size,
- tile_shape_mn=self.tile_shape_mnk[:2],
- cluster_shape_mnk=self.cluster_shape_mnk,
- tile_count_semaphore=scheduler_args.tile_count_semaphore,
- is_persistent=self.is_persistent,
- )
+ TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
+ tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
  tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
  grid = TileSchedulerCls.get_grid_shape(
  tile_sched_params, scheduler_args.max_active_clusters
@@ -507,7 +471,7 @@ class GemmSm90:
  ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
  epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
  sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
- tile_count: cute.struct.MemRange[cutlass.Int32, self.sched_stage]
+ tile_count: cute.struct.MemRange[Int32, self.sched_stage]
  sD: cute.struct.Align[
  cute.struct.MemRange[
  self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
@@ -534,6 +498,7 @@ class GemmSm90:

  # Launch the kernel synchronously
  self.kernel(
+ self.tiled_mma,
  tma_atom_a,
  tma_tensor_a if const_expr(not self.gather_A) else mA,
  tma_atom_b,
@@ -543,11 +508,7 @@ class GemmSm90:
  tma_atom_c,
  tma_tensor_c,
  epilogue_params,
- mAIdx,
- varlen_args.mCuSeqlensM,
- varlen_args.mCuSeqlensK,
- varlen_args.mTensormaps,
- self.tiled_mma,
+ varlen_params,
  self.cluster_layout_mnk,
  self.a_smem_layout_staged,
  self.b_smem_layout_staged,
@@ -559,7 +520,6 @@ class GemmSm90:
  grid=grid,
  block=[self.threads_per_cta, 1, 1],
  cluster=self.cluster_shape_mnk,
- smem=self.shared_storage.size_in_bytes(),
  stream=stream,
  min_blocks_per_mp=1,
  )
@@ -569,6 +529,7 @@ class GemmSm90:
  @cute.kernel
  def kernel(
  self,
+ tiled_mma: cute.TiledMma,
  tma_atom_a: Optional[cute.CopyAtom],
  mA_mkl: cute.Tensor,
  tma_atom_b: cute.CopyAtom,
@@ -578,11 +539,7 @@ class GemmSm90:
  tma_atom_c: Optional[cute.CopyAtom],
  mC_mnl: Optional[cute.Tensor],
  epilogue_params: ParamsBase,
- mAIdx: Optional[cute.Tensor],
- cu_seqlens_m: Optional[cute.Tensor],
- cu_seqlens_k: Optional[cute.Tensor],
- tensormaps: Optional[cute.Tensor],
- tiled_mma: cute.TiledMma,
+ varlen_params: VarlenManager.Params,
  cluster_layout_mnk: cute.Layout,
  a_smem_layout: cute.ComposedLayout,
  b_smem_layout: cute.ComposedLayout,
@@ -618,9 +575,11 @@ class GemmSm90:
  :type epi_smem_layout: cute.ComposedLayout
  """

- varlen_m = const_expr(cu_seqlens_m is not None)
- varlen_k = const_expr(cu_seqlens_k is not None)
+ varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
+ varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
  assert not (varlen_m and varlen_k)
+ if const_expr(self.gather_A):
+ assert varlen_m or varlen_k
  has_D = const_expr(mD_mnl is not None)
  has_C = const_expr(mC_mnl is not None)

@@ -641,8 +600,6 @@ class GemmSm90:
  storage = smem.allocate(self.shared_storage)

  ab_pipeline = self.make_ab_pipeline(
- a_smem_layout=cute.slice_(a_smem_layout, (None, None, 0)),
- b_smem_layout=cute.slice_(b_smem_layout, (None, None, 0)),
  tiled_mma=tiled_mma,
  cluster_layout_vmnk=cute.make_layout((1, *cluster_layout_mnk.shape)),
  ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
@@ -681,28 +638,20 @@ class GemmSm90:
  sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
  epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)

- # Get tensormap buffer address
- tensormap_manager = None
- tensormap_a_ptr, tensormap_b_ptr, tensormap_d_ptr = None, None, None
- if const_expr(varlen_m or varlen_k):
- tensormap_manager = TensorMapManagerSm90(
- cutlass.utils.TensorMapUpdateMode.GMEM, GemmSm90.bytes_per_tensormap
- )
- # equivalent to bidx + bidy * gridDim.x + bidxz * gridDim.x * gridDim.y
- tensormap_workspace_idx = cute.make_layout(cute.arch.grid_dim())(cute.arch.block_idx())
- if const_expr(varlen_m):
- tensormap_d_idx = warp_idx // 4 if const_expr(self.pingpong) else 0
- tensormap_d_ptr = tensormap_manager.get_tensormap_ptr(
- tensormaps[tensormap_workspace_idx, tensormap_d_idx, None].iterator
- )
- else:
- assert varlen_k
- tensormap_a_ptr = tensormap_manager.get_tensormap_ptr(
- tensormaps[tensormap_workspace_idx, 0, None].iterator
- )
- tensormap_b_ptr = tensormap_manager.get_tensormap_ptr(
- tensormaps[tensormap_workspace_idx, 1, None].iterator
- )
+ varlen_manager = VarlenManager.create(
+ varlen_params,
+ has_D,
+ self.num_epi_tensormaps,
+ # Only used if not varlen_m
+ len_m_static=Int32(
+ mA_mkl.shape[0]
+ if varlen_k or varlen_params.mAIdx is None
+ else varlen_params.mAIdx.shape[0]
+ ),
+ len_k_static=Int32(mA_mkl.shape[1]),
+ pingpong=self.pingpong,
+ warp_idx=warp_idx,
+ )

  TileSchedulerCls = partial(
  TileSchedulerCls.create, tile_sched_params, tile_count, sched_pipeline
@@ -715,28 +664,20 @@ class GemmSm90:
  and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
  ):
  is_tma_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
- if const_expr(varlen_k):
- # initialize tensormap for A & B
- tensormap_manager.init_tensormap_from_atom(
- tma_atom_a,
- tensormap_a_ptr,
- is_tma_warp,
- )
- tensormap_manager.init_tensormap_from_atom(
- tma_atom_b,
- tensormap_b_ptr,
- is_tma_warp,
- )
+ # initialize tensormap for A & B
+ varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
+ tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
+ tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
  # ///////////////////////////////////////////////////////////////////////////////
  # Get mcast mask
  # ///////////////////////////////////////////////////////////////////////////////
  cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
- cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
+ block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
  a_mcast_mask = cute.make_layout_image_mask(
- cluster_layout_mnk, cluster_coord_mnk, mode=1
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=1
  )
  b_mcast_mask = cute.make_layout_image_mask(
- cluster_layout_mnk, cluster_coord_mnk, mode=0
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=0
  )
  a_mcast_mask = a_mcast_mask if self.is_a_mcast else 0
  b_mcast_mask = b_mcast_mask if self.is_b_mcast else 0
@@ -752,177 +693,129 @@ class GemmSm90:
  )
  if const_expr(varlen_k):
  # wait tensormap initialization complete before update
- tensormap_manager.fence_tensormap_initialization()
- # batch index of last tile
- last_batch_idx = cutlass.Int32(-1)
+ varlen_manager.fence_tensormap_init()
  while work_tile.is_valid_tile:
  tile_coord_mnkl = work_tile.tile_idx
  batch_idx = tile_coord_mnkl[3]
- if const_expr(varlen_k):
- is_group_changed = batch_idx != last_batch_idx
- last_batch_idx = batch_idx
- if is_group_changed:
- # construct tensor A/B based on real address, shape and stride information
- tensormap_manager.update_tensormap_shape(
- (tensormap_a_ptr, tensormap_b_ptr),
- is_manager_warp=is_tma_warp,
- shapes=(cu_seqlens_k[batch_idx + 1], cu_seqlens_k[batch_idx + 1]),
- orders=(
- 0 if const_expr(self.a_layout == LayoutEnum.ROW_MAJOR) else 1,
- 0 if const_expr(self.b_layout == LayoutEnum.ROW_MAJOR) else 1,
- ),
- tensormap_smem_ptr=None,
- )
+ varlen_manager.update_tensormap_AB(
+ batch_idx,
+ self.a_layout,
+ self.b_layout,
+ is_tma_warp,
+ )
  # ///////////////////////////////////////////////////////////////////////////
  # Local_tile partition global tensors
  # ///////////////////////////////////////////////////////////////////////////
  if const_expr(not self.gather_A):
- if const_expr(varlen_m):
- mA_mk = cute.domain_offset((cu_seqlens_m[batch_idx], 0), mA_mkl)
- elif const_expr(varlen_k):
- mA_mk = cute.domain_offset((0, cu_seqlens_k[batch_idx]), mA_mkl)
- else:
- mA_mk = mA_mkl[None, None, batch_idx]
+ mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
  # (bM, bK, RestK)
- gA_k = cute.local_tile(
+ gA_mk = cute.local_tile(
  mA_mk,
- cute.select(self.tile_shape_mnk, [0, 2]),
+ cute.select(self.cta_tile_shape_mnk, [0, 2]),
  (tile_coord_mnkl[0], None),
  )
  else:
- mA_mk = mA_mkl
+ mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
  if const_expr(varlen_m):
- mAIdx_mk = cute.domain_offset((cu_seqlens_m[batch_idx],), mAIdx)
- elif const_expr(varlen_k):
- mAIdx_mk = cute.domain_offset((cu_seqlens_k[batch_idx],), mAIdx)
+ gAIdx = cute.local_tile(
+ mAIdx_mk, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0],)
+ )
+ # (M, K)
+ mA_mk = mA_mkl
  else:
- mAIdx_mk = mAIdx[None, batch_idx]
- gAIdx = cute.local_tile(
- mAIdx_mk, (self.tile_shape_mnk[0],), (tile_coord_mnkl[0],)
- )
- if const_expr(varlen_k):
- mB_nk = cute.domain_offset((0, cu_seqlens_k[batch_idx]), mB_nkl)
- else:
- mB_nk = mB_nkl[None, None, batch_idx]
+ assert varlen_k
+ # (tile_K, RestK)
+ gAIdx = cute.flat_divide(mAIdx_mk, (self.cta_tile_shape_mnk[2],))
+ # (tile_M, K)
+ mA_mk = cute.local_tile(
+ mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
+ )
  # (bN, bK, RestK)
- gB_k = cute.local_tile(
- mB_nk, cute.select(self.tile_shape_mnk, [1, 2]), (tile_coord_mnkl[1], None)
+ gB_nk = cute.local_tile(
+ varlen_manager.offset_batch_B(mB_nkl, batch_idx),
+ cute.select(self.cta_tile_shape_mnk, [1, 2]),
+ (tile_coord_mnkl[1], None),
  )
  # //////////////////////////////////////////////////////////////////////////
  # Partition shared tensor for TMA load A/B
  # //////////////////////////////////////////////////////////////////////////
- if const_expr(varlen_k):
- # ensure the update to tensormap has completed before using it
- if is_group_changed and is_tma_warp:
- tensormap_manager.fence_tensormap_update(tensormap_a_ptr)
- tensormap_manager.fence_tensormap_update(tensormap_b_ptr)
- tma_desc_a_ptr = tensormap_manager.get_tensormap_ptr(
- tensormap_a_ptr, cute.AddressSpace.generic
- )
- tma_desc_b_ptr = tensormap_manager.get_tensormap_ptr(
- tensormap_b_ptr, cute.AddressSpace.generic
- )
- else:
- tma_desc_a_ptr, tma_desc_b_ptr = None, None
+ varlen_manager.fence_tensormap_update_AB(is_tma_warp)
+ len_m = varlen_manager.len_m(batch_idx)
+ len_k = varlen_manager.len_k(batch_idx)
  # TMA load A partition_S/D
- a_cta_layout = cute.make_layout(
- cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
- )
- a_cta_crd = cluster_coord_mnk[1]
+ copy_A = None
  if const_expr(not self.gather_A):
- # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
- tAsA, tAgA_k = cpasync.tma_partition(
- tma_atom_a,
- a_cta_crd,
- a_cta_layout,
- cute.group_modes(sA, 0, 2),
- cute.group_modes(gA_k, 0, 2),
- )
- copy_A = partial(
- cute.copy,
+ copy_A, _, _ = copy_utils.tma_get_copy_fn(
  tma_atom_a,
+ cta_coord=block_in_cluster_coord_mnk[1],
+ cta_layout=cute.make_layout(
+ cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
+ ),
+ src_tensor=gA_mk,
+ dst_tensor=sA,
  mcast_mask=a_mcast_mask,
  tma_desc_ptr=tma_desc_a_ptr,
  )
  else:
  tiled_copy_A = self._make_gmem_tiled_copy_A(
- mA_mkl.element_type, self.a_layout, self.num_ab_load_threads
+ mA_mkl.element_type, self.a_layout, self.num_ab_load_warps * 32
  )
  tidx = (
- cute.arch.thread_idx()[0]
- - self.mma_warp_groups * self.num_threads_per_warp_group
+ cute.arch.thread_idx()[0] - cute.arch.WARP_SIZE * self.ab_load_warp_id
  )
  thr_copy_A = tiled_copy_A.get_slice(tidx)
- # (atom_v, CPY_M, 1, STAGE)
- tAsA = thr_copy_A.partition_D(sA)
- assert tAsA.shape[2] == 1
- tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
- copy_A = partial(cute.copy, tiled_copy_A)
+ copy_A, prefetch_A = None, None
+ if const_expr(varlen_m):
+ copy_A = copy_utils.gather_m_get_copy_fn(
+ thr_copy_A,
+ mA_mk,
+ sA,
+ gAIdx,
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+ limit_k=len_k,
+ )
+ else:
+ copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
+ thr_copy_A,
+ mA_mk,
+ sA,
+ gAIdx,
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+ limit_k=len_k,
+ )
  # TMA load B partition_S/D
- b_cta_layout = cute.make_layout(
- cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
- )
- b_cta_crd = cluster_coord_mnk[0]
- # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
- tBsB, tBgB_k = cpasync.tma_partition(
+ copy_B, _, _ = copy_utils.tma_get_copy_fn(
  tma_atom_b,
- b_cta_crd,
- b_cta_layout,
- cute.group_modes(sB, 0, 2),
- cute.group_modes(gB_k, 0, 2),
- )
- copy_B = partial(
- cute.copy, tma_atom_b, mcast_mask=b_mcast_mask, tma_desc_ptr=tma_desc_b_ptr
+ cta_coord=block_in_cluster_coord_mnk[0],
+ cta_layout=cute.make_layout(
+ cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
+ ),
+ src_tensor=gB_nk,
+ dst_tensor=sB,
+ mcast_mask=b_mcast_mask,
+ tma_desc_ptr=tma_desc_b_ptr,
  )
- k_len = (
- cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
- if const_expr(varlen_k)
- else mA_mkl.shape[1]
- )
- k_tile_cnt = cute.ceil_div(k_len, self.tile_shape_mnk[2])
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
  if const_expr(not self.gather_A):
  ab_producer_state = self.load_AB(
- ab_pipeline,
- ab_producer_state,
- copy_A,
- tAgA_k,
- tAsA,
- copy_B,
- tBgB_k,
- tBsB,
- k_tile_cnt,
+ ab_pipeline, ab_producer_state, copy_A, copy_B, k_tile_cnt
  )
  else:
- limit_m = (
- mAIdx.shape[0]
- if const_expr(cu_seqlens_m is None)
- else cu_seqlens_m[batch_idx + 1] - cu_seqlens_m[batch_idx]
- )
  ab_producer_state = self.load_AB_gather_A(
  ab_pipeline,
  ab_producer_state,
- thr_copy_A,
- mA_mk,
- tAsA,
- gAIdx,
+ copy_A,
+ prefetch_A,
  copy_B,
- tBgB_k,
- tBsB,
  k_tile_cnt,
- limit_A=(
- limit_m - tile_coord_mnkl[0] * self.tile_shape_mnk[0],
- mA_mk.shape[1],
- ),
+ varlen_m=varlen_m,
  )
  tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
- tile_scheduler.broadcast_next_work(is_scheduler_warp=is_scheduler_warp)
  tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
  work_tile = tile_scheduler.get_current_work()
  # End of persistent scheduler loop
  if const_expr(self.pingpong and not varlen_k):
  # Need to write the tile_idx to smem for the next WG in the pingpong mode
- # tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
- tile_scheduler.broadcast_next_work(is_scheduler_warp=is_scheduler_warp)
  tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
  ab_pipeline.producer_tail(ab_producer_state)
  if is_scheduler_warp:
@@ -934,13 +827,11 @@ class GemmSm90:
  (not self.pingpong and warp_idx == 0)
  or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
  )
- if const_expr(varlen_m):
- # initialize tensormap for D
- tensormap_manager.init_tensormap_from_atom(
- tma_atom_d,
- tensormap_d_ptr,
- is_manager_warp=is_tma_warp,
- )
+ varlen_manager.init_tensormap_epi(
+ tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
+ )
+ tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
+ tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
  # //////////////////////////////////////////////////////////////////////////////
  # Partition global tensor for TiledMMA_A/B/C
  # //////////////////////////////////////////////////////////////////////////////
@@ -962,7 +853,9 @@ class GemmSm90:
  tCrA = tiled_mma.make_fragment_A(thr_mma.partition_A(sA))
  tCrB = tiled_mma.make_fragment_B(thr_mma.partition_B(sB))

- acc_shape = tiled_mma.partition_shape_C(cute.select(self.tile_shape_mnk, mode=[0, 1]))
+ acc_shape = tiled_mma.partition_shape_C(
+ cute.select(self.cta_tile_shape_mnk, mode=[0, 1])
+ )
  acc = cute.make_fragment(acc_shape, self.acc_dtype)
  acc_slow = None
  if const_expr(self.fp8_slow_accum):
@@ -974,10 +867,11 @@ class GemmSm90:
  self.pingpong_barrier_arrive(warp_group_idx=0, stage="mma")
  self.pingpong_barrier_arrive(warp_group_idx=0, stage="epi")

- k_tile_cnt_static = cute.ceil_div(mA_mkl.shape[1], self.tile_shape_mnk[2])
- c_tile_cnt = cute.size(cute.ceil_div(self.tile_shape_mnk[:2], self.epi_tile))
+ k_tile_cnt_static = cute.ceil_div(mA_mkl.shape[1], self.cta_tile_shape_mnk[2])
+ c_tile_cnt = cute.size(cute.ceil_div(self.cta_tile_shape_mnk[:2], self.epi_tile))

  ab_read_state = make_pipeline_state(pipeline.PipelineUserType.Consumer, self.ab_stage)
+ epi_store_pipeline = self.make_epi_store_pipeline()
  epi_read_state = make_pipeline_state(
  pipeline.PipelineUserType.Consumer, self.epi_c_stage
  )
@@ -996,9 +890,8 @@ class GemmSm90:
  if const_expr(not varlen_k):
  ab_read_state.advance_iters(k_tile_cnt_static)
  else:
- batch_idx = work_tile.tile_idx[3]
- k_len = cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
- k_tile_cnt = cute.ceil_div(k_len, self.tile_shape_mnk[2])
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
  ab_read_state.advance_iters(k_tile_cnt)
  tile_scheduler.advance_to_next_work()
  if const_expr(varlen_k):
@@ -1009,31 +902,22 @@ class GemmSm90:
  work_tile = tile_scheduler.initial_work_tile_info()
  if const_expr(varlen_m):
  # wait tensormap initialization complete before update
- tensormap_manager.fence_tensormap_initialization()
- # batch index of last tile
- last_batch_idx = cutlass.Int32(-1)
+ varlen_manager.fence_tensormap_init()
  while work_tile.is_valid_tile:
  tile_coord_mnkl = work_tile.tile_idx
  batch_idx = tile_coord_mnkl[3]
- if const_expr(varlen_m):
- is_group_changed = batch_idx != last_batch_idx
- last_batch_idx = batch_idx
- if is_group_changed:
- # construct tensor D based on real address, shape and stride information
- tensormap_manager.update_tensormap_shape(
- (tensormap_d_ptr,),
- is_manager_warp=is_tma_warp,
- shapes=(cu_seqlens_m[batch_idx + 1],),
- orders=(0 if const_expr(self.d_layout.is_m_major_c()) else 1,),
- tensormap_smem_ptr=None,
- )
-
- k_len = (
- cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
- if const_expr(varlen_k)
- else mA_mkl.shape[1]
+ epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
+ epilogue_params, varlen_params.cu_seqlens_m, batch_idx
+ )
+ varlen_manager.update_tensormap_epi(
+ batch_idx,
+ self.d_layout,
+ epi_shapes,
+ epi_orders,
+ is_tma_warp,
  )
- k_tile_cnt = cute.ceil_div(k_len, self.tile_shape_mnk[2])
+ len_k = varlen_manager.len_k(batch_idx)
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
  ab_read_state, tiled_mma = self.mma(
  ab_pipeline,
  ab_read_state,
@@ -1056,51 +940,42 @@ class GemmSm90:
  self.pingpong_barrier_sync(warp_group_idx, "epi")

  epilogue_barrier = pipeline.NamedBarrier(
- barrier_id=int(NamedBarrierGemm.Epilogue), num_threads=self.num_epi_threads
+ barrier_id=int(NamedBarrierGemm.Epilogue),
+ num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
  )

- if const_expr(varlen_m):
- # ensure the update to tensormap has completed before using it
- if is_group_changed and is_tma_warp:
- tensormap_manager.fence_tensormap_update(tensormap_d_ptr)
- tma_desc_d_ptr = tensormap_manager.get_tensormap_ptr(
- tensormap_d_ptr, cute.AddressSpace.generic
- )
- else:
- tma_desc_d_ptr = None
+ varlen_manager.fence_tensormap_update_epi(is_tma_warp)

+ copy_D = None
  if const_expr(has_D):
- bSG_sD, bSG_gD = self.epilog_gmem_copy_and_partition(
+ copy_D, _, _ = self.epilog_gmem_copy_and_partition(
  tma_atom_d,
- mD_mnl,
- self.tile_shape_mnk[:2],
+ varlen_manager.offset_batch_epi(mD_mnl, batch_idx),
+ self.cta_tile_shape_mnk[:2],
  self.epi_tile,
  sD,
  tile_coord_mnkl,
- cu_seqlens_m,
+ tma_desc_ptr=tma_desc_d_ptr,
  )
- copy_D = partial(cute.copy, tma_atom_d, tma_desc_ptr=tma_desc_d_ptr)
- else:
- bSG_sD, bSG_gD, copy_D = None, None, None
+ copy_C = None
  if const_expr(has_C):
- bGS_sC, bGS_gC = self.epilog_gmem_copy_and_partition(
+ copy_C_fn, _, _ = self.epilog_gmem_copy_and_partition(
  tma_atom_c,
- mC_mnl,
- self.tile_shape_mnk[:2],
+ varlen_manager.offset_batch_epi(mC_mnl, batch_idx),
+ self.cta_tile_shape_mnk[:2],
  self.epi_tile,
  sC,
  tile_coord_mnkl,
- cu_seqlens_m,
  )
- copy_C = partial(cute.copy, tma_atom_c)
- epi_load_g2s = partial(self.epi_load_g2s, epi_pipeline, copy_C, bGS_gC, bGS_sC)
- else:
- epi_load_g2s = None
+ copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)

  d_dtype_for_layout = self.d_dtype if self.d_dtype is not None else cutlass.BFloat16
- tiled_copy_r2s, tRS_rAcc, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
- tiled_mma, self.d_layout, d_dtype_for_layout, acc, sD, tidx
+ tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
+ tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
  )
+ # (R2S, R2S_M, R2S_N)
+ tRS_rAcc = tiled_copy_r2s.retile(acc)
+ load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
  if const_expr(has_C):
  tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
  tiled_mma, self.c_layout, self.c_dtype, sC, tRS_rD.layout, tidx
@@ -1118,24 +993,25 @@ class GemmSm90:
  epi_read_state, epi_producer_state = self.epilogue(
  epilogue_params,
  epi_smem_tensors,
+ tma_desc_epi_ptrs,
  epi_pipeline,
+ epi_store_pipeline,
  epi_read_state,
  epi_producer_state,
- tiled_mma,
- tRS_rAcc,
+ self.epi_tile,
+ load_acc_subtile,
  tRS_rD,
  tRS_rC,
+ None, # tiled_copy_t2r, for Sm100 only
  tiled_copy_r2s,
  tRS_sD,
  tiled_copy_s2r,
  tSR_rC,
  tSR_sC,
  copy_D,
- bSG_sD,
- bSG_gD,
- epi_load_g2s,
+ copy_C,
  tile_coord_mnkl,
- cu_seqlens_m,
+ varlen_manager,
  epilogue_barrier,
  tile_scheduler,
  tidx,
@@ -1147,7 +1023,7 @@ class GemmSm90:
  # so we have to make sure the smem content is done reading before signaling
  # the next WG's epilogue.
  if is_tma_warp:
- cute.arch.cp_async_bulk_wait_group(0, read=True)
+ epi_store_pipeline.producer_tail()
  self.pingpong_barrier_arrive(1 - warp_group_idx, stage="epi")

  if const_expr(not self.pingpong):
@@ -1166,31 +1042,33 @@ class GemmSm90:
  tile_scheduler.advance_to_next_work()
  work_tile = tile_scheduler.get_current_work()
  if work_tile.is_valid_tile:
- batch_idx = work_tile.tile_idx[3]
- k_len = cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
- k_tile_cnt = cute.ceil_div(k_len, self.tile_shape_mnk[2])
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
  ab_read_state.advance_iters(k_tile_cnt)
  tile_scheduler.advance_to_next_work()
  work_tile = tile_scheduler.get_current_work()
  # End of persistent scheduler loop

+ # Wait for D store complete
  if const_expr(not self.pingpong):
  if is_tma_warp:
- cute.arch.cp_async_bulk_wait_group(0, read=True)
+ epi_store_pipeline.producer_tail()

  @cute.jit
  def load_AB(
  self,
  ab_pipeline: cutlass.pipeline.PipelineAsync,
  ab_producer_state: cutlass.pipeline.PipelineState,
- copy_A: Callable,
- tAgA: cute.Tensor,
- tAsA: cute.Tensor,
+ copy_A: Optional[Callable],
  copy_B: Callable,
- tBgB: cute.Tensor,
- tBsB: cute.Tensor,
  k_tile_cnt: Int32,
+ # These are for Sm100 blockscaled gemm
+ copy_SFA: Optional[Callable] = None,
+ copy_SFB: Optional[Callable] = None,
  ) -> cutlass.pipeline.PipelineState:
+ blockscaled = const_expr(copy_SFA is not None)
+ if const_expr(blockscaled):
+ assert copy_SFB is not None
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
  peek_ab_empty_status = Boolean(True)
  if 0 < k_tile_cnt:
@@ -1203,8 +1081,13 @@ class GemmSm90:
  # Also sets the transaction barrier for the A/B buffers
  ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status)
  tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
- copy_A(tAgA[None, k_tile], tAsA[None, ab_producer_state.index], tma_bar_ptr=tma_bar_ptr)
- copy_B(tBgB[None, k_tile], tBsB[None, ab_producer_state.index], tma_bar_ptr=tma_bar_ptr)
+ smem_idx = ab_producer_state.index
+ if const_expr(copy_A is not None):
+ copy_A(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
+ if const_expr(blockscaled):
+ copy_SFA(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
+ copy_SFB(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
  # Mainloop pipeline's producer commit is a NOP
  ab_pipeline.producer_commit(ab_producer_state)
  ab_producer_state.advance()
@@ -1218,38 +1101,12 @@ class GemmSm90:
  self,
  ab_pipeline: cutlass.pipeline.PipelineAsync,
  ab_producer_state: cutlass.pipeline.PipelineState,
- thr_copy_A: cute.core.ThrCopy,
- mA: cute.Tensor,
- tAsA: cute.Tensor,
- gAIdx: cute.Tensor,
+ copy_A: Callable,
+ prefetch_A: Optional[Callable],
  copy_B: Callable,
- tBgB: cute.Tensor,
- tBsB: cute.Tensor,
  k_tile_cnt: Int32,
- limit_A: Tuple[Int32, Int32],
+ varlen_m: bool = True,
  ) -> cutlass.pipeline.PipelineState:
- # (atom_v, CPY_M, 1, RestK)
- limit_m, limit_k = limit_A
- limit_m = min(limit_m, self.tile_shape_mnk[0]) # To avoid writing beyond smem limit
- cA = cute.make_identity_tensor(cute.select(self.tile_shape_mnk, [0, 2]))
- tAcA = thr_copy_A.partition_S(cA)
- t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
- # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
- # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
- # This is so that when we do the comparison, t0AcA is known at compile time.
- limit_m = limit_m - tAcA[0][0]
- # Read indices for A
- rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
- m_idx = cute.make_fragment(rows_per_thread, Int32)
- for m in cutlass.range(rows_per_thread):
- row_idx = tAcA[0, m, 0][0]
- if t0AcA[0, m, 0][0] < limit_m:
- m_idx[m] = gAIdx[row_idx]
- else:
- m_idx[m] = -1
- elems_per_load = cute.size(tAsA.shape[0][0])
- # (m, (bK, RestK))
- mA_k = cute.logical_divide(mA, (None, self.tile_shape_mnk[2]))
  warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
  peek_ab_empty_status = Boolean(True)
@@ -1258,35 +1115,27 @@ class GemmSm90:
  # /////////////////////////////////////////////////////////////////////////
  # TMA load on B and cp.async on A
  # /////////////////////////////////////////////////////////////////////////
- copy_A = partial(cute.copy, thr_copy_A)
  for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
+ prefetch_out = ()
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
+ prefetch_out = (prefetch_A(k_tile),)
  # Wait for A/B buffers to be empty before loading into them
  # Also sets the transaction barrier for the A/B buffers
- ab_pipeline.producer_acquire(
- ab_producer_state,
- peek_ab_empty_status,
- # A tiny bit faster to rotate the warp that does TMA
- is_tma_warp=warp_idx == self.ab_load_warp_id + (k_tile % self.num_ab_load_warps),
+ # A tiny bit faster to rotate the warp that does TMA
+ # However, for varlen_k, we must use the warp_idx == self.ab_load_warp_id
+ # since that's the warp that does the tensormap update.
+ is_tma_warp = warp_idx == self.ab_load_warp_id + (
+ (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
  )
- # A bit faster to load B first while we calculate the predicate for A
- if warp_idx == self.ab_load_warp_id + (k_tile % self.num_ab_load_warps):
- copy_B(
- tBgB[None, k_tile],
- tBsB[None, ab_producer_state.index],
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
- )
- # (m, bK)
- mA_cur = mA_k[None, (None, k_tile)]
- for m in cutlass.range_constexpr(tAcA.shape[1]):
- # (elems_per_load, thread_per_row)
- mA_row = cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,))
- if t0AcA[0, m, 0][0] < limit_m:
- # There's only 1 load per row
- assert cute.size(tAcA.shape, mode=[2]) == 1
- ki = tAcA[0, 0, 0][1] // elems_per_load
- copy_A(mA_row[None, ki], tAsA[(None, m), ab_producer_state.index])
+ ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
+ smem_idx = ab_producer_state.index
+ # A bit faster to load B first while we calculate the indices for A
+ if is_tma_warp:
+ tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
+ copy_A(k_tile, smem_idx, *prefetch_out)
  # This tells mbarrier to track the completion of cp.async
- ab_pipeline.producer_commit(ab_producer_state)
+ ab_pipeline.producer_cpasync_commit(ab_producer_state)
  ab_producer_state.advance()
  peek_ab_empty_status = Boolean(True)
  if k_tile + 1 < k_tile_cnt:
@@ -1294,33 +1143,19 @@ class GemmSm90:
  # bound checking in the K dimension on the last k_tile
  if 0 < k_tile_cnt:
  k_tile = k_tile_cnt - 1
- ab_pipeline.producer_acquire(
- ab_producer_state,
- peek_ab_empty_status,
- is_tma_warp=warp_idx == self.ab_load_warp_id + (k_tile % self.num_ab_load_warps),
+ prefetch_out = ()
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
+ prefetch_out = (prefetch_A(k_tile, pred=True),)
+ is_tma_warp = warp_idx == self.ab_load_warp_id + (
+ (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
  )
- if warp_idx == self.ab_load_warp_id + (k_tile % self.num_ab_load_warps):
- copy_B(
- tBgB[None, k_tile],
- tBsB[None, ab_producer_state.index],
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
- )
- assert tAcA.shape[2] == 1 # there's only 1 load along the K dimension
- tApA = cute.make_fragment(1, Boolean)
- tApA[0] = tAcA[0, 0, 0][1] < limit_k
- # (m, bK)
- mA_cur = mA_k[None, (None, k_tile)]
- for m in cutlass.range_constexpr(tAcA.shape[1]):
- # (elems_per_load, thread_per_row)
- mA_row = cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,))
- if t0AcA[0, m, 0][0] < limit_m:
- # There's only 1 load per row
- assert cute.size(tAcA.shape, mode=[2]) == 1
- ki = tAcA[0, 0, 0][1] // elems_per_load
- # copy_A(mA_row[None, ki], tAsA[(None, m), ab_producer_state.index], pred=tApA)
- # TODO
- copy_A(mA_row[None, ki], tAsA[(None, m), ab_producer_state.index])
- ab_pipeline.producer_commit(ab_producer_state)
+ ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
+ smem_idx = ab_producer_state.index
+ if is_tma_warp:
+ tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
+ copy_A(k_tile, smem_idx, *prefetch_out, pred=True)
+ ab_pipeline.producer_cpasync_commit(ab_producer_state)
  ab_producer_state.advance()
  return ab_producer_state

@@ -1416,24 +1251,25 @@ class GemmSm90:
  self,
  params: EpilogueParams,
  epi_smem_tensors: Tuple[cute.Tensor, ...],
+ tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
  epi_pipeline: cutlass.pipeline.PipelineAsync,
+ epi_store_pipeline: cutlass.pipeline.PipelineAsync,
  epi_read_state: cutlass.pipeline.PipelineState,
- epi_producer_state: cutlass.pipeline.PipelineState,
- tiled_mma: cute.TiledMma,
- tRS_rAcc: cute.Tensor,
+ epi_producer_state: Optional[cutlass.pipeline.PipelineState],
+ epi_tile: cute.Tile,
+ load_acc_subtile: Callable,
  tRS_rD: cute.Tensor,
  tRS_rC: Optional[cute.Tensor],
- tiled_copy_r2s: cute.core.ThrCopy,
+ tiled_copy_t2r: Optional[cute.TiledCopy], # Only for Sm100
+ tiled_copy_r2s: cute.TiledCopy,
  tRS_sD: cute.Tensor,
- tiled_copy_s2r: Optional[cute.core.ThrCopy],
+ tiled_copy_s2r: Optional[cute.ThrCopy],
  tSR_rC: Optional[cute.Tensor],
  tSR_sC: Optional[cute.Tensor],
  copy_D: Optional[Callable],
- bSG_sD: cute.Tensor,
- bSG_gD: cute.Tensor,
- epi_load_g2s: Optional[Callable],
+ copy_C: Optional[Callable],
  tile_coord_mnkl: cute.Coord,
- cu_seqlens_m: Optional[cute.Tensor],
+ varlen_manager: VarlenManager,
  epilogue_barrier: cutlass.pipeline.NamedBarrier,
  tile_scheduler,
  tidx: Int32,
@@ -1441,22 +1277,61 @@ class GemmSm90:
  ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
  has_C = const_expr(tRS_rC is not None)
  has_D = const_expr(copy_D is not None)
- # We iterate over epi tiles in the N dimension first before the M dimension
  epi_tile_shape = cute.zipped_divide(
- cute.make_layout(self.tile_shape_mnk[:2]), self.epi_tile
+ cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
  ).shape[1]
- epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
+ # We iterate over epi tiles in the N dimension first before the M dimension
+ epi_tile_layout = cute.make_ordered_layout(epi_tile_shape, order=(1, 0))
  epi_tile_num = cute.size(epi_tile_shape)
  num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num

- if const_expr(epi_load_g2s is not None):
+ epi_tensors = self.epi_begin(
+ params,
+ epi_smem_tensors,
+ epi_tile,
+ tiled_copy_t2r,
+ tiled_copy_r2s,
+ tile_coord_mnkl,
+ varlen_manager,
+ epilogue_barrier,
+ tidx,
+ )
+
+ if const_expr(copy_C is not None):
  for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
- epi_producer_state = epi_load_g2s(epi_producer_state, epi_idx, is_tma_warp)
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
+ if is_tma_warp:
+ epi_pipeline.producer_acquire(epi_producer_state)
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
+ epi_pipeline.producer_commit(epi_producer_state)
+ epi_producer_state.advance()
+
+ def tma_store_fn(src_idx, dst_idx):
+ # Fence and barrier to make sure shared memory store is visible to TMA store
+ cute.arch.fence_proxy(
+ cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+ )
+ epilogue_barrier.arrive_and_wait()
+ # Copy from shared memory to global memory
+ if is_tma_warp:
+ if const_expr(has_D):
+ copy_D(src_idx=src_idx, dst_idx=dst_idx)
+ # Can't use if statement here, epi_store_pipeline object isn't captured somehow
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
+ epilogue_barrier.arrive_and_wait()
+
+ # We could delay the TMA store by 1 epi tile to better overlap the non-TMA ops
+ # with the TMA store. However, currently this doesn't seem to improve perf.
+ delay_tma_store = False

+ src_idx_prev, dst_idx_prev = None, None
  for epi_idx in cutlass.range_constexpr(epi_tile_num):
+ # The global memory coordinate for the current epi tile
+ gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
  # Copy from acc to D registers
- for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
- tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
+ load_acc_subtile(tRS_rD, epi_idx)
+ epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
  if const_expr(has_C):
  epi_pipeline.consumer_wait(epi_read_state)
  cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
@@ -1468,98 +1343,132 @@ class GemmSm90:
  with cute.arch.elect_one():
  epi_pipeline.consumer_release(epi_read_state)
  epi_read_state.advance()
- if const_expr(epi_load_g2s is not None and epi_idx + self.epi_c_stage < epi_tile_num):
- epi_producer_state = epi_load_g2s(
- epi_producer_state, epi_idx + self.epi_c_stage, is_tma_warp
- )
- tRS_rEpi = self.epi_visit_acc_subtile(params, tRS_rD, tRS_rC)
+ if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
+ if is_tma_warp:
+ epi_pipeline.producer_acquire(epi_producer_state)
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
+ epi_pipeline.producer_commit(epi_producer_state)
+ epi_producer_state.advance()
+ tRS_rEpi = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
  epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
+ if const_expr(delay_tma_store):
+ if const_expr(epi_idx > 0):
+ tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
+ src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
  # Copy from D registers to shared memory
  if const_expr(has_D):
- # Type conversion
- tRS_rD_out = cute.make_fragment_like(tRS_rD, self.d_dtype)
- tRS_rD_out.store(tRS_rD.load().to(self.d_dtype))
- cute.copy(tiled_copy_r2s, tRS_rD_out, tRS_sD[None, None, None, epi_buffer])
- # Fence and barrier to make sure shared memory store is visible to TMA store
- cute.arch.fence_proxy(
- cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
- )
- epilogue_barrier.arrive_and_wait()
- # Get the global memory coordinate for the current epi tile
- gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
- # Copy from shared memory to global memory
- if is_tma_warp:
- if const_expr(has_D):
- copy_D(bSG_sD[None, epi_buffer], bSG_gD[None, gmem_coord])
- cute.arch.cp_async_bulk_commit_group()
- cute.arch.cp_async_bulk_wait_group(self.epi_stage - 1, read=True)
- epilogue_barrier.arrive_and_wait()
+ copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
+ if const_expr(not delay_tma_store):
+ tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)
+
+ if const_expr(delay_tma_store):
+ tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
+
+ self.epi_end(
+ params,
+ epi_tensors,
+ epi_tile,
+ tiled_copy_t2r,
+ tiled_copy_r2s,
+ tile_coord_mnkl,
+ varlen_manager,
+ tidx,
+ )

  return epi_read_state, epi_producer_state

- @cute.jit
- def epi_load_g2s(
+ def get_scheduler_class(self, varlen_m: bool = False):
+ """Return the scheduler class to use. Override in subclasses for custom schedulers."""
+ return TileScheduler if not varlen_m else VarlenMTileScheduler
+
+ def get_scheduler_arguments(
  self,
- epi_pipeline: cutlass.pipeline.PipelineAsync,
- copy_C: Callable,
- bGS_gC: cute.Tensor,
- bGS_sC: cute.Tensor,
- epi_producer_state: cutlass.pipeline.PipelineState,
- epi_idx: Int32,
- should_load: Boolean,
- ) -> cutlass.pipeline.PipelineState:
- # We iterate over epi tiles in the N dimension first before the M dimension
- epi_tile_layout = cute.make_layout(bGS_gC.shape[1], stride=(bGS_gC.shape[1][1], 1))
- if should_load:
- epi_pipeline.producer_acquire(epi_producer_state)
- # Get the global memory coordinate for the current epi tile
- gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
- copy_C(
- bGS_gC[None, gmem_coord],
- bGS_sC[None, epi_producer_state.index],
- tma_bar_ptr=epi_pipeline.producer_get_barrier(epi_producer_state),
+ mA: cute.Tensor,
+ mB: cute.Tensor,
+ mD: Optional[cute.Tensor],
+ scheduler_args,
+ varlen_args,
+ ):
+ """Create scheduler arguments. Override in subclasses for custom schedulers."""
+ if const_expr(varlen_args.mCuSeqlensM is None):
+ num_problems = (
+ mD.shape[2]
+ if mD is not None
+ else (
+ mB.shape[2]
+ if varlen_args.mCuSeqlensK is None
+ else varlen_args.mCuSeqlensK.shape[0] - 1
+ )
+ )
+ problem_shape_ntile_mnl = (
+ cute.ceil_div(mA.shape[0], self.cta_tile_shape_mnk[0]),
+ cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
+ num_problems,
+ )
+ tile_sched_args = TileSchedulerArguments(
+ problem_shape_ntile_mnl=problem_shape_ntile_mnl,
+ raster_order=scheduler_args.raster_order,
+ group_size=scheduler_args.max_swizzle_size,
+ cluster_shape_mnk=self.cluster_shape_mnk,
+ tile_count_semaphore=scheduler_args.tile_count_semaphore,
+ batch_idx_permute=scheduler_args.batch_idx_permute,
+ is_persistent=self.is_persistent,
  )
- # Epi pipeline's producer commit is a NOP
- epi_pipeline.producer_commit(epi_producer_state)
- epi_producer_state.advance()
- return epi_producer_state
+ else:
+ assert mD is not None or not self.gather_A
+ problem_shape_ntile_mnl = (
+ None,
+ cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
+ varlen_args.mCuSeqlensM.shape[0] - 1,
+ )
+ tile_sched_args = VarlenMTileSchedulerArguments(
+ problem_shape_ntile_mnl=problem_shape_ntile_mnl,
+ total_m=mD.shape[0] if mD is not None else varlen_args.mAIdx.shape[0],
+ cu_seqlens_m=varlen_args.mCuSeqlensM,
+ raster_order=scheduler_args.raster_order,
+ group_size=scheduler_args.max_swizzle_size,
+ tile_shape_mn=self.cta_tile_shape_mnk[:2],
+ cluster_shape_mnk=self.cluster_shape_mnk,
+ tile_count_semaphore=scheduler_args.tile_count_semaphore,
+ is_persistent=self.is_persistent,
+ )
+ return tile_sched_args
+
+ @cute.jit
+ def epi_load_acc_subtile(self, tRS_rAcc: cute.Tensor, tRS_rD: cute.Tensor, epi_idx: int):
+ for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
+ tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
+
+ @cute.jit
+ def epi_begin(
+ self,
+ params: EpilogueParams,
+ epi_smem_tensors: Tuple[cute.Tensor, ...],
+ epi_tile: cute.Tile,
+ tiled_copy_t2r: Optional[cute.TiledCopy],
+ tiled_copy_r2s: cute.TiledCopy,
+ tile_coord_mnkl: cute.Coord,
+ varlen_manager: VarlenManager,
+ epilogue_barrier: cutlass.pipeline.NamedBarrier,
+ tidx: Int32,
+ ) -> Tuple[cute.Tensor, ...]:
+ return ()

- def epi_visit_acc_subtile(
+ def epi_begin_loop(
+ self, params: EpilogueParams, epi_tensors: Tuple[cute.Tensor, ...], epi_coord: cute.Coord
+ ) -> Tuple[cute.Tensor, ...]:
+ return ()
+
+ def epi_visit_subtile(
  self,
  params: EpilogueParams,
+ epi_loop_tensors: Tuple[cute.Tensor, ...],
  tRS_rD: cute.Tensor,
  tRS_rC: Optional[cute.Tensor] = None,
  ) -> Optional[cute.Tensor]:
- # Apply alpha scaling to accumulator if alpha is provided (not None)
- if const_expr(hasattr(params, "alpha") and params.alpha is not None):
- alpha = utils.load_scalar_or_pointer(params.alpha)
- tRS_rD.store(tRS_rD.load() * alpha)
- # Apply C with beta scaling
- if const_expr(tRS_rC is not None):
- if const_expr(not hasattr(params, "beta") or params.beta is None):
- # beta is None, default behavior: add C (beta=1.0)
- tRS_rD.store(tRS_rD.load() + tRS_rC.load().to(tRS_rD.element_type))
- else:
- beta = utils.load_scalar_or_pointer(params.beta)
- tRS_rD.store(tRS_rD.load() + beta * tRS_rC.load().to(tRS_rD.element_type))
  return None

- def get_scheduler_class(self):
- """Return the scheduler class to use. Override in subclasses for custom schedulers."""
- return TileScheduler
-
- def get_scheduler_arguments(self, problem_shape_ntile_mnl, scheduler_args):
- """Create scheduler arguments. Override in subclasses for custom schedulers."""
- return TileSchedulerArguments(
- problem_shape_ntile_mnl=problem_shape_ntile_mnl,
- raster_order=scheduler_args.raster_order,
- group_size=scheduler_args.max_swizzle_size,
- cluster_shape_mnk=self.cluster_shape_mnk,
- tile_count_semaphore=scheduler_args.tile_count_semaphore,
- batch_idx_permute=scheduler_args.batch_idx_permute,
- is_persistent=self.is_persistent,
- )
-
  def epi_visit_acc(
  self,
  params: EpilogueParams,
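get_scheduler_arguments above sizes the tile scheduler's grid by rounding the problem extents up to whole CTA tiles. A worked example in plain Python with assumed sizes (the tile shape and problem extents are illustrative, not defaults):

    # ceil-division of the problem extents by the CTA tile, as cute.ceil_div does
    ceil_div = lambda a, b: (a + b - 1) // b
    M, N, L = 4096, 4096, 8          # assumed problem extents (rows of A, rows of B, batch)
    tile_m, tile_n = 128, 192        # assumed cta_tile_shape_mnk[:2]
    problem_shape_ntile_mnl = (ceil_div(M, tile_m), ceil_div(N, tile_n), L)
    print(problem_shape_ntile_mnl)   # (32, 22, 8): 32 * 22 M/N tiles per batch entry

In the varlen-M branch the M tile count is left as None and the scheduler derives it per batch from cu_seqlens_m instead.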
@@ -1570,26 +1479,58 @@ class GemmSm90:
  ) -> None:
  pass

+ @cute.jit
+ def epi_end(
+ self,
+ params: EpilogueParams,
+ epi_tensors: Tuple[cute.Tensor, ...],
+ epi_tile: cute.Tile,
+ tiled_copy_t2r: Optional[cute.TiledCopy],
+ tiled_copy_r2s: cute.TiledCopy,
+ tile_coord_mnkl: cute.Coord,
+ varlen_manager,
+ tidx,
+ ) -> None:
+ pass
+
  def epi_to_underlying_arguments(
  self, args: EpilogueArguments, *, loc=None, ip=None
  ) -> EpilogueParams:
- return GemmSm90.EpilogueParams(alpha=args.alpha, beta=args.beta)
+ return self.EpilogueParams()
+
+ def epi_get_tma_atoms(
+ self, params: EpilogueParams, *, loc=None, ip=None
+ ) -> list[cute.CopyAtom]:
+ """Subclasses can override this"""
+ return []
+
+ def epi_get_tensormap_update_shapes_orders(
+ self,
+ params: EpilogueParams,
+ cu_seqlens_m: cute.Tensor,
+ batch_idx: Int32,
+ *,
+ loc=None,
+ ip=None,
+ ) -> tuple[list[Int32], list[int]]:
+ """Subclasses can override this"""
+ return [], []

  @staticmethod
  def epi_smem_bytes_per_stage(
  args: Optional[EpilogueArguments],
- tile_shape_mnk: Tuple[int, int, int],
- epi_tile: Tuple[int, int],
+ cta_tile_shape_mnk: Tuple[int, int, int],
+ epi_tile: cute.Tile,
  ) -> int:
  return 0

  def epi_get_smem_struct(self, params: EpilogueParams):
- return cute.struct.MemRange[cutlass.Int32, 0] # Dummy struct
+ return cute.struct.MemRange[Int32, 0] # Dummy struct

  def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
  return tuple()

- def pingpong_barrier_sync(self, warp_group_idx: Int32, stage: str):
+ def pingpong_barrier_sync(self, warp_group_idx: Int32, stage: Literal["mma", "epi"]):
  assert stage in ["mma", "epi"]
  barrier = NamedBarrierGemm.MmaWG0 if stage == "mma" else NamedBarrierGemm.EpiWG0
  cute.arch.barrier(
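epi_begin, epi_begin_loop, epi_visit_subtile and epi_end form the hook surface that epilogue subclasses (for example the gemm_act / gemm_dact variants added elsewhere in this release) can override; the base class keeps them as no-ops. A minimal sketch of a custom epilogue that only overrides the per-subtile hook, reusing nothing beyond the fragment load/store pattern already used in this file; the class name and the 0.5 scale factor are made up for illustration:

    class GemmHalved(GemmSm90):
        def epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC=None):
            # Scale the accumulator subtile in registers before the caller
            # converts it and copies it to shared memory / global memory.
            tRS_rD.store(tRS_rD.load() * Float32(0.5))
            return None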
@@ -1597,7 +1538,7 @@ class GemmSm90:
  number_of_threads=2 * self.num_threads_per_warp_group,
  )

- def pingpong_barrier_arrive(self, warp_group_idx: Int32, stage: str):
+ def pingpong_barrier_arrive(self, warp_group_idx: Int32, stage: Literal["mma", "epi"]):
  assert stage in ["mma", "epi"]
  barrier = NamedBarrierGemm.MmaWG0 if stage == "mma" else NamedBarrierGemm.EpiWG0
  cute.arch.barrier_arrive(
@@ -1611,7 +1552,7 @@ class GemmSm90:
  self.d_layout.is_m_major_c() if self.d_layout is not None else False,
  num_matrices=4 if self.epi_tile[1] % 16 == 0 else 2,
  ),
- cutlass.Float16, # this is just to get the right source layout
+ Float16, # this is just to get the right source layout
  )
  tiled_copy_C_atom = cute.make_tiled_copy_C_atom(copy_atom_C, tiled_mma)
  return tiled_copy_C_atom
@@ -1621,8 +1562,7 @@ class GemmSm90:
  tiled_mma: cute.TiledMma,
  d_layout: Optional[LayoutEnum],
  dtype: Type[cutlass.Numeric],
- acc: cute.Tensor,
- sD: cute.Tensor,
+ sD: Optional[cute.Tensor],
  tidx: Int32,
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
  if d_layout is None:
@@ -1637,12 +1577,10 @@ class GemmSm90:
  # (R2S, R2S_M, R2S_N, PIPE_D)
  thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
  tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
- # (R2S, R2S_M, R2S_N)
- tRS_rAcc = tiled_copy_r2s.retile(acc)
  sD_shape = sD.shape[:2] if sD is not None else self.epi_tile
  tRS_rD_shape = thr_copy_r2s.partition_S(cute.make_identity_tensor(sD_shape)).shape
  tRS_rD = cute.make_fragment(tRS_rD_shape, self.acc_dtype)
- return tiled_copy_r2s, tRS_rAcc, tRS_rD, tRS_sD
+ return tiled_copy_r2s, tRS_rD, tRS_sD

  def epilog_smem_load_and_partition(
  self,
@@ -1654,7 +1592,7 @@ class GemmSm90:
  tidx: Int32,
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
  tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
- copy_atom_s2r = utils.sm90_get_smem_load_op(c_layout, dtype)
+ copy_atom_s2r = copy_utils.sm90_get_smem_load_op(c_layout, dtype)
  tiled_copy_s2r = cute.make_tiled_copy_S(copy_atom_s2r, tiled_copy_C_atom)
  thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
  tSR_sC = thr_copy_s2r.partition_S(sC)
@@ -1665,57 +1603,53 @@ class GemmSm90:
  def epilog_gmem_copy_and_partition(
  self,
  atom: Union[cute.CopyAtom, cute.TiledCopy],
- mD_mnl: cute.Tensor,
+ mD_mn: cute.Tensor,
  tile_shape_mn: cute.Tile,
  epi_tile: cute.Tile,
  sD: cute.Tensor,
  tile_coord_mnkl: cute.Coord,
- cu_seqlens_m: Optional[cute.Tensor] = None,
+ tma_desc_ptr: Optional[cute.Pointer] = None,
  ) -> Tuple[cute.Tensor, cute.Tensor]:
- batch_idx = tile_coord_mnkl[3]
- if const_expr(cu_seqlens_m is not None):
- mD_mn = cute.domain_offset((cu_seqlens_m[batch_idx], 0), mD_mnl)
- else:
- mD_mn = mD_mnl[None, None, batch_idx]
  # (bM, bN)
  gD = cute.local_tile(mD_mn, tile_shape_mn, tile_coord_mnkl[:2])
  tDgD_for_tma_partition = cute.zipped_divide(gD, epi_tile)
- bSG_sD, bSG_gD = cpasync.tma_partition(
+ is_s2g = isinstance(
+ atom.op, (cpasync.CopyBulkTensorTileS2GOp, cpasync.CopyReduceBulkTensorTileS2GOp)
+ )
+ src_tensor, dst_tensor = (
+ (sD, tDgD_for_tma_partition) if is_s2g else (tDgD_for_tma_partition, sD)
+ )
+ return copy_utils.tma_get_copy_fn(
  atom,
- 0,
- cute.make_layout(1),
- cute.group_modes(sD, 0, 2),
- tDgD_for_tma_partition,
+ cta_coord=0,
+ cta_layout=cute.make_layout(1),
+ src_tensor=src_tensor,
+ dst_tensor=dst_tensor,
+ tma_desc_ptr=tma_desc_ptr,
  )
- return bSG_sD, bSG_gD

  def make_ab_pipeline(
  self,
- a_smem_layout: cute.Layout | cute.ComposedLayout,
- b_smem_layout: cute.Layout | cute.ComposedLayout,
  tiled_mma: cute.TiledMma,
  cluster_layout_vmnk: cute.Layout,
  ab_pipeline_mbar_ptr: cute.Pointer,
  ):
  # Threads/warps participating in this pipeline
- producer_cnt = 1 if const_expr(not self.gather_A) else 1 + self.num_ab_load_threads
+ producer_cnt = 1 if const_expr(not self.gather_A) else 1 + self.num_ab_load_warps * 32
  ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, producer_cnt)
  # Each warp will contribute to the arrive count with the number of mcast size
  mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
- consumer_arrive_cnt = mcast_size * (tiled_mma.size // cute.arch.WARP_SIZE)
+ consumer_arrive_cnt = mcast_size * tiled_mma.size // cute.arch.WARP_SIZE
  ab_pipeline_consumer_group = pipeline.CooperativeGroup(
  pipeline.Agent.Thread, consumer_arrive_cnt
  )
  pipeline_cls = pipeline.PipelineTmaAsync if not self.gather_A else PipelineTmaCpAsync
- tma_copy_bytes = cute.size_in_bytes(self.b_dtype, b_smem_layout)
- if const_expr(not self.gather_A):
- tma_copy_bytes += cute.size_in_bytes(self.a_dtype, a_smem_layout)
  return pipeline_cls.create(
  barrier_storage=ab_pipeline_mbar_ptr,
  num_stages=self.ab_stage,
  producer_group=ab_pipeline_producer_group,
  consumer_group=ab_pipeline_consumer_group,
- tx_count=tma_copy_bytes,
+ tx_count=self.num_tma_load_bytes,
  cta_layout_vmnk=cluster_layout_vmnk,
  )
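The A/B pipeline's tx_count is the number of bytes each TMA transaction is expected to deposit per stage; the mbarrier uses it to decide when a stage is complete. With gather_A the A tile arrives via cp.async instead, so only the B tile counts toward the TMA transaction. A rough worked example with assumed tile and dtype sizes:

    # Assumed: bf16 A and B with a 128 x 256 x 64 CTA tile (illustrative numbers).
    tile_m, tile_n, tile_k, bytes_per_elem = 128, 256, 64, 2
    a_bytes = tile_m * tile_k * bytes_per_elem      # 16384 bytes per A stage
    b_bytes = tile_n * tile_k * bytes_per_elem      # 32768 bytes per B stage
    num_tma_load_bytes = a_bytes + b_bytes          # 49152: expected tx_count per stage
    num_tma_load_bytes_gather_a = b_bytes           # only B is TMA-loaded when gathering A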
 
@@ -1725,7 +1659,7 @@ class GemmSm90:
  # Threads/warps participating in this pipeline
  epi_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
  # Each warp will contribute 1 to the arrive count
- consumer_arrive_cnt = self.num_epi_threads // cute.arch.WARP_SIZE
+ consumer_arrive_cnt = self.num_epi_warps
  epi_pipeline_consumer_group = pipeline.CooperativeGroup(
  pipeline.Agent.Thread, consumer_arrive_cnt
  )
@@ -1738,6 +1672,14 @@ class GemmSm90:
  tx_count=tma_copy_c_bytes,
  )

+ def make_epi_store_pipeline(self):
+ # Threads/warps participating in tma store pipeline
+ num_epi_threads = self.num_epi_warps * cute.arch.WARP_SIZE
+ epi_store_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, num_epi_threads)
+ return pipeline.PipelineTmaStore.create(
+ num_stages=self.epi_stage, producer_group=epi_store_producer_group
+ )
+
  def make_sched_pipeline(
  self, cluster_layout_mnk: cute.Layout, sched_pipeline_mbar_ptr: cute.Pointer, varlen_k: bool
  ):
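PipelineTmaStore (make_epi_store_pipeline above) tracks in-flight bulk stores so a shared-memory epilogue stage is only reused after its TMA store has drained; every epilogue thread is counted as a producer. The intended call pattern, mirroring tma_store_fn earlier in this file (a sketch of usage, not new API):

    # per epilogue subtile, on the warp that issues TMA:
    #   copy_D(src_idx=..., dst_idx=...)          # bulk S2G store from the smem stage
    #   epi_store_pipeline.producer_commit()      # group the store just issued
    #   epi_store_pipeline.producer_acquire()     # block until a stage can be reused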
@@ -1766,21 +1708,21 @@ class GemmSm90:
  @classmethod
  def _compute_stages(
  cls,
- tile_shape_mnk: Tuple[int, int, int],
+ cta_tile_shape_mnk: Tuple[int, int, int],
  epi_tile: Tuple[int, int],
  a_dtype: Type[cutlass.Numeric],
  b_dtype: Type[cutlass.Numeric],
  d_dtype: Optional[Type[cutlass.Numeric]],
  c_dtype: Optional[Type[cutlass.Numeric]],
- epilogue_args: Optional[EpilogueArguments],
+ epilogue_args: EpilogueArguments,
  smem_capacity: int,
  occupancy: int,
- overlap_sD_sA: bool,
+ overlap_sD_sA: bool = False,
  ) -> Tuple[int, int]:
  """Computes the number of stages for A/B/C operands based on heuristics.

- :param tile_shape_mnk: The shape (M, N, K) of the CTA tile.
- :type tile_shape_mnk: Tuple[int, int, int]
+ :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
  :param a_dtype: Data type of operand A.
  :type a_dtype: type[cutlass.Numeric]
  :param b_dtype: Data type of operand B.
@@ -1803,15 +1745,15 @@ class GemmSm90:
  cute.size(epi_tile) * d_dtype.width // 8 if d_dtype is not None else 0
  )
  epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
- epilogue_args, tile_shape_mnk, epi_tile
+ epilogue_args, cta_tile_shape_mnk, epi_tile
  )
  epi_bytes = epi_bytes_per_stage * epi_stage
  epi_c_stage = 0 if c_dtype is None else (4 if epi_tile[1] <= 16 else 2)
  if c_dtype is not None:
  epi_bytes += cute.size(epi_tile) * c_dtype.width // 8 * epi_c_stage

- a_shape = cute.slice_(tile_shape_mnk, (None, 0, None))
- b_shape = cute.slice_(tile_shape_mnk, (0, None, None))
+ a_shape = cute.slice_(cta_tile_shape_mnk, (None, 0, None))
+ b_shape = cute.slice_(cta_tile_shape_mnk, (0, None, None))
  ab_bytes_per_stage = (
  cute.size(a_shape) * a_dtype.width // 8 + cute.size(b_shape) * b_dtype.width // 8
  )
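The stage count ultimately comes from how many A/B stages fit in the shared-memory budget after the epilogue stages are reserved. A worked example with assumed values (Hopper-class budget of roughly 228 KiB, occupancy 1, bf16 operands, no C, epi_stage = 2; the exact budget and rounding the kernel applies may differ):

    smem_capacity = 228 * 1024          # assumed per-SM budget in bytes
    tile_m, tile_n, tile_k = 128, 256, 64
    epi_tile_m, epi_tile_n, epi_stage = 64, 32, 2
    bf16 = 2                            # bytes per element
    ab_bytes_per_stage = (tile_m * tile_k + tile_n * tile_k) * bf16   # 49152
    epi_bytes = epi_tile_m * epi_tile_n * bf16 * epi_stage            # 8192 (D only)
    ab_stage = (smem_capacity - epi_bytes) // ab_bytes_per_stage      # 4 stages
    print(ab_stage)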
@@ -1829,15 +1771,15 @@ class GemmSm90:

  @staticmethod
  def _sm90_compute_tile_shape_or_override(
- tile_shape_mnk: Tuple[int, int, int],
+ cta_tile_shape_mnk: Tuple[int, int, int],
  atom_layout_mnk: Tuple[int, int, int],
  element_type: Optional[Type[cutlass.Numeric]] = None,
  epi_tile_override: Tuple[int, int] | None = None,
  ) -> Tuple[int, int]:
  """Compute the epilogue tile shape or use override if provided.

- :param tile_shape_mnk: CTA tile shape (M,N,K)
- :type tile_shape_mnk: Tuple[int, int, int]
+ :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
  :param element_type: Data type of elements
  :type element_type: type[cutlass.Numeric]
  :param is_cooperative: Whether to use cooperative approach
@@ -1850,12 +1792,12 @@ class GemmSm90:
  """
  if epi_tile_override is not None:
  return epi_tile_override
- if tile_shape_mnk[0] % 128 == 0 and atom_layout_mnk[0] > 1:
- tile_m = math.gcd(128, cute.size(tile_shape_mnk, mode=[0]))
- tile_n = math.gcd(32, cute.size(tile_shape_mnk, mode=[1]))
- elif tile_shape_mnk[0] % 192 == 0 and atom_layout_mnk[0] > 1:
- tile_m = math.gcd(192, cute.size(tile_shape_mnk, mode=[0]))
- tile_n = math.gcd(32, cute.size(tile_shape_mnk, mode=[1]))
+ if cta_tile_shape_mnk[0] % 128 == 0 and atom_layout_mnk[0] > 1:
+ tile_m = math.gcd(128, cute.size(cta_tile_shape_mnk, mode=[0]))
+ tile_n = math.gcd(32, cute.size(cta_tile_shape_mnk, mode=[1]))
+ elif cta_tile_shape_mnk[0] % 192 == 0 and atom_layout_mnk[0] > 1:
+ tile_m = math.gcd(192, cute.size(cta_tile_shape_mnk, mode=[0]))
+ tile_n = math.gcd(32, cute.size(cta_tile_shape_mnk, mode=[1]))
  else:
  # In the case of tile shape 128 x N but atom_layout 1 x 2, we need to set
  # epi_tile_m = 64. If epi_tile_m = 128, the epilogue would iterate along the
@@ -1864,13 +1806,13 @@ class GemmSm90:
  # We could change the epilogue to accommodate this,
  # but it's easier to just set epi_tile_m = 64.
  n_perf = 64 if element_type is not None and element_type.width == 8 else 32
- tile_m = math.gcd(64, cute.size(tile_shape_mnk, mode=[0]))
- tile_n = math.gcd(n_perf, cute.size(tile_shape_mnk, mode=[1]))
+ tile_m = math.gcd(64, cute.size(cta_tile_shape_mnk, mode=[0]))
+ tile_n = math.gcd(n_perf, cute.size(cta_tile_shape_mnk, mode=[1]))
  return (tile_m, tile_n)

  @staticmethod
  def _make_smem_layouts(
- tile_shape_mnk: Tuple[int, int, int],
+ cta_tile_shape_mnk: Tuple[int, int, int],
  epi_tile: Tuple[int, int],
  a_dtype: Type[cutlass.Numeric],
  a_layout: LayoutEnum,
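The gcd-based selection in _sm90_compute_tile_shape_or_override picks an epilogue tile that evenly divides the CTA tile while staying small enough to stage in shared memory. Two worked cases with assumed shapes:

    import math
    # Assumed CTA tile 256 x 192 with a multi-warp-group atom layout in M:
    print(math.gcd(128, 256), math.gcd(32, 192))   # epi_tile = (128, 32)
    # Assumed CTA tile 128 x 224 with atom layout 1 x 2 (the else branch, n_perf = 32):
    print(math.gcd(64, 128), math.gcd(32, 224))    # epi_tile = (64, 32)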
@@ -1888,8 +1830,8 @@ class GemmSm90:
  ]:
  """Create shared memory layouts for A, B, and C tensors.

- :param tile_shape_mnk: CTA tile shape (M,N,K)
- :type tile_shape_mnk: Tuple[int, int, int]
+ :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
  :param epi_tile: Epilogue tile shape
  :type epi_tile: Tuple[int, int]
  :param a_dtype: Data type for matrix A
@@ -1912,11 +1854,11 @@ class GemmSm90:
  :return: Tuple of shared memory layouts for A, B, and C
  :rtype: Tuple[cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout]
  """
- a_smem_shape = cute.slice_(tile_shape_mnk, (None, 0, None))
+ a_smem_shape = cute.slice_(cta_tile_shape_mnk, (None, 0, None))

  a_is_k_major = a_layout.sm90_mma_major_mode() == warpgroup.OperandMajorMode.K
  b_is_k_major = b_layout.sm90_mma_major_mode() == warpgroup.OperandMajorMode.K
- a_major_mode_size = tile_shape_mnk[2 if a_is_k_major else 0]
+ a_major_mode_size = cta_tile_shape_mnk[2 if a_is_k_major else 0]
  a_smem_layout_atom = warpgroup.make_smem_layout_atom(
  sm90_utils.get_smem_layout_atom(a_layout, a_dtype, a_major_mode_size),
  a_dtype,
@@ -1927,9 +1869,9 @@ class GemmSm90:
  order=(0, 1, 2) if a_is_k_major else (1, 0, 2),
  )

- b_smem_shape = cute.slice_(tile_shape_mnk, (0, None, None))
+ b_smem_shape = cute.slice_(cta_tile_shape_mnk, (0, None, None))

- b_major_mode_size = tile_shape_mnk[2 if b_is_k_major else 1]
+ b_major_mode_size = cta_tile_shape_mnk[2 if b_is_k_major else 1]
  b_smem_layout_atom = warpgroup.make_smem_layout_atom(
  sm90_utils.get_smem_layout_atom(b_layout, b_dtype, b_major_mode_size),
  b_dtype,
@@ -1940,36 +1882,18 @@ class GemmSm90:
  order=(0, 1, 2) if b_is_k_major else (1, 0, 2),
  )

+ epi_smem_layout_staged = None
  if d_dtype is not None:
- d_smem_shape = epi_tile
- d_major_mode_size = epi_tile[1] if d_layout.is_n_major_c() else epi_tile[0]
- d_smem_layout_atom = warpgroup.make_smem_layout_atom(
- sm90_utils.get_smem_layout_atom(d_layout, d_dtype, d_major_mode_size),
- d_dtype,
- )
- epi_smem_layout_staged = cute.tile_to_shape(
- d_smem_layout_atom,
- cute.append(d_smem_shape, epi_stage),
- order=(1, 0, 2) if d_layout.is_m_major_c() else (0, 1, 2),
+ epi_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
+ d_dtype, d_layout, epi_tile, epi_stage
  )
- else:
- epi_smem_layout_staged = None

+ epi_c_smem_layout_staged = None
  if c_dtype is not None:
  assert c_layout is not None
- c_smem_shape = epi_tile
- c_major_mode_size = epi_tile[1] if c_layout.is_n_major_c() else epi_tile[0]
- c_smem_layout_atom = warpgroup.make_smem_layout_atom(
- sm90_utils.get_smem_layout_atom(c_layout, c_dtype, c_major_mode_size),
- c_dtype,
- )
- epi_c_smem_layout_staged = cute.tile_to_shape(
- c_smem_layout_atom,
- cute.append(c_smem_shape, epi_c_stage),
- order=(1, 0, 2) if c_layout.is_m_major_c() else (0, 1, 2),
+ epi_c_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
+ c_dtype, c_layout, epi_tile, epi_c_stage
  )
- else:
- epi_c_smem_layout_staged = None

  return (
  a_smem_layout_staged,
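make_smem_layout_epi (from quack.sm90_utils, new in this release) factors out the staged epilogue smem layout that used to be built inline. Judging only from the code it replaces here, it presumably does something like the following; this is a reconstruction for orientation, not the helper's actual source:

    def make_smem_layout_epi(dtype, layout, epi_tile, stage):
        # Pick the swizzled atom for the epi tile's major mode, then tile it
        # across the epi tile and append the stage count as the last mode.
        major_mode_size = epi_tile[1] if layout.is_n_major_c() else epi_tile[0]
        atom = warpgroup.make_smem_layout_atom(
            sm90_utils.get_smem_layout_atom(layout, dtype, major_mode_size), dtype
        )
        return cute.tile_to_shape(
            atom,
            cute.append(epi_tile, stage),
            order=(1, 0, 2) if layout.is_m_major_c() else (0, 1, 2),
        )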
@@ -1983,7 +1907,7 @@ class GemmSm90:
  tensor_d: cute.Tensor,
  epi_smem_layout_staged: cute.ComposedLayout,
  epi_tile: Tuple[int, int],
- store_or_load: str,
+ op_type: Literal["store", "load", "add"],
  ) -> Tuple[cute.CopyAtom, cute.Tensor]:
  """Create TMA atoms and tensors for storing D or loading C.

@@ -1997,13 +1921,15 @@ class GemmSm90:
  :return: TMA atom and tensor for C
  :rtype: Tuple[cute.CopyAtom, cute.Tensor]
  """
- assert store_or_load in ["load", "store"]
+ assert op_type in ["load", "store", "add"]
  epi_smem_layout = cute.slice_(epi_smem_layout_staged, (None, None, 0))
  d_cta_v_layout = cute.composition(cute.make_identity_layout(tensor_d.shape), epi_tile)
  op = (
  cpasync.CopyBulkTensorTileG2SOp()
- if store_or_load == "load"
+ if op_type == "load"
  else cpasync.CopyBulkTensorTileS2GOp()
+ if op_type == "store"
+ else cpasync.CopyReduceBulkTensorTileS2GOp(cute.ReductionOp.ADD)
  )
  tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
  op, tensor_d, epi_smem_layout, d_cta_v_layout
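The nested conditional above maps op_type to one of three bulk-tensor ops; "add" builds a TMA store that reduces into the destination tile instead of overwriting it, which lets an epilogue accumulate into an existing tensor in global memory. The same mapping written as a plain if-chain, purely for readability (names as in this file):

    def _epi_tma_op(op_type):
        if op_type == "load":
            return cpasync.CopyBulkTensorTileG2SOp()
        if op_type == "store":
            return cpasync.CopyBulkTensorTileS2GOp()
        # "add": bulk store that accumulates into the destination tile
        return cpasync.CopyReduceBulkTensorTileS2GOp(cute.ReductionOp.ADD)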
@@ -2013,7 +1939,7 @@ class GemmSm90:
  @staticmethod
  def _make_tma_atoms_and_tensors(
  tensor: cute.Tensor,
- smem_layout_staged: cute.ComposedLayout,
+ smem_layout: cute.ComposedLayout,
  smem_tile: Tuple[int, int],
  mcast_dim: int,
  ) -> Tuple[cute.CopyAtom, cute.Tensor]:
@@ -2021,8 +1947,8 @@ class GemmSm90:
2021
1947
 
2022
1948
  :param tensor: Input tensor (A or B)
2023
1949
  :type tensor: cute.Tensor
2024
- :param smem_layout_staged: Shared memory layout for the tensor
2025
- :type smem_layout_staged: cute.ComposedLayout
1950
+ :param smem_layout: Shared memory layout for the tensor
1951
+ :type smem_layout: cute.ComposedLayout
2026
1952
  :param smem_tile: Shared memory tile shape
2027
1953
  :type smem_tile: Tuple[int, int]
2028
1954
  :param mcast_dim: Multicast dimension
@@ -2036,8 +1962,6 @@ class GemmSm90:
  if mcast_dim == 1
  else cpasync.CopyBulkTensorTileG2SMulticastOp()
  )
-
- smem_layout = cute.slice_(smem_layout_staged, (None, None, 0))
  tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(
  op,
  tensor,
@@ -2054,13 +1978,18 @@ class GemmSm90:
  num_bits_per_copy=copy_bits,
  )
  copy_elems = copy_bits // dtype.width
- shape_dim_1 = cute.size(self.tile_shape_mnk[2]) // copy_elems
+ loads_per_cache_line = 128 * 8 // copy_bits # 128 bytes per cache line
+ shape_dim_1 = cute.size(self.cta_tile_shape_mnk[2]) // copy_elems
+ if shape_dim_1 > loads_per_cache_line:
+ shape_dim_1 = math.gcd(shape_dim_1, loads_per_cache_line)
  # thread layout for copy
  thread_layout = cute.make_layout(
  (num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1)
  )
  if major_mode != LayoutEnum.ROW_MAJOR:
- shape_dim_0 = cute.size(self.tile_shape_mnk[0]) // copy_elems
+ shape_dim_0 = cute.size(self.cta_tile_shape_mnk[0]) // copy_elems
+ if shape_dim_0 > loads_per_cache_line:
+ shape_dim_0 = math.gcd(shape_dim_0, loads_per_cache_line)
  thread_layout = cute.make_layout(
  (shape_dim_0, num_threads // shape_dim_0), stride=(1, shape_dim_0)
  )
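The gcd clamp above keeps the contiguous dimension of the cp.async thread layout within one 128-byte cache line, so with wide K (or M) tiles the extra threads stack along the other dimension instead of striding past the line. A worked example with assumed parameters:

    import math
    copy_bits, dtype_width, num_threads, tile_k = 128, 16, 128, 128   # assumed values
    copy_elems = copy_bits // dtype_width              # 8 bf16 elements per 16-byte load
    loads_per_cache_line = 128 * 8 // copy_bits        # 8 loads cover a 128-byte line
    shape_dim_1 = tile_k // copy_elems                 # 16 before clamping
    if shape_dim_1 > loads_per_cache_line:
        shape_dim_1 = math.gcd(shape_dim_1, loads_per_cache_line)     # clamp to 8
    print((num_threads // shape_dim_1, shape_dim_1))   # thread layout shape (16, 8)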
@@ -2102,7 +2031,7 @@ class GemmSm90:
  """
  is_valid = True
  if a_dtype not in {
- cutlass.Float16,
+ Float16,
  cutlass.BFloat16,
  cutlass.Float8E4M3FN,
  cutlass.Float8E5M2,
@@ -2110,19 +2039,19 @@ class GemmSm90:
  is_valid = False
  # tested b_dtype
  if b_dtype not in {
- cutlass.Float16,
+ Float16,
  cutlass.BFloat16,
  cutlass.Float8E4M3FN,
  cutlass.Float8E5M2,
  }:
  is_valid = False
- if acc_dtype not in {cutlass.Float32, cutlass.Float16}:
+ if acc_dtype not in {Float32, Float16}:
  is_valid = False
  # tested d_dtype
  if d_dtype not in {
  None,
- cutlass.Float32,
- cutlass.Float16,
+ Float32,
+ Float16,
  cutlass.BFloat16,
  cutlass.Float8E4M3FN,
  cutlass.Float8E5M2,
@@ -2139,107 +2068,3 @@ class GemmSm90:
  if (a_dtype.width == 8 and a_major != "k") or (b_dtype.width == 8 and b_major != "k"):
  is_valid = False
  return is_valid
-
-
- def gemm_sm90(
- A: Tensor, # (l, m, k)
- B: Tensor, # (l, n, k)
- D: Tensor, # (l, m, n)
- C: Optional[Tensor], # (l, m, n)
- tile_count_semaphore: Optional[Tensor], # (1,)
- tile_M: int,
- tile_N: int,
- cluster_M: int,
- cluster_N: int,
- pingpong: bool = False,
- persistent: bool = True,
- alpha: float | Tensor = 1.0,
- beta: float | Tensor = 1.0,
- ) -> None:
- L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(A, B, D, C)
- GemmWrapperBase.permute_tensors(tensor_infos)
- GemmWrapperBase.extract_dtypes(tensor_infos)
- major_configs = {
- "A": ("m", "k", "l"),
- "B": ("n", "k", "l"),
- "D": ("m", "n", "l"),
- "C": ("m", "n", "l"),
- }
- GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-
- acc_dtype = cutlass.Float32
- tile_shape_mn = (tile_M, tile_N)
- cluster_shape_mnk = (cluster_M, cluster_N, 1)
- if not GemmSm90.is_valid_dtypes(
- tensor_infos["A"].dtype,
- tensor_infos["B"].dtype,
- acc_dtype,
- tensor_infos["D"].dtype,
- tensor_infos["A"].major,
- tensor_infos["B"].major,
- ):
- raise TypeError("Skipping due to unsupported combination of types and majors")
-
- max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
- GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-
- def scalar_arg(scalar: float | Tensor):
- if isinstance(scalar, float):
- return Float32(scalar) if scalar != 1.0 else None
- else:
- assert isinstance(scalar, Tensor)
- return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
-
- epi_args = GemmSm90.EpilogueArguments(scalar_arg(alpha), scalar_arg(beta))
- scheduler_args = GemmWrapperBase.create_scheduler_args(
- max_active_clusters, tile_count_semaphore
- )
- current_stream = cutlass_torch.current_stream()
- compile_key = GemmWrapperBase.get_compile_key(
- tensor_infos,
- None,
- tile_shape_mn,
- cluster_shape_mnk,
- pingpong,
- persistent,
- tile_count_semaphore is not None,
- 2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
- 2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
- key_tensor_names=("A", "B", "D", "C"),
- )
- cache = gemm_sm90.compile_cache
- if compile_key not in cache:
- gemm = GemmSm90(
- acc_dtype,
- tensor_infos["A"].dtype,
- tile_shape_mn,
- cluster_shape_mnk,
- pingpong=pingpong,
- is_persistent=persistent,
- )
- cache[compile_key] = cute.compile(
- gemm,
- tensor_infos["A"].cute_tensor,
- tensor_infos["B"].cute_tensor,
- tensor_infos["D"].cute_tensor,
- tensor_infos["C"].cute_tensor,
- epi_args,
- scheduler_args,
- None, # varlen_args
- None, # mAIdx
- current_stream,
- )
- cache[compile_key](
- tensor_infos["A"].cute_tensor,
- tensor_infos["B"].cute_tensor,
- tensor_infos["D"].cute_tensor,
- tensor_infos["C"].cute_tensor,
- epi_args,
- scheduler_args,
- None,
- None,
- current_stream,
- )
-
-
- gemm_sm90.compile_cache = {}