quack-kernels 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.4.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/top_level.txt +0 -0
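Several modules are renamed in this release (for example `quack/{dense_gemm_sm90.py → gemm_sm90.py}` and `quack/{gemm_act_sm90.py → gemm_act.py}`), so import paths differ between 0.2.2 and 0.2.4. A minimal, hypothetical compatibility sketch, assuming only what the diff shows (the `GemmSm90` class lives in the renamed `gemm_sm90` module in 0.2.4):

    # Hedged sketch: import GemmSm90 across the 0.2.2 -> 0.2.4 module rename.
    # Assumes the class name is unchanged; not an official compatibility shim.
    try:
        from quack.gemm_sm90 import GemmSm90        # quack-kernels >= 0.2.4
    except ImportError:
        from quack.dense_gemm_sm90 import GemmSm90  # quack-kernels == 0.2.2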
@@ -3,11 +3,9 @@
3
3
 
4
4
  import enum
5
5
  from typing import Tuple, Type, Callable, Optional, Union, Literal
6
- from dataclasses import dataclass
7
6
  from functools import partial
8
7
  import math
9
8
 
10
- from torch import Tensor
11
9
 
12
10
  import cuda.bindings.driver as cuda
13
11
 
@@ -16,10 +14,9 @@ import cutlass.cute as cute
16
14
  import cutlass.pipeline as pipeline
17
15
  from cutlass.cute.nvgpu import cpasync, warp, warpgroup
18
16
  import cutlass.utils.hopper_helpers as sm90_utils
19
- from cutlass import Int32, Float32, Boolean, const_expr
17
+ from cutlass import Int32, Float32, Float16, Boolean, const_expr
18
+ from cutlass.cutlass_dsl import if_generate
20
19
  from cutlass.utils import LayoutEnum
21
- import cutlass.torch as cutlass_torch
22
- from cutlass.cute.runtime import make_ptr
23
20
 
24
21
 
25
22
  from quack.cute_dsl_utils import ParamsBase, ArgumentsBase
@@ -30,14 +27,12 @@ from quack.tile_scheduler import (
30
27
  VarlenMTileSchedulerArguments,
31
28
  VarlenMTileScheduler,
32
29
  )
33
- from quack.varlen_utils import VarlenArguments
34
- from quack.tensormap_manager import TensorMapManagerSm90
30
+ from quack.varlen_utils import VarlenArguments, VarlenManager
35
31
 
36
32
  # return PipelineStateWAdvance instead of PipelineState
37
33
  from quack.pipeline import make_pipeline_state, PipelineTmaCpAsync
38
- import quack.utils as utils
39
- from quack.cute_dsl_utils import get_max_active_clusters
40
- from quack.gemm_wrapper_utils import GemmWrapperBase
34
+ import quack.copy_utils as copy_utils
35
+ import quack.sm90_utils as quack_sm90_utils
41
36
 
42
37
  """
43
38
  A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Hopper architecture
@@ -119,7 +114,7 @@ class GemmSm90:
119
114
 
120
115
  Example:
121
116
  >>> gemm = GemmSm90(
122
- ... acc_dtype=cutlass.Float32,
117
+ ... acc_dtype=Float32,
123
118
  ... tile_shape_mn=(128, 256),
124
119
  ... cluster_shape_mnk=(1, 1, 1)
125
120
  ... )
@@ -127,19 +122,10 @@ class GemmSm90:
127
122
  """
128
123
 
129
124
  arch = 90
130
- bytes_per_tensormap = 128
131
125
  num_epi_tensormaps: int = 0
132
126
 
133
- @dataclass
134
- class EpilogueArguments(ArgumentsBase):
135
- alpha: Optional[Float32 | cute.Tensor] = None
136
- beta: Optional[Float32 | cute.Tensor] = None
137
- add_to_output: bool = False
138
-
139
- @dataclass
140
- class EpilogueParams(ParamsBase):
141
- alpha: Optional[Float32 | cute.Tensor] = None
142
- beta: Optional[Float32 | cute.Tensor] = None
127
+ EpilogueArguments = ArgumentsBase
128
+ EpilogueParams = ParamsBase
143
129
 
144
130
  def __init__(
145
131
  self,
@@ -222,7 +208,9 @@ class GemmSm90:
222
208
  atom_layout_m, atom_layout_n = 1, 1
223
209
  self.atom_layout_mnk = (atom_layout_m, atom_layout_n, 1)
224
210
 
225
- self.num_mcast_ctas_a = self.cluster_shape_mnk[1] if not self.gather_A else 1
211
+ self.num_mcast_ctas_a = self.cluster_shape_mnk[1]
212
+ if self.gather_A:
213
+ assert self.num_mcast_ctas_a == 1
226
214
  self.num_mcast_ctas_b = self.cluster_shape_mnk[0]
227
215
  self.is_a_mcast = self.num_mcast_ctas_a > 1
228
216
  self.is_b_mcast = self.num_mcast_ctas_b > 1
@@ -237,10 +225,9 @@ class GemmSm90:
237
225
  self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_90")
238
226
  self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
239
227
  self.num_ab_load_warps = 1 if not self.gather_A else 4
240
- self.num_ab_load_threads = cute.arch.WARP_SIZE * self.num_ab_load_warps
241
- self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
242
228
  self.ab_load_warp_id = self.mma_warp_groups * 4
243
- self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
229
+ # self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
230
+ # self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
244
231
 
245
232
  regs_per_thread = math.prod(self.cta_tile_shape_mnk[:2]) // (
246
233
  math.prod(self.atom_layout_mnk) * self.num_threads_per_warp_group
@@ -335,7 +322,7 @@ class GemmSm90:
335
322
  self.d_dtype,
336
323
  self.c_dtype,
337
324
  epilogue_args,
338
- self.smem_capacity,
325
+ cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"), # smem_capacity
339
326
  self.occupancy,
340
327
  # epi_smem will reuse smem ab if not persistent.
341
328
  overlap_sD_sA=not self.is_persistent,
@@ -465,6 +452,7 @@ class GemmSm90:
465
452
  )
466
453
 
467
454
  epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
455
+ varlen_params = VarlenManager.to_underlying_arguments(varlen_args)
468
456
 
469
457
  TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
470
458
  tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
@@ -483,7 +471,7 @@ class GemmSm90:
483
471
  ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
484
472
  epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
485
473
  sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
486
- tile_count: cute.struct.MemRange[cutlass.Int32, self.sched_stage]
474
+ tile_count: cute.struct.MemRange[Int32, self.sched_stage]
487
475
  sD: cute.struct.Align[
488
476
  cute.struct.MemRange[
489
477
  self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
@@ -520,10 +508,7 @@ class GemmSm90:
520
508
  tma_atom_c,
521
509
  tma_tensor_c,
522
510
  epilogue_params,
523
- varlen_args.mCuSeqlensM,
524
- varlen_args.mCuSeqlensK,
525
- varlen_args.mTensormaps,
526
- varlen_args.mAIdx,
511
+ varlen_params,
527
512
  self.cluster_layout_mnk,
528
513
  self.a_smem_layout_staged,
529
514
  self.b_smem_layout_staged,
@@ -535,7 +520,6 @@ class GemmSm90:
535
520
  grid=grid,
536
521
  block=[self.threads_per_cta, 1, 1],
537
522
  cluster=self.cluster_shape_mnk,
538
- smem=self.shared_storage.size_in_bytes(),
539
523
  stream=stream,
540
524
  min_blocks_per_mp=1,
541
525
  )
@@ -555,10 +539,7 @@ class GemmSm90:
555
539
  tma_atom_c: Optional[cute.CopyAtom],
556
540
  mC_mnl: Optional[cute.Tensor],
557
541
  epilogue_params: ParamsBase,
558
- cu_seqlens_m: Optional[cute.Tensor],
559
- cu_seqlens_k: Optional[cute.Tensor],
560
- tensormaps: Optional[cute.Tensor],
561
- mAIdx: Optional[cute.Tensor],
542
+ varlen_params: VarlenManager.Params,
562
543
  cluster_layout_mnk: cute.Layout,
563
544
  a_smem_layout: cute.ComposedLayout,
564
545
  b_smem_layout: cute.ComposedLayout,
@@ -594,8 +575,8 @@ class GemmSm90:
594
575
  :type epi_smem_layout: cute.ComposedLayout
595
576
  """
596
577
 
597
- varlen_m = const_expr(cu_seqlens_m is not None)
598
- varlen_k = const_expr(cu_seqlens_k is not None)
578
+ varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
579
+ varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
599
580
  assert not (varlen_m and varlen_k)
600
581
  if const_expr(self.gather_A):
601
582
  assert varlen_m or varlen_k
@@ -657,9 +638,19 @@ class GemmSm90:
657
638
  sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
658
639
  epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)
659
640
 
660
- # Get tensormap buffer address
661
- tensormap_manager, tensormap_ab_ptrs, tensormap_d_ptr, tensormap_epi_ptrs = (
662
- self.tensormap_init(tensormaps, varlen_m, varlen_k, has_D, warp_idx)
641
+ varlen_manager = VarlenManager.create(
642
+ varlen_params,
643
+ has_D,
644
+ self.num_epi_tensormaps,
645
+ # Only used if not varlen_m
646
+ len_m_static=Int32(
647
+ mA_mkl.shape[0]
648
+ if varlen_k or varlen_params.mAIdx is None
649
+ else varlen_params.mAIdx.shape[0]
650
+ ),
651
+ len_k_static=Int32(mA_mkl.shape[1]),
652
+ pingpong=self.pingpong,
653
+ warp_idx=warp_idx,
663
654
  )
664
655
 
665
656
  TileSchedulerCls = partial(
@@ -673,29 +664,20 @@ class GemmSm90:
673
664
  and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
674
665
  ):
675
666
  is_tma_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
676
- if const_expr(varlen_k):
677
- # initialize tensormap for A & B
678
- if const_expr(not self.gather_A):
679
- tensormap_manager.init_tensormap_from_atom(
680
- tma_atom_a,
681
- tensormap_ab_ptrs[0],
682
- is_tma_warp,
683
- )
684
- tensormap_manager.init_tensormap_from_atom(
685
- tma_atom_b,
686
- tensormap_ab_ptrs[1],
687
- is_tma_warp,
688
- )
667
+ # initialize tensormap for A & B
668
+ varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
669
+ tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
670
+ tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
689
671
  # ///////////////////////////////////////////////////////////////////////////////
690
672
  # Get mcast mask
691
673
  # ///////////////////////////////////////////////////////////////////////////////
692
674
  cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
693
- cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
675
+ block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
694
676
  a_mcast_mask = cute.make_layout_image_mask(
695
- cluster_layout_mnk, cluster_coord_mnk, mode=1
677
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=1
696
678
  )
697
679
  b_mcast_mask = cute.make_layout_image_mask(
698
- cluster_layout_mnk, cluster_coord_mnk, mode=0
680
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=0
699
681
  )
700
682
  a_mcast_mask = a_mcast_mask if self.is_a_mcast else 0
701
683
  b_mcast_mask = b_mcast_mask if self.is_b_mcast else 0
@@ -711,42 +693,30 @@ class GemmSm90:
711
693
  )
712
694
  if const_expr(varlen_k):
713
695
  # wait tensormap initialization complete before update
714
- tensormap_manager.fence_tensormap_initialization()
715
- # batch index of last tile
716
- last_batch_idx = cutlass.Int32(-1)
696
+ varlen_manager.fence_tensormap_init()
717
697
  while work_tile.is_valid_tile:
718
698
  tile_coord_mnkl = work_tile.tile_idx
719
699
  batch_idx = tile_coord_mnkl[3]
720
- if const_expr(varlen_k):
721
- is_group_changed = batch_idx != last_batch_idx
722
- last_batch_idx = batch_idx
723
- if is_group_changed:
724
- self.tensormap_update_AB(
725
- tensormap_manager,
726
- tensormap_ab_ptrs,
727
- cu_seqlens_k,
728
- batch_idx,
729
- is_tma_warp,
730
- )
700
+ varlen_manager.update_tensormap_AB(
701
+ batch_idx,
702
+ self.a_layout,
703
+ self.b_layout,
704
+ is_tma_warp,
705
+ )
731
706
  # ///////////////////////////////////////////////////////////////////////////
732
707
  # Local_tile partition global tensors
733
708
  # ///////////////////////////////////////////////////////////////////////////
734
709
  if const_expr(not self.gather_A):
735
- if const_expr(varlen_m):
736
- mA_mk = cute.domain_offset((cu_seqlens_m[batch_idx], 0), mA_mkl)
737
- elif const_expr(varlen_k):
738
- mA_mk = cute.domain_offset((0, cu_seqlens_k[batch_idx]), mA_mkl)
739
- else:
740
- mA_mk = mA_mkl[None, None, batch_idx]
710
+ mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
741
711
  # (bM, bK, RestK)
742
- gA_k = cute.local_tile(
712
+ gA_mk = cute.local_tile(
743
713
  mA_mk,
744
714
  cute.select(self.cta_tile_shape_mnk, [0, 2]),
745
715
  (tile_coord_mnkl[0], None),
746
716
  )
747
717
  else:
718
+ mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
748
719
  if const_expr(varlen_m):
749
- mAIdx_mk = cute.domain_offset((cu_seqlens_m[batch_idx],), mAIdx)
750
720
  gAIdx = cute.local_tile(
751
721
  mAIdx_mk, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0],)
752
722
  )
@@ -754,133 +724,90 @@ class GemmSm90:
754
724
  mA_mk = mA_mkl
755
725
  else:
756
726
  assert varlen_k
757
- mAIdx_mk = cute.domain_offset((cu_seqlens_k[batch_idx],), mAIdx)
758
727
  # (tile_K, RestK)
759
728
  gAIdx = cute.flat_divide(mAIdx_mk, (self.cta_tile_shape_mnk[2],))
760
729
  # (tile_M, K)
761
730
  mA_mk = cute.local_tile(
762
731
  mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
763
732
  )
764
- if const_expr(varlen_k):
765
- mB_nk = cute.domain_offset((0, cu_seqlens_k[batch_idx]), mB_nkl)
766
- else:
767
- mB_nk = mB_nkl[None, None, batch_idx]
768
733
  # (bN, bK, RestK)
769
- gB_k = cute.local_tile(
770
- mB_nk,
734
+ gB_nk = cute.local_tile(
735
+ varlen_manager.offset_batch_B(mB_nkl, batch_idx),
771
736
  cute.select(self.cta_tile_shape_mnk, [1, 2]),
772
737
  (tile_coord_mnkl[1], None),
773
738
  )
774
739
  # //////////////////////////////////////////////////////////////////////////
775
740
  # Partition shared tensor for TMA load A/B
776
741
  # //////////////////////////////////////////////////////////////////////////
777
- tma_desc_a_ptr, tma_desc_b_ptr = None, None
778
- if const_expr(varlen_k):
779
- # ensure the update to tensormap has completed before using it
780
- tensormap_a_ptr, tensormap_b_ptr = tensormap_ab_ptrs
781
- if is_group_changed and is_tma_warp:
782
- if const_expr(not self.gather_A):
783
- tensormap_manager.fence_tensormap_update(tensormap_a_ptr)
784
- tensormap_manager.fence_tensormap_update(tensormap_b_ptr)
785
- if const_expr(not self.gather_A):
786
- tma_desc_a_ptr = tensormap_manager.get_tensormap_ptr(
787
- tensormap_a_ptr, cute.AddressSpace.generic
788
- )
789
- tma_desc_b_ptr = tensormap_manager.get_tensormap_ptr(
790
- tensormap_b_ptr, cute.AddressSpace.generic
791
- )
742
+ varlen_manager.fence_tensormap_update_AB(is_tma_warp)
743
+ len_m = varlen_manager.len_m(batch_idx)
744
+ len_k = varlen_manager.len_k(batch_idx)
792
745
  # TMA load A partition_S/D
793
- a_cta_layout = cute.make_layout(
794
- cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
795
- )
796
- a_cta_crd = cluster_coord_mnk[1]
746
+ copy_A = None
797
747
  if const_expr(not self.gather_A):
798
- # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
799
- tAsA, tAgA_k = cpasync.tma_partition(
800
- tma_atom_a,
801
- a_cta_crd,
802
- a_cta_layout,
803
- cute.group_modes(sA, 0, 2),
804
- cute.group_modes(gA_k, 0, 2),
805
- )
806
- copy_A = partial(
807
- cute.copy,
748
+ copy_A, _, _ = copy_utils.tma_get_copy_fn(
808
749
  tma_atom_a,
750
+ cta_coord=block_in_cluster_coord_mnk[1],
751
+ cta_layout=cute.make_layout(
752
+ cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
753
+ ),
754
+ src_tensor=gA_mk,
755
+ dst_tensor=sA,
809
756
  mcast_mask=a_mcast_mask,
810
757
  tma_desc_ptr=tma_desc_a_ptr,
811
758
  )
812
759
  else:
813
760
  tiled_copy_A = self._make_gmem_tiled_copy_A(
814
- mA_mkl.element_type, self.a_layout, self.num_ab_load_threads
761
+ mA_mkl.element_type, self.a_layout, self.num_ab_load_warps * 32
815
762
  )
816
763
  tidx = (
817
- cute.arch.thread_idx()[0]
818
- - self.mma_warp_groups * self.num_threads_per_warp_group
764
+ cute.arch.thread_idx()[0] - cute.arch.WARP_SIZE * self.ab_load_warp_id
819
765
  )
820
766
  thr_copy_A = tiled_copy_A.get_slice(tidx)
821
- # (atom_v, CPY_M, 1, STAGE)
822
- tAsA = thr_copy_A.partition_D(sA)
823
- if const_expr(varlen_m): # k-major
824
- assert tAsA.shape[2] == 1
825
- tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
826
- else: # varlen_k, m-major
827
- tAsA = cute.group_modes(tAsA, 0, 3)
828
- copy_A = partial(cute.copy, tiled_copy_A)
767
+ copy_A, prefetch_A = None, None
768
+ if const_expr(varlen_m):
769
+ copy_A = copy_utils.gather_m_get_copy_fn(
770
+ thr_copy_A,
771
+ mA_mk,
772
+ sA,
773
+ gAIdx,
774
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
775
+ limit_k=len_k,
776
+ )
777
+ else:
778
+ copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
779
+ thr_copy_A,
780
+ mA_mk,
781
+ sA,
782
+ gAIdx,
783
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
784
+ limit_k=len_k,
785
+ )
829
786
  # TMA load B partition_S/D
830
- b_cta_layout = cute.make_layout(
831
- cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
832
- )
833
- b_cta_crd = cluster_coord_mnk[0]
834
- # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
835
- tBsB, tBgB_k = cpasync.tma_partition(
787
+ copy_B, _, _ = copy_utils.tma_get_copy_fn(
836
788
  tma_atom_b,
837
- b_cta_crd,
838
- b_cta_layout,
839
- cute.group_modes(sB, 0, 2),
840
- cute.group_modes(gB_k, 0, 2),
841
- )
842
- copy_B = partial(
843
- cute.copy, tma_atom_b, mcast_mask=b_mcast_mask, tma_desc_ptr=tma_desc_b_ptr
789
+ cta_coord=block_in_cluster_coord_mnk[0],
790
+ cta_layout=cute.make_layout(
791
+ cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
792
+ ),
793
+ src_tensor=gB_nk,
794
+ dst_tensor=sB,
795
+ mcast_mask=b_mcast_mask,
796
+ tma_desc_ptr=tma_desc_b_ptr,
844
797
  )
845
- k_len = (
846
- cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
847
- if const_expr(varlen_k)
848
- else Int32(mA_mkl.shape[1])
849
- )
850
- k_tile_cnt = cute.ceil_div(k_len, self.cta_tile_shape_mnk[2])
798
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
851
799
  if const_expr(not self.gather_A):
852
800
  ab_producer_state = self.load_AB(
853
- ab_pipeline,
854
- ab_producer_state,
855
- copy_A,
856
- tAgA_k,
857
- tAsA,
858
- copy_B,
859
- tBgB_k,
860
- tBsB,
861
- k_tile_cnt,
801
+ ab_pipeline, ab_producer_state, copy_A, copy_B, k_tile_cnt
862
802
  )
863
803
  else:
864
- limit_m = (
865
- Int32(mA_mkl.shape[0])
866
- if const_expr(cu_seqlens_m is None)
867
- else cu_seqlens_m[batch_idx + 1] - cu_seqlens_m[batch_idx]
868
- )
869
804
  ab_producer_state = self.load_AB_gather_A(
870
805
  ab_pipeline,
871
806
  ab_producer_state,
872
- thr_copy_A,
873
- mA_mk,
874
- tAsA,
875
- gAIdx,
807
+ copy_A,
808
+ prefetch_A,
876
809
  copy_B,
877
- tBgB_k,
878
- tBsB,
879
810
  k_tile_cnt,
880
- limit_A=(
881
- limit_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
882
- k_len,
883
- ),
884
811
  varlen_m=varlen_m,
885
812
  )
886
813
  tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
@@ -900,22 +827,11 @@ class GemmSm90:
900
827
  (not self.pingpong and warp_idx == 0)
901
828
  or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
902
829
  )
903
- if const_expr(varlen_m):
904
- # initialize tensormap for D
905
- if const_expr(has_D):
906
- tensormap_manager.init_tensormap_from_atom(
907
- tma_atom_d,
908
- tensormap_d_ptr,
909
- is_manager_warp=is_tma_warp,
910
- )
911
- for tma_atom, tensormap_epi_ptr in zip(
912
- self.epi_get_tma_atoms(epilogue_params), tensormap_epi_ptrs
913
- ):
914
- tensormap_manager.init_tensormap_from_atom(
915
- tma_atom,
916
- tensormap_epi_ptr,
917
- is_manager_warp=is_tma_warp,
918
- )
830
+ varlen_manager.init_tensormap_epi(
831
+ tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
832
+ )
833
+ tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
834
+ tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
919
835
  # //////////////////////////////////////////////////////////////////////////////
920
836
  # Partition global tensor for TiledMMA_A/B/C
921
837
  # //////////////////////////////////////////////////////////////////////////////
@@ -974,9 +890,8 @@ class GemmSm90:
974
890
  if const_expr(not varlen_k):
975
891
  ab_read_state.advance_iters(k_tile_cnt_static)
976
892
  else:
977
- batch_idx = work_tile.tile_idx[3]
978
- k_len = cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
979
- k_tile_cnt = cute.ceil_div(k_len, self.cta_tile_shape_mnk[2])
893
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
894
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
980
895
  ab_read_state.advance_iters(k_tile_cnt)
981
896
  tile_scheduler.advance_to_next_work()
982
897
  if const_expr(varlen_k):
@@ -987,32 +902,22 @@ class GemmSm90:
987
902
  work_tile = tile_scheduler.initial_work_tile_info()
988
903
  if const_expr(varlen_m):
989
904
  # wait tensormap initialization complete before update
990
- tensormap_manager.fence_tensormap_initialization()
991
- # batch index of last tile
992
- last_batch_idx = cutlass.Int32(-1)
905
+ varlen_manager.fence_tensormap_init()
993
906
  while work_tile.is_valid_tile:
994
907
  tile_coord_mnkl = work_tile.tile_idx
995
908
  batch_idx = tile_coord_mnkl[3]
996
- if const_expr(varlen_m):
997
- is_group_changed = batch_idx != last_batch_idx
998
- last_batch_idx = batch_idx
999
- if is_group_changed:
1000
- self.tensormap_update_D_epi(
1001
- tensormap_manager,
1002
- tensormap_d_ptr,
1003
- tensormap_epi_ptrs,
1004
- epilogue_params,
1005
- cu_seqlens_m,
1006
- batch_idx,
1007
- is_manager_warp=is_tma_warp,
1008
- )
1009
-
1010
- k_len = (
1011
- cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
1012
- if const_expr(varlen_k)
1013
- else mA_mkl.shape[1]
909
+ epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
910
+ epilogue_params, varlen_params.cu_seqlens_m, batch_idx
911
+ )
912
+ varlen_manager.update_tensormap_epi(
913
+ batch_idx,
914
+ self.d_layout,
915
+ epi_shapes,
916
+ epi_orders,
917
+ is_tma_warp,
1014
918
  )
1015
- k_tile_cnt = cute.ceil_div(k_len, self.cta_tile_shape_mnk[2])
919
+ len_k = varlen_manager.len_k(batch_idx)
920
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
1016
921
  ab_read_state, tiled_mma = self.mma(
1017
922
  ab_pipeline,
1018
923
  ab_read_state,
@@ -1039,57 +944,38 @@ class GemmSm90:
1039
944
  num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
1040
945
  )
1041
946
 
1042
- tma_desc_d_ptr, tma_desc_epi_ptrs = None, [None] * self.num_epi_tensormaps
1043
- if const_expr(varlen_m):
1044
- # ensure the update to tensormap has completed before using it
1045
- if is_group_changed and is_tma_warp:
1046
- if const_expr(has_D):
1047
- tensormap_manager.fence_tensormap_update(tensormap_d_ptr)
1048
- for tensormap_epi_ptr in tensormap_epi_ptrs:
1049
- tensormap_manager.fence_tensormap_update(tensormap_epi_ptr)
1050
- if const_expr(has_D):
1051
- tma_desc_d_ptr = tensormap_manager.get_tensormap_ptr(
1052
- tensormap_d_ptr, cute.AddressSpace.generic
1053
- )
1054
- tma_desc_epi_ptrs = [
1055
- tensormap_manager.get_tensormap_ptr(
1056
- tensormap_epi_ptr, cute.AddressSpace.generic
1057
- )
1058
- for tensormap_epi_ptr in tensormap_epi_ptrs
1059
- ]
947
+ varlen_manager.fence_tensormap_update_epi(is_tma_warp)
1060
948
 
949
+ copy_D = None
1061
950
  if const_expr(has_D):
1062
- bSG_sD, bSG_gD = self.epilog_gmem_copy_and_partition(
951
+ copy_D, _, _ = self.epilog_gmem_copy_and_partition(
1063
952
  tma_atom_d,
1064
- mD_mnl,
953
+ varlen_manager.offset_batch_epi(mD_mnl, batch_idx),
1065
954
  self.cta_tile_shape_mnk[:2],
1066
955
  self.epi_tile,
1067
956
  sD,
1068
957
  tile_coord_mnkl,
1069
- cu_seqlens_m,
958
+ tma_desc_ptr=tma_desc_d_ptr,
1070
959
  )
1071
- copy_D = partial(cute.copy, tma_atom_d, tma_desc_ptr=tma_desc_d_ptr)
1072
- else:
1073
- bSG_sD, bSG_gD, copy_D = None, None, None
960
+ copy_C = None
1074
961
  if const_expr(has_C):
1075
- bGS_sC, bGS_gC = self.epilog_gmem_copy_and_partition(
962
+ copy_C_fn, _, _ = self.epilog_gmem_copy_and_partition(
1076
963
  tma_atom_c,
1077
- mC_mnl,
964
+ varlen_manager.offset_batch_epi(mC_mnl, batch_idx),
1078
965
  self.cta_tile_shape_mnk[:2],
1079
966
  self.epi_tile,
1080
967
  sC,
1081
968
  tile_coord_mnkl,
1082
- cu_seqlens_m,
1083
969
  )
1084
- copy_C = partial(cute.copy, tma_atom_c)
1085
- epi_load_g2s = partial(self.epi_load_g2s, epi_pipeline, copy_C, bGS_gC, bGS_sC)
1086
- else:
1087
- epi_load_g2s = None
970
+ copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)
1088
971
 
1089
972
  d_dtype_for_layout = self.d_dtype if self.d_dtype is not None else cutlass.BFloat16
1090
- tiled_copy_r2s, tRS_rAcc, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
1091
- tiled_mma, self.d_layout, d_dtype_for_layout, acc, sD, tidx
973
+ tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
974
+ tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
1092
975
  )
976
+ # (R2S, R2S_M, R2S_N)
977
+ tRS_rAcc = tiled_copy_r2s.retile(acc)
978
+ load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
1093
979
  if const_expr(has_C):
1094
980
  tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
1095
981
  tiled_mma, self.c_layout, self.c_dtype, sC, tRS_rD.layout, tidx
@@ -1112,21 +998,20 @@ class GemmSm90:
1112
998
  epi_store_pipeline,
1113
999
  epi_read_state,
1114
1000
  epi_producer_state,
1115
- tiled_mma,
1116
- tRS_rAcc,
1001
+ self.epi_tile,
1002
+ load_acc_subtile,
1117
1003
  tRS_rD,
1118
1004
  tRS_rC,
1005
+ None, # tiled_copy_t2r, for Sm100 only
1119
1006
  tiled_copy_r2s,
1120
1007
  tRS_sD,
1121
1008
  tiled_copy_s2r,
1122
1009
  tSR_rC,
1123
1010
  tSR_sC,
1124
1011
  copy_D,
1125
- bSG_sD,
1126
- bSG_gD,
1127
- epi_load_g2s,
1012
+ copy_C,
1128
1013
  tile_coord_mnkl,
1129
- cu_seqlens_m,
1014
+ varlen_manager,
1130
1015
  epilogue_barrier,
1131
1016
  tile_scheduler,
1132
1017
  tidx,
@@ -1157,9 +1042,8 @@ class GemmSm90:
1157
1042
  tile_scheduler.advance_to_next_work()
1158
1043
  work_tile = tile_scheduler.get_current_work()
1159
1044
  if work_tile.is_valid_tile:
1160
- batch_idx = work_tile.tile_idx[3]
1161
- k_len = cu_seqlens_k[batch_idx + 1] - cu_seqlens_k[batch_idx]
1162
- k_tile_cnt = cute.ceil_div(k_len, self.cta_tile_shape_mnk[2])
1045
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
1046
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
1163
1047
  ab_read_state.advance_iters(k_tile_cnt)
1164
1048
  tile_scheduler.advance_to_next_work()
1165
1049
  work_tile = tile_scheduler.get_current_work()
@@ -1175,14 +1059,16 @@ class GemmSm90:
1175
1059
  self,
1176
1060
  ab_pipeline: cutlass.pipeline.PipelineAsync,
1177
1061
  ab_producer_state: cutlass.pipeline.PipelineState,
1178
- copy_A: Callable,
1179
- tAgA: cute.Tensor,
1180
- tAsA: cute.Tensor,
1062
+ copy_A: Optional[Callable],
1181
1063
  copy_B: Callable,
1182
- tBgB: cute.Tensor,
1183
- tBsB: cute.Tensor,
1184
1064
  k_tile_cnt: Int32,
1065
+ # These are for Sm100 blockscaled gemm
1066
+ copy_SFA: Optional[Callable] = None,
1067
+ copy_SFB: Optional[Callable] = None,
1185
1068
  ) -> cutlass.pipeline.PipelineState:
1069
+ blockscaled = const_expr(copy_SFA is not None)
1070
+ if const_expr(blockscaled):
1071
+ assert copy_SFB is not None
1186
1072
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
1187
1073
  peek_ab_empty_status = Boolean(True)
1188
1074
  if 0 < k_tile_cnt:
@@ -1195,8 +1081,13 @@ class GemmSm90:
1195
1081
  # Also sets the transaction barrier for the A/B buffers
1196
1082
  ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status)
1197
1083
  tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
1198
- copy_A(tAgA[None, k_tile], tAsA[None, ab_producer_state.index], tma_bar_ptr=tma_bar_ptr)
1199
- copy_B(tBgB[None, k_tile], tBsB[None, ab_producer_state.index], tma_bar_ptr=tma_bar_ptr)
1084
+ smem_idx = ab_producer_state.index
1085
+ if const_expr(copy_A is not None):
1086
+ copy_A(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1087
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1088
+ if const_expr(blockscaled):
1089
+ copy_SFA(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1090
+ copy_SFB(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1200
1091
  # Mainloop pipeline's producer commit is a NOP
1201
1092
  ab_pipeline.producer_commit(ab_producer_state)
1202
1093
  ab_producer_state.advance()
@@ -1210,58 +1101,12 @@ class GemmSm90:
1210
1101
  self,
1211
1102
  ab_pipeline: cutlass.pipeline.PipelineAsync,
1212
1103
  ab_producer_state: cutlass.pipeline.PipelineState,
1213
- thr_copy_A: cute.core.ThrCopy,
1214
- mA: cute.Tensor, # (M, K) if varlen_m, (tile_M, K) if varlen_k
1215
- tAsA: cute.Tensor,
1216
- gAIdx: cute.Tensor, # (tile_M,) if varlen_m, (tile_K, RestK) if varlen_k
1104
+ copy_A: Callable,
1105
+ prefetch_A: Optional[Callable],
1217
1106
  copy_B: Callable,
1218
- tBgB: cute.Tensor,
1219
- tBsB: cute.Tensor,
1220
1107
  k_tile_cnt: Int32,
1221
- limit_A: Tuple[Int32, Int32],
1222
- varlen_m: bool,
1108
+ varlen_m: bool = True,
1223
1109
  ) -> cutlass.pipeline.PipelineState:
1224
- limit_m, limit_k = limit_A
1225
- # Do we need to check if we overshoot tile_M when we load A?
1226
- is_even_m_smem = self.cta_tile_shape_mnk[0] % thr_copy_A.tiler_mn[0].shape == 0
1227
- if const_expr(not is_even_m_smem):
1228
- limit_m = min(limit_m, self.cta_tile_shape_mnk[0])
1229
- elems_per_load = cute.size(tAsA.shape[0][0])
1230
- cA = cute.make_identity_tensor(cute.select(self.cta_tile_shape_mnk, [0, 2]))
1231
- tAcA = thr_copy_A.partition_S(cA)
1232
- t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
1233
- # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
1234
- # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
1235
- # This is so that when we do the comparison, t0AcA is known at compile time.
1236
- limit_m = limit_m - tAcA[0][0]
1237
- limit_k = limit_k - tAcA[0][1]
1238
- # Read indices for A
1239
- rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
1240
- cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
1241
- tApA_m = cute.make_fragment(rows_per_thread, Boolean)
1242
- for m in cutlass.range_constexpr(rows_per_thread):
1243
- tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
1244
- m_idx, k_idx, tAmA = None, None, None
1245
- if const_expr(varlen_m):
1246
- m_idx = cute.make_fragment(rows_per_thread, Int32)
1247
- for m in cutlass.range(rows_per_thread):
1248
- row_idx = tAcA[0, m, 0][0]
1249
- if tApA_m[m]:
1250
- m_idx[m] = gAIdx[row_idx]
1251
- else:
1252
- m_idx[m] = 0 # It's ok to load row 0 in the case of OOB
1253
- else:
1254
- k_idx = cute.make_fragment(cols_per_thread, Int32) # Will be read later
1255
- threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
1256
- # This is very convoluted but idk a better way
1257
- # for tile_M=128, flat_divide gives (8, 16, K),
1258
- # then logical_divide gives ((8, 1), (8, 2), K).
1259
- tidx = thr_copy_A.thr_idx
1260
- tAmA = cute.logical_divide(
1261
- cute.flat_divide(mA, (elems_per_load,)), (elems_per_load, threads_per_col)
1262
- )[None, (tidx % threads_per_col, None), None] # ((8, 1), 2, K)
1263
- # (m, (bK, RestK))
1264
- mA_k = cute.logical_divide(mA, (None, self.cta_tile_shape_mnk[2]))
1265
1110
  warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
1266
1111
  # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
1267
1112
  peek_ab_empty_status = Boolean(True)
@@ -1270,59 +1115,27 @@ class GemmSm90:
1270
1115
  # /////////////////////////////////////////////////////////////////////////
1271
1116
  # TMA load on B and cp.async on A
1272
1117
  # /////////////////////////////////////////////////////////////////////////
1273
- copy_A = partial(cute.copy, thr_copy_A)
1274
1118
  for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
1275
- if const_expr(not varlen_m): # Prefetch mAIdx early, even before smem is free
1276
- gAIdx_cur = gAIdx[None, k_tile]
1277
- for k in cutlass.range(cols_per_thread):
1278
- col_idx = tAcA[0, 0, k][1]
1279
- k_idx[k] = gAIdx_cur[col_idx]
1119
+ prefetch_out = ()
1120
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
1121
+ prefetch_out = (prefetch_A(k_tile),)
1280
1122
  # Wait for A/B buffers to be empty before loading into them
1281
1123
  # Also sets the transaction barrier for the A/B buffers
1282
1124
  # A tiny bit faster to rotate the warp that does TMA
1283
1125
  # However, for varlen_k, we must use the warp_idx == self.ab_load_warp_id
1284
1126
  # since that's the warp that does the tensormap update.
1285
- tma_warp_id = self.ab_load_warp_id + (
1127
+ is_tma_warp = warp_idx == self.ab_load_warp_id + (
1286
1128
  (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
1287
1129
  )
1288
- ab_pipeline.producer_acquire(
1289
- ab_producer_state,
1290
- peek_ab_empty_status,
1291
- is_tma_warp=warp_idx == tma_warp_id,
1292
- )
1293
- # A bit faster to load B first while we calculate the predicate for A
1294
- if warp_idx == tma_warp_id:
1295
- copy_B(
1296
- tBgB[None, k_tile],
1297
- tBsB[None, ab_producer_state.index],
1298
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1299
- )
1300
- # (m, bK)
1301
- if const_expr(varlen_m):
1302
- mA_cur = mA_k[None, (None, k_tile)]
1303
- for m in cutlass.range_constexpr(tAcA.shape[1]):
1304
- # cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,)) would give shape
1305
- # ((elems_per_load), thread_per_row)
1306
- # But we actually want shape ((elems_per_load, 1), thread_per_row) to match tAsA
1307
- # So we append 1s to the last dimension and then do tiled_divide, then slice.
1308
- mA_row = cute.tiled_divide(
1309
- cute.append_ones(mA_cur[m_idx[m], None], up_to_rank=2), (elems_per_load, 1)
1310
- )[None, None, 0]
1311
- if const_expr(is_even_m_smem) or tApA_m[m]:
1312
- # There's only 1 load per row
1313
- assert cute.size(tAcA.shape, mode=[2]) == 1
1314
- ki = tAcA[0, 0, 0][1] // elems_per_load
1315
- copy_A(mA_row[None, ki], tAsA[(None, m), ab_producer_state.index])
1316
- else:
1317
- for k in cutlass.range_constexpr(tAcA.shape[2]):
1318
- # copy_A(tAmA[None, None, k_idx[k]], tAsA[(None, None, k), ab_producer_state.index], pred=cute.prepend_ones(tApA_m, up_to_rank=2))
1319
- for m in cutlass.range_constexpr(tAcA.shape[1]):
1320
- if tApA_m[m]:
1321
- copy_A(
1322
- tAmA[None, m, k_idx[k]], tAsA[(None, m, k), ab_producer_state.index]
1323
- )
1130
+ ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
1131
+ smem_idx = ab_producer_state.index
1132
+ # A bit faster to load B first while we calculate the indices for A
1133
+ if is_tma_warp:
1134
+ tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
1135
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1136
+ copy_A(k_tile, smem_idx, *prefetch_out)
1324
1137
  # This tells mbarrier to track the completion of cp.async
1325
- ab_pipeline.producer_commit(ab_producer_state)
1138
+ ab_pipeline.producer_cpasync_commit(ab_producer_state)
1326
1139
  ab_producer_state.advance()
1327
1140
  peek_ab_empty_status = Boolean(True)
1328
1141
  if k_tile + 1 < k_tile_cnt:
@@ -1330,58 +1143,19 @@ class GemmSm90:
1330
1143
  # bound checking in the K dimension on the last k_tile
1331
1144
  if 0 < k_tile_cnt:
1332
1145
  k_tile = k_tile_cnt - 1
1333
- tApA_k = cute.make_fragment(cols_per_thread, Boolean)
1334
- limit_k -= k_tile * self.cta_tile_shape_mnk[2]
1335
- for k in cutlass.range_constexpr(cols_per_thread):
1336
- tApA_k[k] = t0AcA[0, 0, k][1] < limit_k
1337
- if const_expr(not varlen_m):
1338
- gAIdx_cur = gAIdx[None, k_tile]
1339
- for k in cutlass.range(cols_per_thread):
1340
- col_idx = tAcA[0, 0, k][1]
1341
- if tApA_k[k]:
1342
- k_idx[k] = gAIdx_cur[col_idx]
1343
- else:
1344
- k_idx[k] = -1
1345
- tma_warp_id = self.ab_load_warp_id + (
1146
+ prefetch_out = ()
1147
+ if const_expr(prefetch_A is not None): # Prefetch early, even before smem is free
1148
+ prefetch_out = (prefetch_A(k_tile, pred=True),)
1149
+ is_tma_warp = warp_idx == self.ab_load_warp_id + (
1346
1150
  (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
1347
1151
  )
1348
- ab_pipeline.producer_acquire(
1349
- ab_producer_state,
1350
- peek_ab_empty_status,
1351
- is_tma_warp=warp_idx == tma_warp_id,
1352
- )
1353
- if warp_idx == tma_warp_id:
1354
- copy_B(
1355
- tBgB[None, k_tile],
1356
- tBsB[None, ab_producer_state.index],
1357
- tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
1358
- )
1359
- if const_expr(varlen_m):
1360
- # (m, bK)
1361
- mA_cur = mA_k[None, (None, k_tile)]
1362
- for m in cutlass.range_constexpr(tAcA.shape[1]):
1363
- # ((elems_per_load, 1), thread_per_row)
1364
- mA_row = cute.tiled_divide(
1365
- cute.append_ones(mA_cur[m_idx[m], None], up_to_rank=2), (elems_per_load, 1)
1366
- )[None, None, 0]
1367
- if const_expr(is_even_m_smem) or tApA_m[k]:
1368
- # There's only 1 load per row
1369
- assert cute.size(tAcA.shape, mode=[2]) == 1
1370
- ki = tAcA[0, 0, 0][1] // elems_per_load
1371
- copy_A(
1372
- mA_row[None, ki], tAsA[(None, m), ab_producer_state.index], pred=tApA_k
1373
- )
1374
- else:
1375
- tApA_k = cute.prepend_ones(tApA_k, up_to_rank=2) # (1, cols_per_thread)
1376
- for k in cutlass.range_constexpr(tAcA.shape[2]):
1377
- for m in cutlass.range_constexpr(tAcA.shape[1]):
1378
- if tApA_m[m]:
1379
- copy_A(
1380
- tAmA[None, m, k_idx[k]],
1381
- tAsA[(None, m, k), ab_producer_state.index],
1382
- pred=tApA_k[None, k],
1383
- )
1384
- ab_pipeline.producer_commit(ab_producer_state)
1152
+ ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
1153
+ smem_idx = ab_producer_state.index
1154
+ if is_tma_warp:
1155
+ tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
1156
+ copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
1157
+ copy_A(k_tile, smem_idx, *prefetch_out, pred=True)
1158
+ ab_pipeline.producer_cpasync_commit(ab_producer_state)
1385
1159
  ab_producer_state.advance()
1386
1160
  return ab_producer_state
1387
1161
 
@@ -1481,22 +1255,21 @@ class GemmSm90:
1481
1255
  epi_pipeline: cutlass.pipeline.PipelineAsync,
1482
1256
  epi_store_pipeline: cutlass.pipeline.PipelineAsync,
1483
1257
  epi_read_state: cutlass.pipeline.PipelineState,
1484
- epi_producer_state: cutlass.pipeline.PipelineState,
1485
- tiled_mma: cute.TiledMma,
1486
- tRS_rAcc: cute.Tensor,
1258
+ epi_producer_state: Optional[cutlass.pipeline.PipelineState],
1259
+ epi_tile: cute.Tile,
1260
+ load_acc_subtile: Callable,
1487
1261
  tRS_rD: cute.Tensor,
1488
1262
  tRS_rC: Optional[cute.Tensor],
1489
- tiled_copy_r2s: cute.core.ThrCopy,
1263
+ tiled_copy_t2r: Optional[cute.TiledCopy], # Only for Sm100
1264
+ tiled_copy_r2s: cute.TiledCopy,
1490
1265
  tRS_sD: cute.Tensor,
1491
- tiled_copy_s2r: Optional[cute.core.ThrCopy],
1266
+ tiled_copy_s2r: Optional[cute.ThrCopy],
1492
1267
  tSR_rC: Optional[cute.Tensor],
1493
1268
  tSR_sC: Optional[cute.Tensor],
1494
1269
  copy_D: Optional[Callable],
1495
- bSG_sD: cute.Tensor,
1496
- bSG_gD: cute.Tensor,
1497
- epi_load_g2s: Optional[Callable],
1270
+ copy_C: Optional[Callable],
1498
1271
  tile_coord_mnkl: cute.Coord,
1499
- cu_seqlens_m: Optional[cute.Tensor],
1272
+ varlen_manager: VarlenManager,
1500
1273
  epilogue_barrier: cutlass.pipeline.NamedBarrier,
1501
1274
  tile_scheduler,
1502
1275
  tidx: Int32,
@@ -1504,22 +1277,61 @@ class GemmSm90:
1504
1277
  ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
1505
1278
  has_C = const_expr(tRS_rC is not None)
1506
1279
  has_D = const_expr(copy_D is not None)
1507
- # We iterate over epi tiles in the N dimension first before the M dimension
1508
1280
  epi_tile_shape = cute.zipped_divide(
1509
- cute.make_layout(self.cta_tile_shape_mnk[:2]), self.epi_tile
1281
+ cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
1510
1282
  ).shape[1]
1511
- epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
1283
+ # We iterate over epi tiles in the N dimension first before the M dimension
1284
+ epi_tile_layout = cute.make_ordered_layout(epi_tile_shape, order=(1, 0))
1512
1285
  epi_tile_num = cute.size(epi_tile_shape)
1513
1286
  num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
1514
1287
 
1515
- if const_expr(epi_load_g2s is not None):
1288
+ epi_tensors = self.epi_begin(
1289
+ params,
1290
+ epi_smem_tensors,
1291
+ epi_tile,
1292
+ tiled_copy_t2r,
1293
+ tiled_copy_r2s,
1294
+ tile_coord_mnkl,
1295
+ varlen_manager,
1296
+ epilogue_barrier,
1297
+ tidx,
1298
+ )
1299
+
1300
+ if const_expr(copy_C is not None):
1516
1301
  for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
1517
- epi_producer_state = epi_load_g2s(epi_producer_state, epi_idx, is_tma_warp)
1302
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
1303
+ if is_tma_warp:
1304
+ epi_pipeline.producer_acquire(epi_producer_state)
1305
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
1306
+ epi_pipeline.producer_commit(epi_producer_state)
1307
+ epi_producer_state.advance()
1518
1308
 
1309
+ def tma_store_fn(src_idx, dst_idx):
1310
+ # Fence and barrier to make sure shared memory store is visible to TMA store
1311
+ cute.arch.fence_proxy(
1312
+ cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
1313
+ )
1314
+ epilogue_barrier.arrive_and_wait()
1315
+ # Copy from shared memory to global memory
1316
+ if is_tma_warp:
1317
+ if const_expr(has_D):
1318
+ copy_D(src_idx=src_idx, dst_idx=dst_idx)
1319
+ # Can't use if statement here, epi_store_pipeline object isn't captured somehow
1320
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
1321
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
1322
+ epilogue_barrier.arrive_and_wait()
1323
+
1324
+ # We could delay the TMA store by 1 epi tile to better overlap the non-TMA ops
1325
+ # with the TMA store. However, currently this doesn't seem to improve perf.
1326
+ delay_tma_store = False
1327
+
1328
+ src_idx_prev, dst_idx_prev = None, None
1519
1329
  for epi_idx in cutlass.range_constexpr(epi_tile_num):
1330
+ # The global memory coordinate for the current epi tile
1331
+ gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
1520
1332
  # Copy from acc to D registers
1521
- for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
1522
- tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
1333
+ load_acc_subtile(tRS_rD, epi_idx)
1334
+ epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
1523
1335
  if const_expr(has_C):
1524
1336
  epi_pipeline.consumer_wait(epi_read_state)
1525
1337
  cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
@@ -1531,190 +1343,40 @@ class GemmSm90:
1531
1343
  with cute.arch.elect_one():
1532
1344
  epi_pipeline.consumer_release(epi_read_state)
1533
1345
  epi_read_state.advance()
1534
- if const_expr(epi_load_g2s is not None and epi_idx + self.epi_c_stage < epi_tile_num):
1535
- epi_producer_state = epi_load_g2s(
1536
- epi_producer_state, epi_idx + self.epi_c_stage, is_tma_warp
1537
- )
1538
- tRS_rEpi = self.epi_visit_acc_subtile(params, tRS_rD, tRS_rC)
1346
+ if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
1347
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
1348
+ if is_tma_warp:
1349
+ epi_pipeline.producer_acquire(epi_producer_state)
1350
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
1351
+ epi_pipeline.producer_commit(epi_producer_state)
1352
+ epi_producer_state.advance()
1353
+ tRS_rEpi = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
1539
1354
  epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
1355
+ if const_expr(delay_tma_store):
1356
+ if const_expr(epi_idx > 0):
1357
+ tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
1358
+ src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
1540
1359
  # Copy from D registers to shared memory
1541
1360
  if const_expr(has_D):
1542
- # Type conversion
1543
- tRS_rD_out = cute.make_fragment_like(tRS_rD, self.d_dtype)
1544
- tRS_rD_out.store(tRS_rD.load().to(self.d_dtype))
1545
- cute.copy(tiled_copy_r2s, tRS_rD_out, tRS_sD[None, None, None, epi_buffer])
1546
- # Fence and barrier to make sure shared memory store is visible to TMA store
1547
- cute.arch.fence_proxy(
1548
- cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
1549
- )
1550
- epilogue_barrier.arrive_and_wait()
1551
- # Get the global memory coordinate for the current epi tile
1552
- gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
1553
- # Copy from shared memory to global memory
1554
- if is_tma_warp:
1555
- if const_expr(has_D):
1556
- copy_D(bSG_sD[None, epi_buffer], bSG_gD[None, gmem_coord])
1557
- epi_store_pipeline.producer_commit()
1558
- epi_store_pipeline.producer_acquire()
1559
- epilogue_barrier.arrive_and_wait()
1560
-
1561
- return epi_read_state, epi_producer_state
1562
-
1563
- @cute.jit
1564
- def epi_load_g2s(
1565
- self,
1566
- epi_pipeline: cutlass.pipeline.PipelineAsync,
1567
- copy_C: Callable,
1568
- bGS_gC: cute.Tensor,
1569
- bGS_sC: cute.Tensor,
1570
- epi_producer_state: cutlass.pipeline.PipelineState,
1571
- epi_idx: Int32,
1572
- should_load: Boolean,
1573
- ) -> cutlass.pipeline.PipelineState:
1574
- # We iterate over epi tiles in the N dimension first before the M dimension
1575
- epi_tile_layout = cute.make_layout(bGS_gC.shape[1], stride=(bGS_gC.shape[1][1], 1))
1576
- if should_load:
1577
- epi_pipeline.producer_acquire(epi_producer_state)
1578
- # Get the global memory coordinate for the current epi tile
1579
- gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
1580
- copy_C(
1581
- bGS_gC[None, gmem_coord],
1582
- bGS_sC[None, epi_producer_state.index],
1583
- tma_bar_ptr=epi_pipeline.producer_get_barrier(epi_producer_state),
1584
- )
1585
- # Epi pipeline's producer commit is a NOP
1586
- epi_pipeline.producer_commit(epi_producer_state)
1587
- epi_producer_state.advance()
1588
- return epi_producer_state
1589
-
1590
- def epi_visit_acc_subtile(
1591
- self,
1592
- params: EpilogueParams,
1593
- tRS_rD: cute.Tensor,
1594
- tRS_rC: Optional[cute.Tensor] = None,
1595
- ) -> Optional[cute.Tensor]:
1596
- # Apply alpha scaling to accumulator if alpha is provided (not None)
1597
- if const_expr(hasattr(params, "alpha") and params.alpha is not None):
1598
- alpha = utils.load_scalar_or_pointer(params.alpha)
1599
- tRS_rD.store(tRS_rD.load() * alpha)
1600
- # Apply C with beta scaling
1601
- if const_expr(tRS_rC is not None):
1602
- if const_expr(not hasattr(params, "beta") or params.beta is None):
1603
- # beta is None, default behavior: add C (beta=1.0)
1604
- tRS_rD.store(tRS_rD.load() + tRS_rC.load().to(tRS_rD.element_type))
1605
- else:
1606
- beta = utils.load_scalar_or_pointer(params.beta)
1607
- tRS_rD.store(tRS_rD.load() + beta * tRS_rC.load().to(tRS_rD.element_type))
1608
- return None
1609
-
1610
- def tensormap_init(
1611
- self,
1612
- tensormaps: Optional[cute.Tensor],
1613
- varlen_m: bool,
1614
- varlen_k: bool,
1615
- has_D: bool,
1616
- warp_idx: Int32,
1617
- ):
1618
- tensormap_manager = None
1619
- tensormap_a_ptr, tensormap_b_ptr, tensormap_d_ptr = None, None, None
1620
- tensormap_epi_ptrs = [None] * self.num_epi_tensormaps
1621
- if const_expr(varlen_m or varlen_k):
1622
- tensormap_manager = TensorMapManagerSm90(
1623
- cutlass.utils.TensorMapUpdateMode.GMEM, self.__class__.bytes_per_tensormap
1624
- )
1625
- # equivalent to bidx + bidy * gridDim.x + bidxz * gridDim.x * gridDim.y
1626
- tensormap_workspace_idx = cute.make_layout(cute.arch.grid_dim())(cute.arch.block_idx())
1627
- if const_expr(varlen_m):
1628
- tensormap_d_idx = warp_idx // 4 if const_expr(self.pingpong) else 0
1629
- tensormap_epi_offset = tensormap_d_idx
1630
- if const_expr(has_D):
1631
- tensormap_d_ptr = tensormap_manager.get_tensormap_ptr(
1632
- tensormaps[tensormap_workspace_idx, tensormap_d_idx, None].iterator
1633
- )
1634
- tensormap_epi_offset += 1 if not self.pingpong else 2
1635
- tensormap_epi_ptrs = [
1636
- tensormap_manager.get_tensormap_ptr(
1637
- tensormaps[
1638
- tensormap_workspace_idx,
1639
- tensormap_epi_offset + i * (1 if not self.pingpong else 2),
1640
- None,
1641
- ].iterator
1642
- )
1643
- for i in range(self.num_epi_tensormaps)
1644
- ]
1645
- else:
1646
- assert varlen_k
1647
- if const_expr(not self.gather_A):
1648
- tensormap_a_ptr = tensormap_manager.get_tensormap_ptr(
1649
- tensormaps[tensormap_workspace_idx, 0, None].iterator
1650
- )
1651
- tensormap_b_ptr = tensormap_manager.get_tensormap_ptr(
1652
- tensormaps[
1653
- tensormap_workspace_idx, 1 if not self.gather_A else 0, None
1654
- ].iterator
1655
- )
1656
- tensormap_ab_ptrs = [tensormap_a_ptr, tensormap_b_ptr]
1657
- return (
1658
- tensormap_manager,
1659
- tensormap_ab_ptrs,
1660
- tensormap_d_ptr,
1661
- tensormap_epi_ptrs,
1361
+ copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
1362
+ if const_expr(not delay_tma_store):
1363
+ tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)
1364
+
1365
+ if const_expr(delay_tma_store):
1366
+ tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
1367
+
1368
+ self.epi_end(
1369
+ params,
1370
+ epi_tensors,
1371
+ epi_tile,
1372
+ tiled_copy_t2r,
1373
+ tiled_copy_r2s,
1374
+ tile_coord_mnkl,
1375
+ varlen_manager,
1376
+ tidx,
1662
1377
  )
1663
1378
 
1664
- def tensormap_update_AB(
1665
- self,
1666
- tensormap_manager,
1667
- tensormap_ab_ptrs,
1668
- cu_seqlens_k: cute.Tensor,
1669
- batch_idx: Int32,
1670
- is_manager_warp: bool | Boolean,
1671
- ) -> None:
1672
- # construct tensor A/B based on real address, shape and stride information
1673
- tensormap_a_ptr, tensormap_b_ptr = tensormap_ab_ptrs
1674
- tensormap_ptrs = [tensormap_b_ptr]
1675
- shapes = [cu_seqlens_k[batch_idx + 1]]
1676
- orders = [0 if const_expr(self.b_layout == LayoutEnum.ROW_MAJOR) else 1]
1677
- if const_expr(not self.gather_A):
1678
- tensormap_ptrs.insert(0, tensormap_a_ptr)
1679
- shapes.insert(0, cu_seqlens_k[batch_idx + 1])
1680
- orders.insert(0, 0 if const_expr(self.a_layout == LayoutEnum.ROW_MAJOR) else 1)
1681
- tensormap_manager.update_tensormap_shape(
1682
- tensormap_ptrs,
1683
- is_manager_warp=is_manager_warp,
1684
- shapes=shapes,
1685
- orders=orders,
1686
- tensormap_smem_ptr=None,
1687
- )
1688
-
1689
- def tensormap_update_D_epi(
1690
- self,
1691
- tensormap_manager,
1692
- tensormap_d_ptr,
1693
- tensormap_epi_ptrs,
1694
- epilogue_params: EpilogueParams,
1695
- cu_seqlens_m: cute.Tensor,
1696
- batch_idx: Int32,
1697
- is_manager_warp: bool | Boolean,
1698
- ) -> None:
1699
- # construct tensor D based on real address, shape and stride information
1700
- tensormap_ptrs, shapes, orders = [], [], []
1701
- if const_expr(tensormap_d_ptr is not None):
1702
- tensormap_ptrs.append(tensormap_d_ptr)
1703
- shapes.append(cu_seqlens_m[batch_idx + 1])
1704
- orders.append(0 if const_expr(self.d_layout.is_m_major_c()) else 1)
1705
- epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
1706
- epilogue_params, cu_seqlens_m, batch_idx
1707
- )
1708
- tensormap_ptrs.extend(tensormap_epi_ptrs)
1709
- shapes.extend(epi_shapes)
1710
- orders.extend(epi_orders)
1711
- tensormap_manager.update_tensormap_shape(
1712
- tensormap_ptrs,
1713
- is_manager_warp=is_manager_warp,
1714
- shapes=shapes,
1715
- orders=orders,
1716
- tensormap_smem_ptr=None,
1717
- )
1379
+ return epi_read_state, epi_producer_state
1718
1380
 
1719
1381
  def get_scheduler_class(self, varlen_m: bool = False):
1720
1382
  """Return the scheduler class to use. Override in subclasses for custom schedulers."""
@@ -1773,6 +1435,40 @@ class GemmSm90:
1773
1435
  )
1774
1436
  return tile_sched_args
1775
1437
 
1438
+ @cute.jit
1439
+ def epi_load_acc_subtile(self, tRS_rAcc: cute.Tensor, tRS_rD: cute.Tensor, epi_idx: int):
1440
+ for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
1441
+ tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
1442
+
1443
+ @cute.jit
1444
+ def epi_begin(
1445
+ self,
1446
+ params: EpilogueParams,
1447
+ epi_smem_tensors: Tuple[cute.Tensor, ...],
1448
+ epi_tile: cute.Tile,
1449
+ tiled_copy_t2r: Optional[cute.TiledCopy],
1450
+ tiled_copy_r2s: cute.TiledCopy,
1451
+ tile_coord_mnkl: cute.Coord,
1452
+ varlen_manager: VarlenManager,
1453
+ epilogue_barrier: cutlass.pipeline.NamedBarrier,
1454
+ tidx: Int32,
1455
+ ) -> Tuple[cute.Tensor, ...]:
1456
+ return ()
1457
+
1458
+ def epi_begin_loop(
1459
+ self, params: EpilogueParams, epi_tensors: Tuple[cute.Tensor, ...], epi_coord: cute.Coord
1460
+ ) -> Tuple[cute.Tensor, ...]:
1461
+ return ()
1462
+
1463
+ def epi_visit_subtile(
1464
+ self,
1465
+ params: EpilogueParams,
1466
+ epi_loop_tensors: Tuple[cute.Tensor, ...],
1467
+ tRS_rD: cute.Tensor,
1468
+ tRS_rC: Optional[cute.Tensor] = None,
1469
+ ) -> Optional[cute.Tensor]:
1470
+ return None
1471
+
1776
1472
  def epi_visit_acc(
1777
1473
  self,
1778
1474
  params: EpilogueParams,
@@ -1783,10 +1479,24 @@ class GemmSm90:
1783
1479
  ) -> None:
1784
1480
  pass
1785
1481
 
1482
+ @cute.jit
1483
+ def epi_end(
1484
+ self,
1485
+ params: EpilogueParams,
1486
+ epi_tensors: Tuple[cute.Tensor, ...],
1487
+ epi_tile: cute.Tile,
1488
+ tiled_copy_t2r: Optional[cute.TiledCopy],
1489
+ tiled_copy_r2s: cute.TiledCopy,
1490
+ tile_coord_mnkl: cute.Coord,
1491
+ varlen_manager,
1492
+ tidx,
1493
+ ) -> None:
1494
+ pass
1495
+
1786
1496
  def epi_to_underlying_arguments(
1787
1497
  self, args: EpilogueArguments, *, loc=None, ip=None
1788
1498
  ) -> EpilogueParams:
1789
- return GemmSm90.EpilogueParams(alpha=args.alpha, beta=args.beta)
1499
+ return self.EpilogueParams()
1790
1500
 
1791
1501
  def epi_get_tma_atoms(
1792
1502
  self, params: EpilogueParams, *, loc=None, ip=None
@@ -1810,12 +1520,12 @@ class GemmSm90:
1810
1520
  def epi_smem_bytes_per_stage(
1811
1521
  args: Optional[EpilogueArguments],
1812
1522
  cta_tile_shape_mnk: Tuple[int, int, int],
1813
- epi_tile: Tuple[int, int],
1523
+ epi_tile: cute.Tile,
1814
1524
  ) -> int:
1815
1525
  return 0
1816
1526
 
1817
1527
  def epi_get_smem_struct(self, params: EpilogueParams):
1818
- return cute.struct.MemRange[cutlass.Int32, 0] # Dummy struct
1528
+ return cute.struct.MemRange[Int32, 0] # Dummy struct
1819
1529
 
1820
1530
  def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
1821
1531
  return tuple()
@@ -1842,7 +1552,7 @@ class GemmSm90:
1842
1552
  self.d_layout.is_m_major_c() if self.d_layout is not None else False,
1843
1553
  num_matrices=4 if self.epi_tile[1] % 16 == 0 else 2,
1844
1554
  ),
1845
- cutlass.Float16, # this is just to get the right source layout
1555
+ Float16, # this is just to get the right source layout
1846
1556
  )
1847
1557
  tiled_copy_C_atom = cute.make_tiled_copy_C_atom(copy_atom_C, tiled_mma)
1848
1558
  return tiled_copy_C_atom
@@ -1852,8 +1562,7 @@ class GemmSm90:
1852
1562
  tiled_mma: cute.TiledMma,
1853
1563
  d_layout: Optional[LayoutEnum],
1854
1564
  dtype: Type[cutlass.Numeric],
1855
- acc: cute.Tensor,
1856
- sD: cute.Tensor,
1565
+ sD: Optional[cute.Tensor],
1857
1566
  tidx: Int32,
1858
1567
  ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1859
1568
  if d_layout is None:
@@ -1868,12 +1577,10 @@ class GemmSm90:
         # (R2S, R2S_M, R2S_N, PIPE_D)
         thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
         tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
-        # (R2S, R2S_M, R2S_N)
-        tRS_rAcc = tiled_copy_r2s.retile(acc)
         sD_shape = sD.shape[:2] if sD is not None else self.epi_tile
         tRS_rD_shape = thr_copy_r2s.partition_S(cute.make_identity_tensor(sD_shape)).shape
         tRS_rD = cute.make_fragment(tRS_rD_shape, self.acc_dtype)
-        return tiled_copy_r2s, tRS_rAcc, tRS_rD, tRS_sD
+        return tiled_copy_r2s, tRS_rD, tRS_sD
 
     def epilog_smem_load_and_partition(
         self,
@@ -1885,7 +1592,7 @@ class GemmSm90:
         tidx: Int32,
     ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
         tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
-        copy_atom_s2r = utils.sm90_get_smem_load_op(c_layout, dtype)
+        copy_atom_s2r = copy_utils.sm90_get_smem_load_op(c_layout, dtype)
         tiled_copy_s2r = cute.make_tiled_copy_S(copy_atom_s2r, tiled_copy_C_atom)
         thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
         tSR_sC = thr_copy_s2r.partition_S(sC)
@@ -1896,29 +1603,30 @@ class GemmSm90:
     def epilog_gmem_copy_and_partition(
         self,
         atom: Union[cute.CopyAtom, cute.TiledCopy],
-        mD_mnl: cute.Tensor,
+        mD_mn: cute.Tensor,
         tile_shape_mn: cute.Tile,
         epi_tile: cute.Tile,
         sD: cute.Tensor,
         tile_coord_mnkl: cute.Coord,
-        cu_seqlens_m: Optional[cute.Tensor] = None,
+        tma_desc_ptr: Optional[cute.Pointer] = None,
     ) -> Tuple[cute.Tensor, cute.Tensor]:
-        batch_idx = tile_coord_mnkl[3]
-        if const_expr(cu_seqlens_m is not None):
-            mD_mn = cute.domain_offset((cu_seqlens_m[batch_idx], 0), mD_mnl)
-        else:
-            mD_mn = mD_mnl[None, None, batch_idx]
         # (bM, bN)
         gD = cute.local_tile(mD_mn, tile_shape_mn, tile_coord_mnkl[:2])
         tDgD_for_tma_partition = cute.zipped_divide(gD, epi_tile)
-        bSG_sD, bSG_gD = cpasync.tma_partition(
+        is_s2g = isinstance(
+            atom.op, (cpasync.CopyBulkTensorTileS2GOp, cpasync.CopyReduceBulkTensorTileS2GOp)
+        )
+        src_tensor, dst_tensor = (
+            (sD, tDgD_for_tma_partition) if is_s2g else (tDgD_for_tma_partition, sD)
+        )
+        return copy_utils.tma_get_copy_fn(
             atom,
-            0,
-            cute.make_layout(1),
-            cute.group_modes(sD, 0, 2),
-            tDgD_for_tma_partition,
+            cta_coord=0,
+            cta_layout=cute.make_layout(1),
+            src_tensor=src_tensor,
+            dst_tensor=dst_tensor,
+            tma_desc_ptr=tma_desc_ptr,
         )
-        return bSG_sD, bSG_gD
 
     def make_ab_pipeline(
         self,
@@ -1927,21 +1635,15 @@ class GemmSm90:
         ab_pipeline_mbar_ptr: cute.Pointer,
     ):
         # Threads/warps participating in this pipeline
-        producer_cnt = 1 if const_expr(not self.gather_A) else 1 + self.num_ab_load_threads
+        producer_cnt = 1 if const_expr(not self.gather_A) else 1 + self.num_ab_load_warps * 32
         ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, producer_cnt)
         # Each warp will contribute to the arrive count with the number of mcast size
         mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
-        consumer_arrive_cnt = mcast_size
-        if const_expr(self.arch != 100):
-            consumer_arrive_cnt *= tiled_mma.size // cute.arch.WARP_SIZE
+        consumer_arrive_cnt = mcast_size * tiled_mma.size // cute.arch.WARP_SIZE
         ab_pipeline_consumer_group = pipeline.CooperativeGroup(
             pipeline.Agent.Thread, consumer_arrive_cnt
         )
-        if const_expr(self.arch != 100):
-            pipeline_cls = pipeline.PipelineTmaAsync if not self.gather_A else PipelineTmaCpAsync
-        else:
-            # TODO: we need a pipeline class for TMACpAsyncUMMA
-            pipeline_cls = pipeline.PipelineTmaUmma if not self.gather_A else PipelineTmaCpAsync
+        pipeline_cls = pipeline.PipelineTmaAsync if not self.gather_A else PipelineTmaCpAsync
         return pipeline_cls.create(
             barrier_storage=ab_pipeline_mbar_ptr,
             num_stages=self.ab_stage,
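For orientation, the arrive-count bookkeeping in the rewritten make_ab_pipeline above can be sanity-checked with plain arithmetic. The cluster split, load-warp count, and warpgroup count below are assumed example values, not taken from this diff:

    # Hedged sketch of the producer/consumer counts computed in make_ab_pipeline above.
    # All concrete values here are assumptions chosen only to make the arithmetic visible.
    WARP_SIZE = 32

    gather_A = True
    num_ab_load_warps = 4                       # assumed cp.async load warps when gather_A
    producer_cnt = 1 if not gather_A else 1 + num_ab_load_warps * 32   # TMA thread + load threads

    num_mcast_ctas_a, num_mcast_ctas_b = 2, 1   # assumed multicast extents for a 2x1 cluster
    mcast_size = num_mcast_ctas_a + num_mcast_ctas_b - 1               # = 2
    tiled_mma_size = 2 * 128                    # assumed two consumer warpgroups (256 threads)
    consumer_arrive_cnt = mcast_size * tiled_mma_size // WARP_SIZE     # = 16 warp-level arrivals

    print(producer_cnt, consumer_arrive_cnt)    # 129 16
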
@@ -1973,9 +1675,7 @@ class GemmSm90:
     def make_epi_store_pipeline(self):
         # Threads/warps participating in tma store pipeline
         num_epi_threads = self.num_epi_warps * cute.arch.WARP_SIZE
-        epi_store_producer_group = pipeline.CooperativeGroup(
-            pipeline.Agent.Thread, num_epi_threads, num_epi_threads
-        )
+        epi_store_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, num_epi_threads)
         return pipeline.PipelineTmaStore.create(
             num_stages=self.epi_stage, producer_group=epi_store_producer_group
         )
@@ -2182,36 +1882,18 @@ class GemmSm90:
             order=(0, 1, 2) if b_is_k_major else (1, 0, 2),
         )
 
+        epi_smem_layout_staged = None
         if d_dtype is not None:
-            d_smem_shape = epi_tile
-            d_major_mode_size = epi_tile[1] if d_layout.is_n_major_c() else epi_tile[0]
-            d_smem_layout_atom = warpgroup.make_smem_layout_atom(
-                sm90_utils.get_smem_layout_atom(d_layout, d_dtype, d_major_mode_size),
-                d_dtype,
-            )
-            epi_smem_layout_staged = cute.tile_to_shape(
-                d_smem_layout_atom,
-                cute.append(d_smem_shape, epi_stage),
-                order=(1, 0, 2) if d_layout.is_m_major_c() else (0, 1, 2),
+            epi_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
+                d_dtype, d_layout, epi_tile, epi_stage
             )
-        else:
-            epi_smem_layout_staged = None
 
+        epi_c_smem_layout_staged = None
         if c_dtype is not None:
             assert c_layout is not None
-            c_smem_shape = epi_tile
-            c_major_mode_size = epi_tile[1] if c_layout.is_n_major_c() else epi_tile[0]
-            c_smem_layout_atom = warpgroup.make_smem_layout_atom(
-                sm90_utils.get_smem_layout_atom(c_layout, c_dtype, c_major_mode_size),
-                c_dtype,
-            )
-            epi_c_smem_layout_staged = cute.tile_to_shape(
-                c_smem_layout_atom,
-                cute.append(c_smem_shape, epi_c_stage),
-                order=(1, 0, 2) if c_layout.is_m_major_c() else (0, 1, 2),
+            epi_c_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
+                c_dtype, c_layout, epi_tile, epi_c_stage
             )
-        else:
-            epi_c_smem_layout_staged = None
 
         return (
             a_smem_layout_staged,
@@ -2349,7 +2031,7 @@ class GemmSm90:
         """
         is_valid = True
         if a_dtype not in {
-            cutlass.Float16,
+            Float16,
             cutlass.BFloat16,
             cutlass.Float8E4M3FN,
             cutlass.Float8E5M2,
@@ -2357,19 +2039,19 @@ class GemmSm90:
             is_valid = False
         # tested b_dtype
         if b_dtype not in {
-            cutlass.Float16,
+            Float16,
             cutlass.BFloat16,
             cutlass.Float8E4M3FN,
             cutlass.Float8E5M2,
         }:
             is_valid = False
-        if acc_dtype not in {cutlass.Float32, cutlass.Float16}:
+        if acc_dtype not in {Float32, Float16}:
             is_valid = False
         # tested d_dtype
         if d_dtype not in {
             None,
-            cutlass.Float32,
-            cutlass.Float16,
+            Float32,
+            Float16,
             cutlass.BFloat16,
             cutlass.Float8E4M3FN,
             cutlass.Float8E5M2,
@@ -2386,155 +2068,3 @@ class GemmSm90:
     if (a_dtype.width == 8 and a_major != "k") or (b_dtype.width == 8 and b_major != "k"):
         is_valid = False
     return is_valid
-
-
-def gemm_sm90(
-    # (l, m, k) or (total_m, k) if varlen_m or (m, total_k) if varlen_k or (whatever, k) if gather_A_varlen_m or (m, whatever) if gather_A_varlen_k
-    A: Tensor,
-    B: Tensor,  # (l, n, k) or (n, total_k) if varlen_k
-    D: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
-    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
-    tile_count_semaphore: Optional[Tensor],  # (1,)
-    tile_M: int,
-    tile_N: int,
-    cluster_M: int,
-    cluster_N: int,
-    pingpong: bool = False,
-    persistent: bool = True,
-    alpha: float | Tensor = 1.0,
-    beta: float | Tensor = 1.0,
-    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
-    cu_seqlens_k: Optional[Tensor] = None,  # (l+1,) cumulative sum of k values for variable length
-    A_idx: Optional[Tensor] = None,  # (total_m,) or (total_k,) indices for gather_A when varlen
-    batch_idx_permute: Optional[Tensor] = None,  # (l,) permutation of batch indices for scheduler
-    add_to_output: bool = False,
-) -> None:
-    varlen = cu_seqlens_m is not None or cu_seqlens_k is not None
-    assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
-        "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
-    )
-    gather_A = A_idx is not None
-    if gather_A:
-        assert varlen, "gather_A requires varlen (cu_seqlens_m or cu_seqlens_k must be specified)"
-        assert cluster_N == 1, "gather_A requires cluster_N=1"
-    if varlen:
-        assert persistent, "varlen requires persistent=True"
-    if add_to_output:
-        assert cu_seqlens_m is None, "Add to output not supported with varlen_m"
-    if cu_seqlens_m is not None:
-        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
-        assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
-    if cu_seqlens_k is not None:
-        assert A.stride(-2) == 1, "varlen_k requires A to be m-major"
-        assert B.stride(-2) == 1, "varlen_k requires B to be n-major"
-
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, D, C, cu_seqlens_m=cu_seqlens_m, cu_seqlens_k=cu_seqlens_k, A_idx=A_idx
-    )
-    GemmWrapperBase.permute_tensors(
-        tensor_infos, varlen_m=cu_seqlens_m is not None, varlen_k=cu_seqlens_k is not None
-    )
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-
-    acc_dtype = cutlass.Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmSm90.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-
-    def scalar_arg(scalar: float | Tensor):
-        if isinstance(scalar, float):
-            return Float32(scalar) if scalar != 1.0 else None
-        else:
-            assert isinstance(scalar, Tensor)
-            return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
-
-    epi_args = GemmSm90.EpilogueArguments(scalar_arg(alpha), scalar_arg(beta), add_to_output)
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters,
-        tile_count_semaphore,
-        batch_idx_permute,
-    )
-
-    # Create varlen arguments if needed (assumes persistent=True when varlen)
-    varlen_args = GemmWrapperBase.create_varlen_args(
-        cu_seqlens_m,
-        cu_seqlens_k,
-        A_idx,
-        max_active_clusters,
-        cluster_shape_mnk,
-        tensor_infos,
-        GemmSm90.num_epi_tensormaps,
-        pingpong,
-    )
-
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        None,
-        tile_shape_mn,
-        cluster_shape_mnk,
-        pingpong,
-        persistent,
-        tile_count_semaphore is not None,
-        2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
-        2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
-        add_to_output,
-        cu_seqlens_m is not None,
-        cu_seqlens_k is not None,
-        gather_A,
-        batch_idx_permute is not None,
-        key_tensor_names=("A", "B", "D", "C"),
-    )
-    cache = gemm_sm90.compile_cache
-    if compile_key not in cache:
-        gemm = GemmSm90(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            pingpong=pingpong,
-            is_persistent=persistent,
-            gather_A=gather_A,
-        )
-        cache[compile_key] = cute.compile(
-            gemm,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            varlen_args,
-            current_stream,
-        )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        varlen_args,
-        current_stream,
-    )
-
-
-gemm_sm90.compile_cache = {}
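The is_valid_dtypes checks above can be exercised directly, as the removed gemm_sm90 wrapper did before constructing the kernel. The import path and the specific dtype/major combinations below are assumptions for illustration:

    # Hedged sketch: probing GemmSm90.is_valid_dtypes with sample arguments
    # (a_dtype, b_dtype, acc_dtype, d_dtype, a_major, b_major), as the removed wrapper called it.
    import cutlass
    from cutlass import Float16, Float32

    from quack.gemm_sm90 import GemmSm90  # assumed module path

    # fp16 inputs, fp32 accumulation, bf16 output, both operands k-major:
    # expected to pass the dtype checks shown above.
    print(GemmSm90.is_valid_dtypes(Float16, Float16, Float32, cutlass.BFloat16, "k", "k"))

    # An 8-bit (fp8) A operand that is not k-major is rejected by the width == 8 check above.
    print(GemmSm90.is_valid_dtypes(
        cutlass.Float8E4M3FN, cutlass.Float8E4M3FN, Float32, cutlass.BFloat16, "m", "k"
    ))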