quack-kernels 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- quack/__init__.py +1 -8
- quack/activation.py +366 -121
- quack/broadcast_utils.py +29 -0
- quack/compile_utils.py +19 -0
- quack/copy_utils.py +487 -0
- quack/cross_entropy.py +157 -233
- quack/cute_dsl_utils.py +20 -34
- quack/gemm.py +194 -0
- quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
- quack/gemm_config.py +72 -46
- quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
- quack/gemm_default_epi.py +259 -0
- quack/gemm_interface.py +177 -31
- quack/gemm_sm100.py +729 -506
- quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
- quack/gemm_symmetric.py +330 -0
- quack/gemm_wrapper_utils.py +3 -1
- quack/layout_utils.py +287 -0
- quack/linear.py +24 -16
- quack/pipeline.py +158 -3
- quack/reduce.py +88 -49
- quack/reduction_base.py +25 -36
- quack/rmsnorm.py +476 -526
- quack/sm100_utils.py +62 -0
- quack/sm90_utils.py +127 -0
- quack/softmax.py +135 -203
- quack/sort/bitonic_sort.py +13 -10
- quack/sort/utils.py +6 -6
- quack/tile_scheduler.py +23 -16
- quack/topk.py +409 -85
- quack/utils.py +32 -220
- quack/varlen_utils.py +370 -1
- {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
- quack_kernels-0.2.3.dist-info/RECORD +44 -0
- quack/layernorm.py +0 -353
- quack/symmetric_dense_gemm_sm90.py +0 -2091
- quack_kernels-0.2.2.dist-info/RECORD +0 -37
- {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
- {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
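Note: several modules lose their architecture suffix in this release (dense_gemm_sm90.py → gemm_sm90.py, gemm_act_sm90.py → gemm_act.py, gemm_dact_sm90.py → gemm_dact.py), while layernorm.py and symmetric_dense_gemm_sm90.py are removed and gemm.py, gemm_symmetric.py, copy_utils.py, and layout_utils.py are added. Callers that import the old module paths will likely need the new names. A hedged caller-side sketch, assuming the class names themselves are unchanged (the new path is exactly what quack/gemm_sm100.py imports in the diff below; the old path is an assumption):

    # before (0.2.2) -- assumed old module path
    # from quack.dense_gemm_sm90 import GemmSm90
    # after (0.2.3) -- matches the import used inside gemm_sm100.py below
    from quack.gemm_sm90 import GemmSm90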
quack/gemm_sm100.py
CHANGED
@@ -2,7 +2,7 @@
 # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py
 
 import argparse
-from typing import Optional, Type, Tuple, Union, Callable
+from typing import Optional, Type, Tuple, Union, Callable, Literal
 from functools import partial
 
 import cuda.bindings.driver as cuda
@@ -15,14 +15,23 @@ import cutlass.torch as cutlass_torch
 import cutlass.pipeline as pipeline
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.nvgpu.warp import (
+    LdMatrix8x8x16bOp,
+    LdMatrix16x16x8bOp,
+    StMatrix8x8x16bOp,
+    StMatrix16x8x8bOp,
+)
 from cutlass import Int32, Float32, Boolean, const_expr
 from cutlass.utils import LayoutEnum
 from cutlass.cute.runtime import from_dlpack, make_ptr
 
+from quack.pipeline import PipelineTmaCpAsyncUmma
 from quack.cute_dsl_utils import ParamsBase, ArgumentsBase
 from quack.tile_scheduler import TileSchedulerOptions
-from quack.varlen_utils import VarlenArguments
-from quack.
+from quack.varlen_utils import VarlenArguments, VarlenManager
+from quack.gemm_sm90 import GemmSm90, NamedBarrierGemm
+import quack.copy_utils as copy_utils
+import quack.sm100_utils as quack_sm100_utils
 
 # return PipelineStateWAdvance instead of PipelineState
 
@@ -148,6 +157,7 @@ class GemmSm100(GemmSm90):
     def __init__(
         self,
         acc_dtype: Type[cutlass.Numeric],
+        a_dtype: Type[cutlass.Numeric],  # ignored for now
         mma_tiler_mn: Tuple[int, int],
         cluster_shape_mnk: Tuple[int, int, int],
         sf_vec_size: Optional[int] = None,
@@ -175,7 +185,7 @@ class GemmSm100(GemmSm90):
         """
 
         self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
-        self.use_2cta_instrs = cluster_shape_mnk[0] == 2 and mma_tiler_mn[0] in (
+        self.use_2cta_instrs = cluster_shape_mnk[0] == 2 and mma_tiler_mn[0] in (256,)
         self.cluster_shape_mnk = cluster_shape_mnk
         assert cluster_shape_mnk[2] == 1, "Cluster shape K must be 1"
         # K dimension is deferred in _setup_attributes
@@ -190,19 +200,28 @@ class GemmSm100(GemmSm90):
 
         self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
 
+        self.num_ab_load_warps = 1 if not self.gather_A else 5
         self.occupancy = 1
         # Set specialized warp ids
         self.epilog_warp_id = (0, 1, 2, 3)
         self.mma_warp_id = 4
-        self.
-        self.
+        self.ab_load_warp_id = 5
+        self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
+        self.scheduler_warp_id = self.epi_load_warp_id + 1
         self.num_epi_warps = len(self.epilog_warp_id)
-        self.threads_per_cta =
-
+        self.threads_per_cta = cute.arch.WARP_SIZE * (
+            self.num_ab_load_warps
+            + len(
+                (
+                    self.mma_warp_id,
+                    self.epi_load_warp_id,
+                    self.scheduler_warp_id,
+                    *self.epilog_warp_id,
+                )
+            )
         )
-        self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_100")
 
-    def _setup_attributes(self):
+    def _setup_attributes(self, epilogue_args: EpilogueArguments, varlen_args: VarlenArguments):
         """Set up configurations that are dependent on GEMM inputs
 
        This method configures various attributes based on the input tensor properties
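Note: with the extra load and scheduler warps above, the CTA thread count follows directly from the warp list. A minimal sketch of the arithmetic implied by this hunk (WARP_SIZE = 32; not part of the wheel):

    WARP_SIZE = 32

    def threads_per_cta(gather_A: bool) -> int:
        num_ab_load_warps = 5 if gather_A else 1
        # mma warp + epi-load warp + scheduler warp + 4 epilogue warps
        num_other_warps = 1 + 1 + 1 + 4
        return WARP_SIZE * (num_ab_load_warps + num_other_warps)

    assert threads_per_cta(False) == 256  # 8 warps
    assert threads_per_cta(True) == 384   # 12 warps when num_ab_load_warps == 5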
@@ -298,6 +317,8 @@ class GemmSm100(GemmSm90):
 
         # Compute number of multicast CTAs for A/B
         self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2])
+        if self.gather_A:
+            assert self.num_mcast_ctas_a == 1
         self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1])
         self.is_a_mcast = self.num_mcast_ctas_a > 1
         self.is_b_mcast = self.num_mcast_ctas_b > 1
@@ -309,11 +330,18 @@ class GemmSm100(GemmSm90):
         self.epi_tile = sm100_utils.compute_epilogue_tile_shape(
             self.cta_tile_shape_mnk,
             self.use_2cta_instrs,
-            self.d_layout,
-            self.d_dtype,
+            self.d_layout if self.d_layout is not None else LayoutEnum.ROW_MAJOR,
+            self.d_dtype if self.d_dtype is not None else cutlass.BFloat16,
+            layout_c=self.c_layout,
+            elem_ty_c=self.c_dtype,
         )
 
         # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory
+        prefetch_A_idx = (
+            None
+            if not self.gather_A
+            else ("varlen_m" if varlen_args.mCuSeqlensM is not None else "varlen_k")
+        )
         (
             self.num_acc_stage,
             self.ab_stage,
@@ -322,36 +350,50 @@ class GemmSm100(GemmSm90):
         ) = self._compute_stages(
             self.tiled_mma,
             self.mma_tiler,
+            self.cta_tile_shape_mnk,
+            self.epi_tile,
             self.a_dtype,
             self.b_dtype,
-            self.
+            self.sf_dtype,
+            self.sf_vec_size,
             self.d_dtype,
             self.c_dtype,
             self.d_layout,
             self.c_layout,
-
-
-            self.smem_capacity
+            epilogue_args,
+            prefetch_A_idx,
+            cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"),  # smem_capacity
             self.occupancy,
         )
-        self.sched_stage = 1
+        self.sched_stage = 1
+        self.a_prefetch_stage = (
+            0
+            if not self.gather_A
+            else (2 if varlen_args.mCuSeqlensM is not None else self.ab_stage)
+        )
 
         # Compute A/B/SFA/SFB/C shared memory layout
         self.a_smem_layout_staged = sm100_utils.make_smem_layout_a(
             self.tiled_mma, self.mma_tiler, self.a_dtype, self.ab_stage
         )
+        self.a_smem_load_layout_staged = self.a_smem_layout_staged
+        if const_expr(self.gather_A):
+            self.a_smem_load_layout_staged = quack_sm100_utils.make_smem_layout_cpasync_a(
+                self.tiled_mma, self.mma_tiler, self.a_dtype, self.ab_stage
+            )
         self.b_smem_layout_staged = sm100_utils.make_smem_layout_b(
             self.tiled_mma, self.mma_tiler, self.b_dtype, self.ab_stage
         )
-        self.epi_smem_layout_staged =
-
-
+        self.epi_smem_layout_staged = None
+        if const_expr(self.d_dtype is not None):
+            self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi(
+                self.d_dtype, self.d_layout, self.epi_tile, self.epi_stage
+            )
+        self.epi_c_smem_layout_staged = None
         if const_expr(self.c_dtype is not None):
             self.epi_c_smem_layout_staged = sm100_utils.make_smem_layout_epi(
                 self.c_dtype, self.c_layout, self.epi_tile, self.epi_c_stage
             )
-        else:
-            self.epi_c_smem_layout_staged = None
         if const_expr(self.blockscaled):
             self.sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
                 self.tiled_mma,
@@ -449,7 +491,7 @@ class GemmSm100(GemmSm90):
         ]
 
         # Setup attributes that dependent on gemm inputs
-        self._setup_attributes()
+        self._setup_attributes(epilogue_args, varlen_args)
 
         if const_expr(self.blockscaled):
             # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
@@ -536,24 +578,22 @@ class GemmSm100(GemmSm90):
         # Setup TMA store for D
         tma_atom_d, tma_tensor_d = None, None
         if const_expr(mD is not None):
-
-            tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
-                cpasync.CopyBulkTensorTileS2GOp(),
+            tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
                 mD,
-
+                self.epi_smem_layout_staged,
                 self.epi_tile,
+                op_type="store"
+                if not (hasattr(epilogue_args, "add_to_output") and epilogue_args.add_to_output)
+                else "add",
             )
         tma_atom_c, tma_tensor_c = None, None
         if const_expr(mC is not None):
-
-
-                cpasync.CopyBulkTensorTileG2SOp(),
-                mC,
-                epi_c_smem_layout,
-                self.epi_tile,
+            tma_atom_c, tma_tensor_c = self._make_tma_epi_atoms_and_tensors(
+                mC, self.epi_c_smem_layout_staged, self.epi_tile, op_type="load"
             )
 
         epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
+        varlen_params = VarlenManager.to_underlying_arguments(varlen_args)
 
         TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
         tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
@@ -573,6 +613,13 @@ class GemmSm100(GemmSm90):
         sfb_smem_size = (
             cute.cosize(self.sfb_smem_layout_staged) if const_expr(self.blockscaled) else 0
         )
+        a_idx_smem_size = 0
+        if const_expr(self.gather_A):
+            a_idx_smem_size = self.a_prefetch_stage * (
+                self.cta_tile_shape_mnk[0]
+                if varlen_args.mCuSeqlensM is not None
+                else self.cta_tile_shape_mnk[2]
+            )
 
         # Define shared storage for kernel
         @cute.struct
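Note: sAIdx holds Int32 row indices (varlen_m) or column indices (varlen_k), so the extra shared memory reserved by the hunk above is a_prefetch_stage * tile_extent * 4 bytes. A small sketch with illustrative tile sizes only; the real extents come from cta_tile_shape_mnk and the stage count chosen in _setup_attributes:

    def a_idx_smem_bytes(a_prefetch_stage: int, tile_extent: int) -> int:
        return a_prefetch_stage * tile_extent * 4  # Int32 indices

    a_idx_smem_bytes(2, 128)  # e.g. varlen_m with a 128-row CTA tile -> 1024 bytes
    a_idx_smem_bytes(4, 64)   # e.g. varlen_k with a 64-wide K tile   -> 1024 bytes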
@@ -581,9 +628,13 @@ class GemmSm100(GemmSm90):
             epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
             acc_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
             sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
-
+            a_prefetch_pipeline_array_ptr: cute.struct.MemRange[
+                cutlass.Int64, self.a_prefetch_stage * 2
+            ]
+            tile_count: cute.struct.MemRange[Int32, self.sched_stage]
             tmem_dealloc_mbar_ptr: cutlass.Int64
             tmem_holding_buf: Int32
+            sAIdx: cute.struct.Align[cute.struct.MemRange[Int32, a_idx_smem_size], 16]
             # (EPI_TILE_M, EPI_TILE_N, STAGE)
             sD: cute.struct.Align[
                 cute.struct.MemRange[
@@ -638,13 +689,11 @@ class GemmSm100(GemmSm90):
             tma_atom_c,
             tma_tensor_c,
             epilogue_params,
-
-            varlen_args.mCuSeqlensK,
-            varlen_args.mTensormaps,
-            varlen_args.mAIdx,
+            varlen_params,
             self.cluster_layout_vmnk,
             self.cluster_layout_sfb_vmnk,
             self.a_smem_layout_staged,
+            self.a_smem_load_layout_staged,
             self.b_smem_layout_staged,
             self.sfa_smem_layout_staged,
             self.sfb_smem_layout_staged,
@@ -657,7 +706,6 @@ class GemmSm100(GemmSm90):
             grid=grid,
             block=[self.threads_per_cta, 1, 1],
             cluster=self.cluster_shape_mnk,
-            smem=self.shared_storage.size_in_bytes(),
             stream=stream,
             min_blocks_per_mp=1,
         )
@@ -682,13 +730,11 @@ class GemmSm100(GemmSm90):
         tma_atom_c: Optional[cute.CopyAtom],
         mC_mnl: Optional[cute.Tensor],
         epilogue_params: ParamsBase,
-
-        cu_seqlens_k: Optional[cute.Tensor],
-        tensormaps: Optional[cute.Tensor],
-        mAIdx: Optional[cute.Tensor],
+        varlen_params: VarlenManager.Params,
         cluster_layout_vmnk: cute.Layout,
         cluster_layout_sfb_vmnk: Optional[cute.Layout],
         a_smem_layout: cute.ComposedLayout,
+        a_smem_load_layout: cute.ComposedLayout,
         b_smem_layout: cute.ComposedLayout,
         sfa_smem_layout: Optional[cute.Layout],
         sfb_smem_layout: Optional[cute.Layout],
@@ -702,8 +748,8 @@ class GemmSm100(GemmSm90):
         GPU device kernel performing the Persistent batched GEMM computation.
         """
 
-        varlen_m = const_expr(cu_seqlens_m is not None)
-        varlen_k = const_expr(cu_seqlens_k is not None)
+        varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
+        varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
         assert not (varlen_m and varlen_k)
         if const_expr(self.gather_A):
             assert varlen_m or varlen_k
@@ -715,7 +761,7 @@ class GemmSm100(GemmSm90):
         # /////////////////////////////////////////////////////////////////////////////
         # Prefetch Tma desc
         # /////////////////////////////////////////////////////////////////////////////
-        if warp_idx == self.
+        if warp_idx == self.ab_load_warp_id:
             for tma_atom in (
                 tma_atom_a,
                 tma_atom_b,
@@ -751,7 +797,7 @@ class GemmSm100(GemmSm90):
 
         # Tensor memory dealloc barrier init
         if use_2cta_instrs:
-            if warp_idx == self.
+            if warp_idx == self.ab_load_warp_id:
                 num_tmem_dealloc_threads = 32
                 cute.arch.mbarrier_init(tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads)
 
@@ -760,6 +806,7 @@ class GemmSm100(GemmSm90):
             tiled_mma=tiled_mma,
             cluster_layout_vmnk=cluster_layout_vmnk,
             ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
+            is_leader_cta=is_leader_cta,
         )
         epi_pipeline = None
         if const_expr(has_C):
@@ -774,20 +821,30 @@ class GemmSm100(GemmSm90):
         sched_pipeline = None
         tile_count = None
         if const_expr(tile_sched_params.tile_count_semaphore is not None):
-            # TODO: Untested, not sure if this is right for Sm100
             # Dynamic persistent scheduler
             sched_pipeline = self.make_sched_pipeline(
                 self.cluster_shape_mnk,
                 sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
-
+                has_C=has_C,
             )
             tile_count = storage.tile_count.get_tensor((self.sched_stage,))
+        a_prefetch_pipeline = None
+        if const_expr(self.gather_A):
+            a_prefetch_pipeline = self.make_a_prefetch_pipeline(
+                storage.a_prefetch_pipeline_array_ptr.data_ptr(),
+            )
 
         # Setup smem tensor A/B/D
         # (MMA, MMA_M, MMA_K, STAGE)
-
+        sA_mma = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
+        sA = storage.sA.get_tensor(a_smem_load_layout.outer, swizzle=a_smem_load_layout.inner)
         # (MMA, MMA_N, MMA_K, STAGE)
         sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
+        sAIdx = None
+        if const_expr(self.gather_A):
+            a_idx_smem_dim = self.cta_tile_shape_mnk[0] if varlen_m else self.cta_tile_shape_mnk[2]
+            a_idx_smem_layout = cute.make_layout((a_idx_smem_dim, self.a_prefetch_stage))
+            sAIdx = storage.sAIdx.get_tensor(a_idx_smem_layout)
         sSFA, sSFB = None, None
         if const_expr(self.blockscaled):
             # (MMA, MMA_M, MMA_K, STAGE)
@@ -813,9 +870,17 @@ class GemmSm100(GemmSm90):
         # (MMA, MMA_M, MMA_N, STAGE)
         tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, self.num_acc_stage))
 
-
-
-
+        varlen_manager = VarlenManager.create(
+            varlen_params,
+            has_D,
+            self.num_epi_tensormaps,
+            # Only used if not varlen_m
+            len_m_static=Int32(
+                mA_mkl.shape[0]
+                if varlen_k or varlen_params.mAIdx is None
+                else varlen_params.mAIdx.shape[0]
+            ),
+            len_k_static=Int32(mA_mkl.shape[1]),
         )
 
         TileSchedulerCls = partial(
@@ -833,22 +898,14 @@ class GemmSm100(GemmSm90):
         )
 
         #
-        # Specialized
+        # Specialized AB load warps
         #
-        if warp_idx == self.
-
-
-
-
-
-                    tensormap_ab_ptrs[0],
-                    is_manager_warp=True,
-                )
-                tensormap_manager.init_tensormap_from_atom(
-                    tma_atom_b,
-                    tensormap_ab_ptrs[1],
-                    is_manager_warp=True,
-                )
+        if warp_idx == self.ab_load_warp_id:
+            is_tma_warp = True
+            # initialize tensormap for A & B
+            varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
+            tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
+            tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
             # Compute multicast mask for A/B buffer full
             block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
             block_in_cluster_coord_sfb_vmnk = None
@@ -874,34 +931,24 @@ class GemmSm100(GemmSm90):
             )
 
             # Persistent tile scheduling loop
-
-            if const_expr(cute.size(cluster_layout_vmnk) > 1):
-                is_scheduler_warp = cute.arch.block_idx_in_cluster() == 0
-            tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
+            tile_scheduler = TileSchedulerCls()
             work_tile = tile_scheduler.initial_work_tile_info()
             ab_producer_state = pipeline.make_pipeline_state(
                 pipeline.PipelineUserType.Producer, self.ab_stage
             )
             if const_expr(varlen_k):
                 # wait tensormap initialization complete before update
-
-                # batch index of last tile
-                last_batch_idx = cutlass.Int32(-1)
+                varlen_manager.fence_tensormap_init()
             do_epi_load_barrier_arrive = Boolean(True)
             while work_tile.is_valid_tile:
                 tile_coord_mnkl = work_tile.tile_idx
                 batch_idx = tile_coord_mnkl[3]
-
-
-
-
-
-
-                        tensormap_ab_ptrs,
-                        cu_seqlens_k,
-                        batch_idx,
-                        is_manager_warp=True,
-                    )
+                varlen_manager.update_tensormap_AB(
+                    batch_idx,
+                    self.a_layout,
+                    self.b_layout,
+                    is_tma_warp,
+                )
                 # ///////////////////////////////////////////////////////////////////////////
                 # Local_tile partition global tensors
                 # ///////////////////////////////////////////////////////////////////////////
@@ -910,120 +957,111 @@ class GemmSm100(GemmSm90):
                     tile_coord_mnkl[1],
                     tile_coord_mnkl[3],
                 )
-
-
-
-
-                    cute.
-
-
+                gA_mk = None
+                if const_expr(not self.gather_A):
+                    mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
+                    # (bM, bK, RestK)
+                    gA_mk = cute.local_tile(
+                        mA_mk,
+                        cute.select(self.mma_tiler, [0, 2]),
+                        (mma_tile_coord_mnl[0], None),
+                    )
                 # (bN, bK, RestK)
-
-                    mB_nkl,
-                    cute.
-                    (mma_tile_coord_mnl[1], None
+                gB_nk = cute.local_tile(
+                    varlen_manager.offset_batch_B(mB_nkl, batch_idx),
+                    cute.select(self.mma_tiler, [1, 2]),
+                    (mma_tile_coord_mnl[1], None),
                 )
                 if const_expr(self.blockscaled):
                     # (bM, bK)
                     gSFA_mkl = cute.local_tile(
-                        mSFA_mkl,
-                        cute.
-                        (mma_tile_coord_mnl[0], None
+                        varlen_manager.offset_batch_A(mSFA_mkl, batch_idx),
+                        cute.select(self.mma_tiler, [0, 2]),
+                        (mma_tile_coord_mnl[0], None),
                     )
                     # (bN, bK)
                     gSFB_nkl = cute.local_tile(
-                        mSFB_nkl,
-                        cute.
-                        (mma_tile_coord_mnl[1], None
+                        varlen_manager.offset_batch_B(mSFB_nkl, batch_idx),
+                        cute.select(self.mma_tiler, [1, 2]),
+                        (mma_tile_coord_mnl[1], None),
                     )
+
                 # Partition global tensor for TiledMMA_A/B/D
-                #
-
+                # Then partition global/shared tensor for TMA load A/B
+                varlen_manager.fence_tensormap_update_AB(is_tma_warp)
+                len_k = varlen_manager.len_k(batch_idx)
+                # TMA load A partition_S/D
+                a_cta_layout = cute.make_layout(
+                    cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape
+                )
+                copy_A = None
+                if const_expr(not self.gather_A):
+                    # (MMA, MMA_M, MMA_K, RestK)
+                    tCgA = thr_mma.partition_A(gA_mk)
+                    copy_A, _, _ = copy_utils.tma_get_copy_fn(
+                        tma_atom_a,
+                        cta_coord=block_in_cluster_coord_vmnk[2],
+                        cta_layout=a_cta_layout,
+                        src_tensor=tCgA,
+                        dst_tensor=sA,
+                        mcast_mask=a_mcast_mask,
+                        tma_desc_ptr=tma_desc_a_ptr,
+                    )
                 # (MMA, MMA_N, MMA_K, RestK)
-                tCgB = thr_mma.partition_B(
+                tCgB = thr_mma.partition_B(gB_nk)
                 if const_expr(self.blockscaled):
                     # (MMA, MMA_M, MMA_K)
                     tCgSFA = thr_mma.partition_A(gSFA_mkl)
                     # (MMA, MMA_N, MMA_K)
                     tCgSFB = thr_mma_sfb.partition_B(gSFB_nkl)
-                # Partition global/shared tensor for TMA load A/B
-                # TMA load A partition_S/D
-                a_cta_layout = cute.make_layout(
-                    cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape
-                )
-                # ((atom_v, rest_v), STAGE)
-                # ((atom_v, rest_v), RestK)
-                tAsA, tAgA = cpasync.tma_partition(
-                    tma_atom_a,
-                    block_in_cluster_coord_vmnk[2],
-                    a_cta_layout,
-                    cute.group_modes(sA, 0, 3),
-                    cute.group_modes(tCgA, 0, 3),
-                )
                 # TMA load B partition_S/D
-
-                    cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
-                )
-                # ((atom_v, rest_v), STAGE)
-                # ((atom_v, rest_v), RestK)
-                tBsB, tBgB = cpasync.tma_partition(
+                copy_B, _, _ = copy_utils.tma_get_copy_fn(
                     tma_atom_b,
-                    block_in_cluster_coord_vmnk[1],
-
-
-
+                    cta_coord=block_in_cluster_coord_vmnk[1],
+                    cta_layout=cute.make_layout(
+                        cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
+                    ),
+                    src_tensor=tCgB,
+                    dst_tensor=sB,
+                    mcast_mask=b_mcast_mask,
+                    tma_desc_ptr=tma_desc_b_ptr,
                 )
+                copy_SFA, copy_SFB = None, None
                 if const_expr(self.blockscaled):
                     # TMA load SFA partition_S/D
-
-                    # ((atom_v, rest_v), STAGE)
-                    # ((atom_v, rest_v), RestK)
-                    tAsSFA, tAgSFA = cpasync.tma_partition(
+                    copy_SFA, _, _ = copy_utils.tma_get_copy_fn(
                         tma_atom_sfa,
-                        block_in_cluster_coord_vmnk[2],
-
-
-
+                        cta_coord=block_in_cluster_coord_vmnk[2],
+                        cta_layout=a_cta_layout,
+                        src_tensor=tCgSFA,
+                        dst_tensor=sSFA,
+                        filter_zeros=True,
+                        mcast_mask=sfa_mcast_mask,
+                        # tma_desc_ptr=tma_desc_sfa_ptr,
                     )
-                    tAsSFA = cute.filter_zeros(tAsSFA)
-                    tAgSFA = cute.filter_zeros(tAgSFA)
                     # TMA load SFB partition_S/D
                     sfb_cta_layout = cute.make_layout(
                         cute.slice_(cluster_layout_sfb_vmnk, (0, None, 0, 0)).shape
                     )
-
-                    # ((atom_v, rest_v), RestK)
-                    tBsSFB, tBgSFB = cpasync.tma_partition(
+                    copy_SFB, _, _ = copy_utils.tma_get_copy_fn(
                         tma_atom_sfb,
-                        block_in_cluster_coord_sfb_vmnk[1],
-                        sfb_cta_layout,
-
-
+                        cta_coord=block_in_cluster_coord_sfb_vmnk[1],
+                        cta_layout=sfb_cta_layout,
+                        src_tensor=tCgSFB,
+                        dst_tensor=sSFB,
+                        filter_zeros=True,
+                        mcast_mask=sfb_mcast_mask,
+                        # tma_desc_ptr=tma_desc_sfa_ptr,
                     )
-
-                    tBgSFB = cute.filter_zeros(tBgSFB)
-                else:
-                    tAsSFA, tAgSFA = None, None
-                    tBsSFB, tBgSFB = None, None
+                k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
                 ab_producer_state = self.load_AB(
                     ab_pipeline,
                     ab_producer_state,
-
-
-
-
-
-                    tBgB,
-                    tBsB,
-                    b_mcast_mask,
-                    tma_atom_sfa,
-                    tAgSFA,
-                    tAsSFA,
-                    sfa_mcast_mask,
-                    tma_atom_sfb,
-                    tBgSFB,
-                    tBsSFB,
-                    sfb_mcast_mask,
+                    copy_A,
+                    copy_B,
+                    k_tile_cnt,
+                    copy_SFA,
+                    copy_SFB,
                 )
                 if const_expr(epi_load_barrier is not None):
                     # In the first work tile, the epi load warp will wait for the signal
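Note: the hunk above replaces the hand-rolled cpasync.tma_partition plus per-tile cute.copy plumbing with copy functions built by quack.copy_utils, so load_AB now receives ready-to-call copy_A/copy_B/copy_SFA/copy_SFB closures instead of a dozen partitioned tensors and masks. A sketch of that refactor pattern in general terms, using generic names rather than the real copy_utils API:

    # Before: every call site threads (atom, gmem view, smem view, mcast mask, ...) around.
    # After: partitioning happens once, and the per-tile copy is captured in a closure.
    def make_copy_fn(atom, gmem, smem, mcast_mask=None):
        # ... partition gmem/smem against the atom once here ...
        def copy_tile(k_tile, stage_idx):
            # each call then only names the source tile and the smem stage
            pass
        return copy_tile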
@@ -1033,19 +1071,180 @@ class GemmSm100(GemmSm90):
                         epi_load_barrier.arrive()
                     do_epi_load_barrier_arrive = Boolean(False)
                 # Advance to next tile
-                tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
                 tile_scheduler.advance_to_next_work()
                 work_tile = tile_scheduler.get_current_work()
             # Wait A/B buffer empty
             ab_pipeline.producer_tail(ab_producer_state)
-
-
+
+        if const_expr(self.gather_A):
+            if (
+                warp_idx >= self.ab_load_warp_id + 1
+                and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
+            ):
+                # Persistent tile scheduling loop
+                tile_scheduler = TileSchedulerCls()
+                work_tile = tile_scheduler.initial_work_tile_info()
+                ab_producer_state = pipeline.make_pipeline_state(
+                    pipeline.PipelineUserType.Producer, self.ab_stage
+                )
+                a_prefetch_consumer_state = pipeline.make_pipeline_state(
+                    pipeline.PipelineUserType.Consumer, self.a_prefetch_stage
+                )
+                while work_tile.is_valid_tile:
+                    tile_coord_mnkl = work_tile.tile_idx
+                    batch_idx = tile_coord_mnkl[3]
+                    # ///////////////////////////////////////////////////////////////////////////
+                    # Local_tile partition global tensors
+                    # ///////////////////////////////////////////////////////////////////////////
+                    mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
+                    if const_expr(varlen_m):
+                        # (M, K)
+                        mA_mk = mA_mkl
+                    else:
+                        assert varlen_k
+                        # (tile_M, K)
+                        mA_mk = cute.local_tile(
+                            mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
+                        )
+                    # Partition global tensor for TiledMMA_A/B/D
+                    len_m = varlen_manager.len_m(batch_idx)
+                    len_k = varlen_manager.len_k(batch_idx)
+                    # TMA load A partition_S/D
+                    tiled_copy_A = self._make_gmem_tiled_copy_A(
+                        mA_mkl.element_type, self.a_layout, (self.num_ab_load_warps - 1) * 32
+                    )
+                    tidx = cute.arch.thread_idx()[0] - (self.ab_load_warp_id + 1) * 32
+                    thr_copy_A = tiled_copy_A.get_slice(tidx)
+                    copy_A, prefetch_A = None, None
+                    if const_expr(varlen_m):
+                        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
+                        copy_A = copy_utils.gather_m_get_copy_fn(
+                            thr_copy_A,
+                            mA_mk,
+                            sA,
+                            sAIdx[None, a_prefetch_consumer_state.index],
+                            limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+                            limit_k=len_k,
+                        )
+                        cute.arch.sync_warp()
+                        with cute.arch.elect_one():
+                            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
+                        a_prefetch_consumer_state.advance()
+                    else:
+                        copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
+                            thr_copy_A,
+                            mA_mk,
+                            sA,
+                            sAIdx,
+                            limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+                            limit_k=len_k,
+                        )
+                        prefetch_A = partial(prefetch_A, a_prefetch_pipeline)
+                    k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                    ab_producer_state, a_prefetch_consumer_state = self.load_A_gather_A(
+                        ab_pipeline,
+                        ab_producer_state,
+                        a_prefetch_consumer_state,
+                        copy_A,
+                        prefetch_A,
+                        k_tile_cnt,
+                    )
+                    # Advance to next tile
+                    tile_scheduler.advance_to_next_work()
+                    work_tile = tile_scheduler.get_current_work()
+
+        #
+        # Specialized scheduler warp. Will also prefetch A indices if gatherA
+        #
+        if const_expr(tile_sched_params.tile_count_semaphore is not None or self.gather_A):
+            if warp_idx == self.scheduler_warp_id:
+                is_scheduler_warp = True
+                if const_expr(cute.size(cluster_layout_vmnk) > 1):
+                    is_scheduler_warp = cute.arch.block_idx_in_cluster() == 0
+                tile_M = self.cta_tile_shape_mnk[0]
+                tile_K = self.cta_tile_shape_mnk[2]
+                thr_copy_AIdx, tAsAIdx, tAcAIdx = None, None, None
+                if const_expr(self.gather_A):
+                    tiled_copy_AIdx = copy_utils.tiled_copy_1d(Int32, num_threads=32, is_async=True)
+                    thr_copy_AIdx = tiled_copy_AIdx.get_slice(cute.arch.lane_idx())
+                    tAsAIdx = thr_copy_AIdx.partition_D(sAIdx)
+                    tAcAIdx = thr_copy_AIdx.partition_S(
+                        cute.make_identity_tensor(tile_M if varlen_m else tile_K)
+                    )
+                # Persistent tile scheduling loop
+                tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
+                work_tile = tile_scheduler.initial_work_tile_info()
+                a_prefetch_producer_state = None
+                if const_expr(self.gather_A):
+                    a_prefetch_producer_state = pipeline.make_pipeline_state(
+                        pipeline.PipelineUserType.Producer, self.a_prefetch_stage
+                    )
+                while work_tile.is_valid_tile:
+                    if const_expr(self.gather_A):
+                        tile_coord_mnkl = work_tile.tile_idx
+                        batch_idx = tile_coord_mnkl[3]
+                        mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
+                        if const_expr(varlen_m):
+                            # (tile_M,)
+                            gAIdx = cute.local_tile(mAIdx_mk, (tile_M,), (tile_coord_mnkl[0],))
+                            tAgAIdx = thr_copy_AIdx.partition_S(gAIdx)
+                            len_m = varlen_manager.len_m(batch_idx)
+                            m_limit = len_m - tile_coord_mnkl[0] * tile_M
+                            tApAIdx_m = cute.make_fragment((1, tAsAIdx.shape[1]), Boolean)
+                            for m in cutlass.range(tAsAIdx.shape[1], unroll_full=True):
+                                tApAIdx_m[0, m] = tAcAIdx[0, m] < m_limit
+                            a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
+                            cute.copy(
+                                thr_copy_AIdx,
+                                tAgAIdx,
+                                tAsAIdx[None, None, a_prefetch_producer_state.index],
+                                pred=tApAIdx_m,
+                            )
+                            a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
+                            a_prefetch_producer_state.advance()
+                        else:
+                            # (tile_K, RestK)
+                            gAIdx = cute.flat_divide(mAIdx_mk, (tile_K,))
+                            tAgAIdx = thr_copy_AIdx.partition_S(gAIdx)
+                            len_k = varlen_manager.len_k(batch_idx)
+                            k_tile_cnt = cute.ceil_div(len_k, tile_K)
+                            for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
+                                a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
+                                cute.copy(
+                                    thr_copy_AIdx,
+                                    tAgAIdx[None, None, k_tile],
+                                    tAsAIdx[None, None, a_prefetch_producer_state.index],
+                                )
+                                a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
+                                a_prefetch_producer_state.advance()
+                            if 0 < k_tile_cnt:
+                                k_tile = k_tile_cnt - 1
+                                k_limit = len_k - k_tile * tile_K
+                                tApAIdx_k = cute.make_fragment((1, tAsAIdx.shape[1]), Boolean)
+                                for m in cutlass.range(tAsAIdx.shape[1], unroll_full=True):
+                                    tApAIdx_k[0, m] = tAcAIdx[0, m] < k_limit
+                                a_prefetch_pipeline.producer_acquire(a_prefetch_producer_state)
+                                cute.copy(
+                                    tiled_copy_AIdx,
+                                    tAgAIdx[None, None, k_tile],
+                                    tAsAIdx[None, None, a_prefetch_producer_state.index],
+                                    pred=tApAIdx_k,
+                                )
+                                a_prefetch_pipeline.producer_commit(a_prefetch_producer_state)
+                                a_prefetch_producer_state.advance()
+                    # Advance to next tile
+                    tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
+                    tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
+                    work_tile = tile_scheduler.get_current_work()
+                # End of persistent scheduler loop
+                if is_scheduler_warp:
+                    tile_scheduler.producer_tail()
 
         #
         # Specialized TMA epi load warp
         #
         if const_expr(mC_mnl is not None):
-            if warp_idx == self.
+            if warp_idx == self.epi_load_warp_id:
                 epi_producer_state = pipeline.make_pipeline_state(
                     pipeline.PipelineUserType.Producer, self.epi_c_stage
                 )
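Note: both index-prefetch paths above use the same discipline: every k- or m-tile except the last is copied without bounds checks, and only the final tile is predicated against the true length (m_limit / k_limit) taken from the varlen metadata. A plain-Python sketch of that control flow (illustrative only, not CuTe DSL):

    import numpy as np

    def copy_tiles(src: np.ndarray, tile: int) -> np.ndarray:
        k_tile_cnt = -(-len(src) // tile)  # ceil_div, as in the kernel
        dst = np.zeros(k_tile_cnt * tile, dtype=src.dtype)
        for t in range(k_tile_cnt - 1):  # full tiles, no predicate
            dst[t * tile:(t + 1) * tile] = src[t * tile:(t + 1) * tile]
        if 0 < k_tile_cnt:  # last tile, predicated on the remaining length
            t = k_tile_cnt - 1
            limit = len(src) - t * tile
            pred = np.arange(tile) < limit
            dst[t * tile:(t + 1) * tile][pred] = src[t * tile:]
        return dst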
@@ -1056,37 +1255,23 @@ class GemmSm100(GemmSm90):
                 while work_tile.is_valid_tile:
                     # Get tile coord from tile scheduler
                     tile_coord_mnkl = work_tile.tile_idx
-
-
-
-
-
-
-
-
-                    gC_mnl = cute.local_tile(
-                        mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), mma_tile_coord_mnl
+                    batch_idx = tile_coord_mnkl[3]
+                    copy_C_fn, _, bGS_gC = self.epilog_gmem_copy_and_partition(
+                        tma_atom_c,
+                        varlen_manager.offset_batch_epi(mC_mnl, batch_idx),
+                        self.cta_tile_shape_mnk[:2],
+                        epi_tile,
+                        sC,
+                        tile_coord_mnkl,
                     )
-
-                    # (MMA, MMA_M, MMA_N)
-                    tCgC = thr_mma.partition_C(gC_mnl)
-                    # bGS_gC has shape ((ATOM_V, REST_V), EPI_M, EPI_N)
-                    bGS_sC, bGS_gC = self.epilog_gmem_copy_and_partition(
-                        tma_atom_c, tCgC, epi_tile, sC
-                    )
-                    bGS_gC = cute.group_modes(bGS_gC, 1, cute.rank(bGS_gC))
+                    copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)
                     if do_epi_load_barrier_wait:
                         epi_load_barrier.arrive_and_wait()
                         do_epi_load_barrier_wait = Boolean(False)
                     epi_tile_num = const_expr(cute.size(bGS_gC, mode=[1]))
-                    for
+                    for epi_idx in cutlass.range(epi_tile_num, unroll=1):
                         epi_pipeline.producer_acquire(epi_producer_state)
-
-                            tma_atom_c,
-                            bGS_gC[None, subtile_idx],
-                            bGS_sC[None, epi_producer_state.index],
-                            tma_bar_ptr=epi_pipeline.producer_get_barrier(epi_producer_state),
-                        )
+                        copy_C(src_idx=epi_idx, producer_state=epi_producer_state)
                         # Epi pipeline's producer commit is a NOP
                         epi_pipeline.producer_commit(epi_producer_state)
                         epi_producer_state.advance()
@@ -1107,7 +1292,7 @@ class GemmSm100(GemmSm90):
             )
             # Partition shared/tensor memory tensor for TiledMMA_A/B/D
             # (MMA, MMA_M, MMA_K, STAGE)
-            tCrA = tiled_mma.make_fragment_A(
+            tCrA = tiled_mma.make_fragment_A(sA_mma)
             # (MMA, MMA_N, MMA_K, STAGE)
             tCrB = tiled_mma.make_fragment_B(sB)
             # (MMA, MMA_M, MMA_N, STAGE)
@@ -1154,10 +1339,10 @@ class GemmSm100(GemmSm90):
                     tCtSFB_compact_s2t,
                 ) = self.mainloop_s2t_copy_and_partition(sSFB, tCtSFB)
             else:
+                tCtSFA, tCtSFB = None, None
                 tiled_copy_s2t_sfa, tCsSFA_compact_s2t, tCtSFA_compact_s2t = None, None, None
                 tiled_copy_s2t_sfb, tCsSFB_compact_s2t, tCtSFB_compact_s2t = None, None, None
 
-            k_tile_cnt = cute.ceil_div(cute.size(mA_mkl.shape[1]), self.mma_tiler[2])
             # Persistent tile scheduling loop
             tile_scheduler = TileSchedulerCls()
             work_tile = tile_scheduler.initial_work_tile_info()
@@ -1170,6 +1355,9 @@ class GemmSm100(GemmSm90):
             while work_tile.is_valid_tile:
                 # Get tile coord from tile scheduler
                 tile_coord_mnkl = work_tile.tile_idx
+                batch_idx = tile_coord_mnkl[3]
+                k_len = varlen_manager.len_k(batch_idx)
+                k_tile_cnt = cute.ceil_div(k_len, self.mma_tiler[2])
                 # Set tensor memory buffer for current tile
                 # (MMA, MMA_M, MMA_N)
                 tCtAcc = tCtAcc_base[None, None, None, acc_producer_state.index]
@@ -1184,6 +1372,9 @@ class GemmSm100(GemmSm90):
                     tCtAcc,
                     k_tile_cnt,
                     is_leader_cta,
+                    cta_rank_in_cluster,
+                    tCtSFA,
+                    tCtSFB,
                     tiled_copy_s2t_sfa,
                     tiled_copy_s2t_sfb,
                     tCsSFA_compact_s2t,
@@ -1209,6 +1400,14 @@ class GemmSm100(GemmSm90):
             )
             # Bar sync for retrieve tensor memory ptr from shared memory
             tmem_alloc_barrier.arrive_and_wait()
+
+            is_tma_warp = Boolean(warp_idx == self.epilog_warp_id[0])
+            varlen_manager.init_tensormap_epi(
+                tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
+            )
+            tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
+            tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
+
             # Retrieving tensor memory ptr and make accumulator tensor
             acc_tmem_ptr = cute.arch.retrieve_tmem_ptr(
                 self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
@@ -1221,44 +1420,22 @@ class GemmSm100(GemmSm90):
                 num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
             )
 
-            is_tma_warp = Boolean(warp_idx == self.epilog_warp_id[0])
-            if const_expr(varlen_m):
-                # initialize tensormap for D
-                if const_expr(has_D):
-                    tensormap_manager.init_tensormap_from_atom(
-                        tma_atom_d,
-                        tensormap_d_ptr,
-                        is_manager_warp=is_tma_warp,
-                    )
-                for tma_atom, tensormap_epi_ptr in zip(
-                    self.epi_get_tma_atoms(epilogue_params), tensormap_epi_ptrs
-                ):
-                    tensormap_manager.init_tensormap_from_atom(
-                        tma_atom,
-                        tensormap_epi_ptr,
-                        is_manager_warp=is_tma_warp,
-                    )
-
             # Partition for epilogue
             epi_tidx = tidx
             tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = self.epilog_tmem_copy_and_partition(
                 epi_tidx, tCtAcc_base, epi_tile, use_2cta_instrs
             )
 
-            tTR_rD = cute.make_fragment(tTR_rAcc.shape, self.
-            tiled_copy_r2s, tRS_rD, tRS_sD = self.
-                tiled_copy_t2r, tTR_rD,
+            tTR_rD = cute.make_fragment(tTR_rAcc.shape, self.acc_dtype)
+            tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
+                tiled_copy_t2r, self.d_layout, self.d_dtype, tTR_rD, sD, epi_tidx
             )
-            tRS_rC, tSR_rC = None, None
+            tRS_rC, tSR_rC, tSR_sC = None, None, None
+            tiled_copy_s2r = None
             if const_expr(mC_mnl is not None):
-
-
-                    tiled_copy_t2r, tTR_rC, epi_tidx, sC
+                tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
+                    tiled_copy_t2r, self.c_layout, self.c_dtype, sC, tRS_rD.layout, epi_tidx
                 )
-                # TODO: for m major, D is being stored w STSM so we'd need LDSM here
-                # tRS_rC = tSR_rC  # TODO: retile?
-                tRS_rC = cute.make_fragment(tRS_rD.layout, self.c_dtype)
-                tSR_rC = tiled_copy_s2r.get_slice(epi_tidx).retile(tRS_rC)
 
             # Persistent tile scheduling loop
             tile_scheduler = TileSchedulerCls()
@@ -1272,42 +1449,21 @@ class GemmSm100(GemmSm90):
             )
             if const_expr(varlen_m):
                 # wait tensormap initialization complete before update
-
-                # batch index of last tile
-                last_batch_idx = cutlass.Int32(-1)
+                varlen_manager.fence_tensormap_init()
             while work_tile.is_valid_tile:
                 # Get tile coord from tile scheduler
                 tile_coord_mnkl = work_tile.tile_idx
                 batch_idx = tile_coord_mnkl[3]
-
-
-                    last_batch_idx = batch_idx
-                    if is_group_changed:
-                        self.tensormap_update_D_epi(
-                            tensormap_manager,
-                            tensormap_d_ptr,
-                            tensormap_epi_ptrs,
-                            epilogue_params,
-                            cu_seqlens_m,
-                            batch_idx,
-                            is_manager_warp=is_tma_warp,
-                        )
-
-                mma_tile_coord_mnl = (
-                    tile_coord_mnkl[0] // cute.size(tiled_mma.thr_id.shape),
-                    tile_coord_mnkl[1],
-                    tile_coord_mnkl[3],
+                epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
+                    epilogue_params, varlen_params.cu_seqlens_m, batch_idx
                 )
-
-
-
-
+                varlen_manager.update_tensormap_epi(
+                    batch_idx,
+                    self.d_layout,
+                    epi_shapes,
+                    epi_orders,
+                    is_tma_warp,
                 )
-                # Partition global tensor for TiledMMA_A/B/D
-                # (MMA, MMA_M, MMA_N)
-                tDgD = thr_mma.partition_C(gD_mnl)
-                # bSG_gD has shape ((ATOM_V, REST_V), EPI_M, EPI_N)
-                bSG_sD, bSG_gD = self.epilog_gmem_copy_and_partition(tma_atom_d, tDgD, epi_tile, sD)
 
                 # Set tensor memory buffer for current tile
                 # (T2R, T2R_M, T2R_N, EPI_M, EPI_M)
@@ -1316,67 +1472,59 @@ class GemmSm100(GemmSm90):
                 # Wait for accumulator buffer full
                 acc_pipeline.consumer_wait(acc_consumer_state)
 
-
-                if const_expr(varlen_m):
-                    # ensure the update to tensormap has completed before using it
-                    if is_group_changed and is_tma_warp:
-                        if const_expr(has_D):
-                            tensormap_manager.fence_tensormap_update(tensormap_d_ptr)
-                        for tensormap_epi_ptr in tensormap_epi_ptrs:
-                            tensormap_manager.fence_tensormap_update(tensormap_epi_ptr)
-                    if const_expr(has_D):
-                        tma_desc_d_ptr = tensormap_manager.get_tensormap_ptr(
-                            tensormap_d_ptr, cute.AddressSpace.generic
-                        )
-                    tma_desc_epi_ptrs = [
-                        tensormap_manager.get_tensormap_ptr(
-                            tensormap_epi_ptr, cute.AddressSpace.generic
-                        )
-                        for tensormap_epi_ptr in tensormap_epi_ptrs
-                    ]
+                varlen_manager.fence_tensormap_update_epi(is_tma_warp)
 
-
-
-
-
-
-
-
-
-
-
-                        # Convert to D type
-                        acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load()
-                        if const_expr(mC_mnl is not None):
-                            epi_pipeline.consumer_wait(epi_read_state)
-                            cute.copy(
-                                tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC
-                            )
-                            # Fence to make sure shared memory read is visible to TMA load
-                            cute.arch.fence_proxy(
-                                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-                            )
-                            cute.arch.sync_warp()
-                            with cute.arch.elect_one():
-                                epi_pipeline.consumer_release(epi_read_state)
-                            epi_read_state.advance()
-                            acc_vec = acc_vec + tRS_rC.load().to(self.acc_dtype)
-                        tRS_rD.store(acc_vec.to(self.d_dtype))
-                        # Store D to shared memory
-                        d_buffer = (num_prev_subtiles + subtile_idx) % self.epi_stage
-                        cute.copy(tiled_copy_r2s, tRS_rD, tRS_sD[(None, None, None, d_buffer)])
-                        # Fence and barrier to make sure shared memory store is visible to TMA store
-                        cute.arch.fence_proxy(
-                            cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+                copy_D = None
+                if const_expr(has_D):
+                    copy_D, _, _ = self.epilog_gmem_copy_and_partition(
+                        tma_atom_d,
+                        varlen_manager.offset_batch_epi(mD_mnl, batch_idx),
+                        self.cta_tile_shape_mnk[:2],
+                        epi_tile,
+                        sD,
+                        tile_coord_mnkl,
+                        tma_desc_ptr=tma_desc_d_ptr,
                     )
-
-
-
-
-
-
-
-
+                copy_C = None  # We're using a separate warp to load C
+
+                tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc))
+                k_len = varlen_manager.len_k(batch_idx)
+                load_acc_subtile = partial(
+                    self.epi_load_acc_subtile,
+                    tiled_copy_t2r,
+                    tiled_copy_r2s,
+                    tTR_tAcc,
+                    tTR_rAcc,
+                    clear_acc=varlen_k and k_len == 0,
+                )
+
+                epi_read_state, _ = self.epilogue(
+                    epilogue_params,
+                    epi_smem_tensors,
+                    tma_desc_epi_ptrs,
+                    epi_pipeline,
+                    epi_store_pipeline,
+                    epi_read_state,
+                    None,  # epi_producer_state
+                    epi_tile,
+                    load_acc_subtile,
+                    tRS_rD,
+                    tRS_rC,
+                    tiled_copy_t2r,
+                    tiled_copy_r2s,
+                    tRS_sD,
+                    tiled_copy_s2r,
+                    tSR_rC,
+                    tSR_sC,
+                    copy_D,
+                    copy_C,
+                    tile_coord_mnkl,
+                    varlen_manager,
+                    epilogue_barrier,
+                    tile_scheduler,
+                    epi_tidx,
+                    is_tma_warp,
+                )
 
                 # Async arrive accumulator buffer empty
                 with cute.arch.elect_one():
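Note: in the rewritten epilogue above, the per-subtile accumulator load is bound once with functools.partial and handed to the shared self.epilogue path, so only the destination fragment and subtile index are supplied later. A minimal sketch of that binding pattern with placeholder arguments (the real arguments are the tiled copies and tensors shown above):

    from functools import partial

    def epi_load_acc_subtile(copy_t2r, copy_r2s, tTR_tAcc, tTR_rAcc, tRS_rD, epi_idx, clear_acc=False):
        ...  # load one accumulator subtile (see the method added at the end of this diff)

    load_acc_subtile = partial(
        epi_load_acc_subtile, "copy_t2r", "copy_r2s", "tTR_tAcc", "tTR_rAcc", clear_acc=False
    )
    load_acc_subtile("tRS_rD", 0)  # the epilogue supplies only tRS_rD and epi_idx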
@@ -1404,79 +1552,50 @@ class GemmSm100(GemmSm90):
         epi_store_pipeline.producer_tail()
 
     @cute.jit
-    def
+    def load_A_gather_A(
         self,
-
-
-
-
-
-
-
-        tBgB: cute.Tensor,
-        tBsB: cute.Tensor,
-        b_mcast_mask: cutlass.Int16,
-        tma_atom_sfa: Optional[cute.CopyAtom] = None,
-        tAgSFA: Optional[cute.Tensor] = None,
-        tAsSFA: Optional[cute.Tensor] = None,
-        sfa_mcast_mask: Optional[cutlass.Int16] = None,
-        tma_atom_sfb: Optional[cute.CopyAtom] = None,
-        tBgSFB: Optional[cute.Tensor] = None,
-        tBsSFB: Optional[cute.Tensor] = None,
-        sfb_mcast_mask: Optional[cutlass.Int16] = None,
-    ) -> cutlass.pipeline.PipelineState:
-        blockscaled = const_expr(tma_atom_sfa is not None)
-        if const_expr(blockscaled):
-            assert all(x is not None for x in (tma_atom_sfa, tAgSFA, tAsSFA))
-            assert all(x is not None for x in (tma_atom_sfb, tBgSFB, tBsSFB))
-        k_tile_cnt = cute.size(tAgA, mode=[1])
+        a_pipeline: cutlass.pipeline.PipelineAsync,
+        a_producer_state: cutlass.pipeline.PipelineState,
+        a_prefetch_consumer_state: Optional[cutlass.pipeline.PipelineState],
+        copy_A: Callable,
+        prefetch_A: Optional[Callable],
+        k_tile_cnt: Int32,
+    ) -> Tuple[cutlass.pipeline.PipelineState, Optional[cutlass.pipeline.PipelineState]]:
         # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
-
+        peek_a_empty_status = Boolean(True)
         if 0 < k_tile_cnt:
-
+            peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state)
         # /////////////////////////////////////////////////////////////////////////
-        #
+        # cp.async on A
         # /////////////////////////////////////////////////////////////////////////
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-                tBgB[None, k_tile],
-                tBsB[None, ab_producer_state.index],
-                tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                mcast_mask=b_mcast_mask,
-            )
-            if const_expr(blockscaled):
-                cute.copy(
-                    tma_atom_sfa,
-                    tAgSFA[None, ab_producer_state.count],
-                    tAsSFA[None, ab_producer_state.index],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                    mcast_mask=sfa_mcast_mask,
-                )
-                cute.copy(
-                    tma_atom_sfb,
-                    tBgSFB[None, ab_producer_state.count],
-                    tBsSFB[None, ab_producer_state.index],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                    mcast_mask=sfb_mcast_mask,
-                )
-            # Mainloop pipeline's producer commit is a NOP
-            ab_pipeline.producer_commit(ab_producer_state)
-            ab_producer_state.advance()
-            peek_ab_empty_status = Boolean(True)
+        is_tma_warp = False
+        for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
+            smem_idx = a_producer_state.index
+            prefetch_out = ()
+            if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
+                prefetch_out = (prefetch_A(k_tile, smem_idx, a_prefetch_consumer_state),)
+                a_prefetch_consumer_state.advance()
+            a_pipeline.producer_acquire(a_producer_state, peek_a_empty_status, is_tma_warp)
+            copy_A(k_tile, smem_idx, *prefetch_out)
+            # This tells mbarrier to track the completion of cp.async
+            a_pipeline.producer_cpasync_commit(a_producer_state)
+            a_producer_state.advance()
+            peek_a_empty_status = Boolean(True)
             if k_tile + 1 < k_tile_cnt:
-
-
+                peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state)
+        # bound checking in the K dimension on the last k_tile
+        if 0 < k_tile_cnt:
+            k_tile = k_tile_cnt - 1
+            smem_idx = a_producer_state.index
+            prefetch_out = ()
+            if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
+                prefetch_out = (prefetch_A(k_tile, smem_idx, a_prefetch_consumer_state, pred=True),)
+                a_prefetch_consumer_state.advance()
+            a_pipeline.producer_acquire(a_producer_state, peek_a_empty_status, is_tma_warp)
+            copy_A(k_tile, smem_idx, *prefetch_out, pred=True)
+            a_pipeline.producer_cpasync_commit(a_producer_state)
+            a_producer_state.advance()
+        return a_producer_state, a_prefetch_consumer_state
 
     @cute.jit
     def mma(
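Note: load_A_gather_A follows the usual producer-side ring-buffer discipline: try_acquire to peek, acquire before touching a stage, issue the cp.async copies, commit so the mbarrier tracks their completion, then advance the state. A host-side analogy with semaphores, just to make the handshake explicit (illustrative only; the real synchronization is mbarrier-based on the GPU):

    import threading

    class RingPipeline:
        def __init__(self, stages: int):
            self.empty = threading.Semaphore(stages)  # producer_acquire waits on this
            self.full = threading.Semaphore(0)        # consumer_wait waits on this

        def producer_acquire(self):  self.empty.acquire()
        def producer_commit(self):   self.full.release()
        def consumer_wait(self):     self.full.acquire()
        def consumer_release(self):  self.empty.release()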
@@ -1491,6 +1610,9 @@ class GemmSm100(GemmSm90):
         acc: cute.Tensor,
         k_tile_cnt: Int32,
         is_leader_cta: Boolean,
+        cta_rank_in_cluster: Int32,
+        tCtSFA: Optional[cute.Tensor] = None,
+        tCtSFB: Optional[cute.Tensor] = None,
         tiled_copy_s2t_sfa: Optional[cute.TiledCopy] = None,
         tiled_copy_s2t_sfb: Optional[cute.TiledCopy] = None,
         tCsSFA_compact_s2t: Optional[cute.Tensor] = None,
@@ -1500,12 +1622,17 @@ class GemmSm100(GemmSm90):
     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState, cute.TiledMma]:
         blockscaled = const_expr(tiled_copy_s2t_sfa is not None)
         if const_expr(blockscaled):
+            assert all(x is not None for x in (tCtSFA, tCtSFB))
             assert all(x is not None for x in (tiled_copy_s2t_sfa, tiled_copy_s2t_sfb))
             assert all(x is not None for x in (tCsSFA_compact_s2t, tCsSFB_compact_s2t))
             assert all(x is not None for x in (tCtSFA_compact_s2t, tCtSFB_compact_s2t))
+        # If gather_A and use_2cta_instrs, the cp.async for the non-leader CTA will
+        # arrive at an mbarrier on the non-leader CTA side, then the mma warp of the non-leader
+        # CTA will wait for that then arrive at the mbarrier on the leader CTA.
+        need_nonleader_cta = const_expr(self.gather_A and self.use_2cta_instrs)
         # Peek (try_wait) AB buffer full for k_tile = 0
         peek_ab_full_status = Boolean(True)
-        if 0 < k_tile_cnt and is_leader_cta:
+        if 0 < k_tile_cnt and (is_leader_cta or need_nonleader_cta):
             peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
         # Wait for accumulator buffer empty
         if is_leader_cta:
@@ -1515,6 +1642,14 @@ class GemmSm100(GemmSm90):
|
|
|
1515
1642
|
# Mma mainloop
|
|
1516
1643
|
num_k_blocks = cute.size(tCrA, mode=[2])
|
|
1517
1644
|
for k_tile in cutlass.range(k_tile_cnt, unroll=1):
|
|
1645
|
+
if const_expr(need_nonleader_cta):
|
|
1646
|
+
if not is_leader_cta:
|
|
1647
|
+
ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status)
|
|
1648
|
+
with cute.arch.elect_one():
|
|
1649
|
+
# The odd CTA signals the even CTA
|
|
1650
|
+
ab_pipeline.sync_object_full.arrive_mbarrier(
|
|
1651
|
+
ab_consumer_state.index, dst_rank=cta_rank_in_cluster & 0xFE
|
|
1652
|
+
)
|
|
1518
1653
|
if is_leader_cta:
|
|
1519
1654
|
# Conditionally wait for AB buffer full
|
|
1520
1655
|
ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status)
|
|
@@ -1527,6 +1662,11 @@ class GemmSm100(GemmSm90):
|
|
|
1527
1662
|
cute.copy(tiled_copy_s2t_sfb, tCsSFB_compact_s2t_staged, tCtSFB_compact_s2t)
|
|
1528
1663
|
for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
|
|
1529
1664
|
k_blk_coord = (None, None, k_blk_idx, ab_consumer_state.index)
|
|
1665
|
+
if const_expr(blockscaled):
|
|
1666
|
+
# Set SFA/SFB tensor to tiled_mma
|
|
1667
|
+
sf_kblock_coord = (None, None, k_blk_idx)
|
|
1668
|
+
tiled_mma.set(tcgen05.Field.SFA, tCtSFA[sf_kblock_coord].iterator)
|
|
1669
|
+
tiled_mma.set(tcgen05.Field.SFB, tCtSFB[sf_kblock_coord].iterator)
|
|
1530
1670
|
cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
|
|
1531
1671
|
tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
|
|
1532
1672
|
# Async arrive AB buffer empty
|
|
@@ -1534,7 +1674,7 @@ class GemmSm100(GemmSm90):
|
|
|
1534
1674
|
ab_consumer_state.advance()
|
|
1535
1675
|
# Peek (try_wait) AB buffer full for k_tile = k_tile + 1
|
|
1536
1676
|
peek_ab_full_status = Boolean(True)
|
|
1537
|
-
if k_tile + 1 < k_tile_cnt and is_leader_cta:
|
|
1677
|
+
if k_tile + 1 < k_tile_cnt and (is_leader_cta or need_nonleader_cta):
|
|
1538
1678
|
peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state)
|
|
1539
1679
|
# Async arrive accumulator buffer full
|
|
1540
1680
|
if is_leader_cta:
|
|
@@ -1544,6 +1684,25 @@ class GemmSm100(GemmSm90):
|
|
|
1544
1684
|
# "operand #0 does not dominate this use"
|
|
1545
1685
|
return ab_consumer_state, acc_producer_state, tiled_mma
|
|
1546
1686
|
|
|
1687
|
+
@cute.jit
|
|
1688
|
+
def epi_load_acc_subtile(
|
|
1689
|
+
self,
|
|
1690
|
+
tiled_copy_t2r: cute.TiledCopy,
|
|
1691
|
+
tiled_copy_r2s: cute.TiledCopy,
|
|
1692
|
+
tTR_tAcc: cute.Tensor,
|
|
1693
|
+
tTR_rAcc: cute.Tensor,
|
|
1694
|
+
tRS_rD: cute.Tensor,
|
|
1695
|
+
epi_idx: int,
|
|
1696
|
+
clear_acc: Boolean = False,
|
|
1697
|
+
):
|
|
1698
|
+
if not clear_acc:
|
|
1699
|
+
# Load accumulator from tensor memory buffer to register
|
|
1700
|
+
cute.copy(tiled_copy_t2r, tTR_tAcc[None, None, None, epi_idx], tTR_rAcc)
|
|
1701
|
+
tRS_rAcc = tiled_copy_r2s.retile(tTR_rAcc)
|
|
1702
|
+
tRS_rD.store(tRS_rAcc.load())
|
|
1703
|
+
else:
|
|
1704
|
+
tRS_rD.fill(0.0)
|
|
1705
|
+
|
|
1547
1706
|
def mainloop_s2t_copy_and_partition(
|
|
1548
1707
|
self,
|
|
1549
1708
|
sSF: cute.Tensor,
|
|
@@ -1607,8 +1766,8 @@ class GemmSm100(GemmSm90):
|
|
|
1607
1766
|
# Make tiledCopy for tensor memory load
|
|
1608
1767
|
copy_atom_t2r = sm100_utils.get_tmem_load_op(
|
|
1609
1768
|
self.cta_tile_shape_mnk,
|
|
1610
|
-
self.d_layout,
|
|
1611
|
-
self.d_dtype,
|
|
1769
|
+
self.d_layout if self.d_layout is not None else LayoutEnum.ROW_MAJOR,
|
|
1770
|
+
self.d_dtype if self.d_dtype is not None else cutlass.BFloat16,
|
|
1612
1771
|
self.acc_dtype,
|
|
1613
1772
|
epi_tile,
|
|
1614
1773
|
use_2cta_instrs,
|
|
@@ -1631,12 +1790,14 @@ class GemmSm100(GemmSm90):
|
|
|
1631
1790
|
tTR_rAcc = cute.make_fragment(tTR_cAcc[None, None, None, 0, 0].shape, self.acc_dtype)
|
|
1632
1791
|
return tiled_copy_t2r, tTR_tAcc, tTR_rAcc
|
|
1633
1792
|
|
|
1634
|
-
def
|
|
1793
|
+
def epilog_smem_store_and_partition(
|
|
1635
1794
|
self,
|
|
1636
1795
|
tiled_copy_t2r: cute.TiledCopy,
|
|
1796
|
+
d_layout: Optional[LayoutEnum],
|
|
1797
|
+
dtype: Optional[Type[cutlass.Numeric]],
|
|
1637
1798
|
tTR_rD: cute.Tensor,
|
|
1638
|
-
tidx: Int32,
|
|
1639
1799
|
sD: cute.Tensor,
|
|
1800
|
+
tidx: Int32,
|
|
1640
1801
|
) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
|
|
1641
1802
|
"""
|
|
1642
1803
|
Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).
|
|
@@ -1658,83 +1819,106 @@ class GemmSm100(GemmSm90):
|
|
|
1658
1819
|
:rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
|
|
1659
1820
|
"""
|
|
1660
1821
|
copy_atom_r2s = sm100_utils.get_smem_store_op(
|
|
1661
|
-
|
|
1822
|
+
d_layout if d_layout is not None else LayoutEnum.ROW_MAJOR,
|
|
1823
|
+
dtype if dtype is not None else cutlass.BFloat16,
|
|
1824
|
+
self.acc_dtype,
|
|
1825
|
+
tiled_copy_t2r,
|
|
1662
1826
|
)
|
|
1663
1827
|
tiled_copy_r2s = cute.make_tiled_copy_D(copy_atom_r2s, tiled_copy_t2r)
|
|
1664
1828
|
# (R2S, R2S_M, R2S_N, PIPE_D)
|
|
1665
1829
|
thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
|
|
1666
|
-
tRS_sD = thr_copy_r2s.partition_D(sD)
|
|
1830
|
+
tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
|
|
1667
1831
|
# (R2S, R2S_M, R2S_N)
|
|
1668
1832
|
tRS_rD = tiled_copy_r2s.retile(tTR_rD)
|
|
1669
1833
|
return tiled_copy_r2s, tRS_rD, tRS_sD
|
|
1670
1834
|
|
|
1671
|
-
|
|
1672
|
-
# self,
|
|
1673
|
-
# tiled_copy_t2r: cute.TiledCopy,
|
|
1674
|
-
# tTR_rC: cute.Tensor,
|
|
1675
|
-
# tidx: Int32,
|
|
1676
|
-
# sC: cute.Tensor,
|
|
1677
|
-
# ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
|
|
1678
|
-
# copy_atom_s2r = cute.make_copy_atom(
|
|
1679
|
-
# warp.LdMatrix8x8x16bOp(self.c_layout.is_m_major_c(), num_matrices=4),
|
|
1680
|
-
# self.c_dtype, # TODO: this probably only works for f16 for now?
|
|
1681
|
-
# )
|
|
1682
|
-
# # copy_atom_s2r = utils.sm90_get_smem_load_op(self.c_layout, self.c_dtype)
|
|
1683
|
-
# tiled_copy_s2r = cute.make_tiled_copy_D(copy_atom_s2r, tiled_copy_t2r)
|
|
1684
|
-
# # (R2S, R2S_M, R2S_N, PIPE_D)
|
|
1685
|
-
# thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
|
|
1686
|
-
# # (R2S, R2S_M, R2S_N)
|
|
1687
|
-
# tSR_sC = thr_copy_s2r.partition_S(sC)
|
|
1688
|
-
# return tiled_copy_s2r, tSR_sC
|
|
1689
|
-
|
|
1690
|
-
def epilog_gmem_copy_and_partition(
|
|
1835
|
+
def epilog_smem_load_and_partition(
|
|
1691
1836
|
self,
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1837
|
+
tiled_copy_t2r: cute.TiledCopy,
|
|
1838
|
+
c_layout: LayoutEnum,
|
|
1839
|
+
dtype: Type[cutlass.Numeric],
|
|
1840
|
+
# tTR_rC: cute.Tensor,
|
|
1841
|
+
sC: cute.Tensor,
|
|
1842
|
+
tRS_rD_layout: cutlass.Layout,
|
|
1843
|
+
tidx: Int32,
|
|
1844
|
+
) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
|
|
1845
|
+
copy_atom_r2s = sm100_utils.get_smem_store_op(
|
|
1846
|
+
c_layout, dtype, self.acc_dtype, tiled_copy_t2r
|
|
1847
|
+
)
|
|
1848
|
+
store_op = copy_atom_r2s.op
|
|
1849
|
+
# m8n8 16-bit path
|
|
1850
|
+
if isinstance(store_op, StMatrix8x8x16bOp):
|
|
1851
|
+
op = LdMatrix8x8x16bOp(num_matrices=store_op.num_matrices, transpose=store_op.transpose)
|
|
1852
|
+
# m16n8 8-bit store -> m16n16 8-bit load
|
|
1853
|
+
elif isinstance(store_op, StMatrix16x8x8bOp) and store_op.num_matrices in [2, 4]:
|
|
1854
|
+
# transpose=True is enforced by the class
|
|
1855
|
+
op = LdMatrix16x16x8bOp(num_matrices=store_op.num_matrices // 2)
|
|
1856
|
+
else:
|
|
1857
|
+
op = cute.nvgpu.CopyUniversalOp()
|
|
1858
|
+
copy_atom_s2r = cute.make_copy_atom(op, dtype)
|
|
1859
|
+
tiled_copy_s2r = cute.make_tiled_copy_D(copy_atom_s2r, tiled_copy_t2r)
|
|
1860
|
+
thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
|
|
1861
|
+
# (R2S, R2S_M, R2S_N, PIPE_D)
|
|
1862
|
+
tSR_sC = thr_copy_s2r.partition_S(sC)
|
|
1863
|
+
tRS_rC = cute.make_fragment(tRS_rD_layout, dtype)
|
|
1864
|
+
# (R2S, R2S_M, R2S_N)
|
|
1865
|
+
tSR_rC = tiled_copy_s2r.retile(tRS_rC)
|
|
1866
|
+
return tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC
|
|
1709
1867
|
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
#
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1868
|
+
@cute.jit
|
|
1869
|
+
def make_ab_pipeline(
|
|
1870
|
+
self,
|
|
1871
|
+
tiled_mma: cute.TiledMma,
|
|
1872
|
+
cluster_layout_vmnk: cute.Layout,
|
|
1873
|
+
ab_pipeline_mbar_ptr: cute.Pointer,
|
|
1874
|
+
is_leader_cta: Boolean,
|
|
1875
|
+
) -> pipeline.PipelineAsync:
|
|
1876
|
+
# If gather_A and use_2cta_instrs, the cp.async for the non-leader CTA will
|
|
1877
|
+
# arrive at an mbarrier on the non-leader CTA side, then the mma warp of the non-leader
|
|
1878
|
+
# CTA will wait for that then arrive at the mbarrier on the leader CTA.
|
|
1879
|
+
# The producer count for the leader CTA is 1 (TMA) + num_cpasync_threads
|
|
1880
|
+
# + 1 (from non-leader CTA).
|
|
1881
|
+
# The producer count for the non-leader CTA is num_cpasync_threads
|
|
1882
|
+
# (TMA doesn't arrive there).
|
|
1883
|
+
if const_expr(not self.gather_A):
|
|
1884
|
+
producer_cnt = 1
|
|
1885
|
+
else:
|
|
1886
|
+
producer_cnt = (self.num_ab_load_warps - 1) * 32 + (
|
|
1887
|
+
1 if const_expr(not self.use_2cta_instrs) else 2
|
|
1888
|
+
)
|
|
1889
|
+
ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, producer_cnt)
|
|
1890
|
+
# Each warp will contribute to the arrive count with the number of mcast size
|
|
1891
|
+
mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
|
|
1892
|
+
consumer_arrive_cnt = mcast_size
|
|
1893
|
+
ab_pipeline_consumer_group = pipeline.CooperativeGroup(
|
|
1894
|
+
pipeline.Agent.Thread, consumer_arrive_cnt
|
|
1732
1895
|
)
|
|
1733
|
-
|
|
1896
|
+
if const_expr(not self.gather_A):
|
|
1897
|
+
pipeline_ab = pipeline.PipelineTmaUmma.create(
|
|
1898
|
+
barrier_storage=ab_pipeline_mbar_ptr,
|
|
1899
|
+
num_stages=self.ab_stage,
|
|
1900
|
+
producer_group=ab_pipeline_producer_group,
|
|
1901
|
+
consumer_group=ab_pipeline_consumer_group,
|
|
1902
|
+
tx_count=self.num_tma_load_bytes,
|
|
1903
|
+
cta_layout_vmnk=cluster_layout_vmnk,
|
|
1904
|
+
)
|
|
1905
|
+
else:
|
|
1906
|
+
pipeline_ab = PipelineTmaCpAsyncUmma.create(
|
|
1907
|
+
barrier_storage=ab_pipeline_mbar_ptr,
|
|
1908
|
+
num_stages=self.ab_stage,
|
|
1909
|
+
producer_group=ab_pipeline_producer_group,
|
|
1910
|
+
consumer_group=ab_pipeline_consumer_group,
|
|
1911
|
+
tx_count=self.num_tma_load_bytes,
|
|
1912
|
+
cta_layout_vmnk=cluster_layout_vmnk,
|
|
1913
|
+
producer_drop_count=None
|
|
1914
|
+
if not self.use_2cta_instrs
|
|
1915
|
+
else (2 if not is_leader_cta else 0),
|
|
1916
|
+
)
|
|
1917
|
+
return pipeline_ab
|
|
1734
1918
|
|
|
1735
1919
|
def make_acc_pipeline(
|
|
1736
1920
|
self, cluster_layout_vmnk: cute.Layout, acc_pipeline_mbar_ptr: cute.Pointer
|
|
1737
|
-
):
|
|
1921
|
+
) -> pipeline.PipelineAsync:
|
|
1738
1922
|
acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
|
|
1739
1923
|
num_acc_consumer_threads = self.num_epi_warps * (2 if self.use_2cta_instrs else 1)
|
|
1740
1924
|
acc_pipeline_consumer_group = pipeline.CooperativeGroup(
|
|
@@ -1748,19 +1932,70 @@ class GemmSm100(GemmSm90):
|
|
|
1748
1932
|
cta_layout_vmnk=cluster_layout_vmnk,
|
|
1749
1933
|
)
|
|
1750
1934
|
|
|
1751
|
-
|
|
1935
|
+
def make_sched_pipeline(
|
|
1936
|
+
self,
|
|
1937
|
+
cluster_layout_mnk: cute.Layout,
|
|
1938
|
+
sched_pipeline_mbar_ptr: cute.Pointer,
|
|
1939
|
+
has_C: bool = False,
|
|
1940
|
+
) -> pipeline.PipelineAsync:
|
|
1941
|
+
# Threads/warps participating in this pipeline
|
|
1942
|
+
sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
|
|
1943
|
+
cluster_size = cute.size(cluster_layout_mnk)
|
|
1944
|
+
# Each warp that are not the scheduler warp will contribute 1 to the arrive count
|
|
1945
|
+
warps_per_cta = self.num_ab_load_warps + len(
|
|
1946
|
+
(self.mma_warp_id, *self.epilog_warp_id, self.scheduler_warp_id)
|
|
1947
|
+
)
|
|
1948
|
+
if has_C:
|
|
1949
|
+
warps_per_cta += 1
|
|
1950
|
+
consumer_arrive_cnt = warps_per_cta * cluster_size - 1
|
|
1951
|
+
sched_pipeline_consumer_group = pipeline.CooperativeGroup(
|
|
1952
|
+
pipeline.Agent.Thread, consumer_arrive_cnt
|
|
1953
|
+
)
|
|
1954
|
+
return pipeline.PipelineAsync.create(
|
|
1955
|
+
barrier_storage=sched_pipeline_mbar_ptr,
|
|
1956
|
+
num_stages=self.sched_stage,
|
|
1957
|
+
producer_group=sched_pipeline_producer_group,
|
|
1958
|
+
consumer_group=sched_pipeline_consumer_group,
|
|
1959
|
+
# If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
|
|
1960
|
+
consumer_mask=None if const_expr(cluster_size == 1) else 0,
|
|
1961
|
+
)
|
|
1962
|
+
|
|
1963
|
+
@cute.jit
|
|
1964
|
+
def make_a_prefetch_pipeline(
|
|
1965
|
+
self, a_prefetch_pipeline_mbar_ptr: cute.Pointer
|
|
1966
|
+
) -> pipeline.PipelineAsync:
|
|
1967
|
+
producer_cnt = 32
|
|
1968
|
+
a_prefetch_producer_group = pipeline.CooperativeGroup(
|
|
1969
|
+
pipeline.Agent.Thread, producer_cnt, alignment=producer_cnt
|
|
1970
|
+
)
|
|
1971
|
+
consumer_arrive_cnt = self.num_ab_load_warps - 1
|
|
1972
|
+
a_prefetch_consumer_group = pipeline.CooperativeGroup(
|
|
1973
|
+
pipeline.Agent.Thread, consumer_arrive_cnt
|
|
1974
|
+
)
|
|
1975
|
+
return pipeline.PipelineCpAsync.create(
|
|
1976
|
+
barrier_storage=a_prefetch_pipeline_mbar_ptr,
|
|
1977
|
+
num_stages=self.a_prefetch_stage,
|
|
1978
|
+
producer_group=a_prefetch_producer_group,
|
|
1979
|
+
consumer_group=a_prefetch_consumer_group,
|
|
1980
|
+
)
|
|
1981
|
+
|
|
1982
|
+
@classmethod
|
|
1752
1983
|
def _compute_stages(
|
|
1984
|
+
cls,
|
|
1753
1985
|
tiled_mma: cute.TiledMma,
|
|
1754
1986
|
mma_tiler_mnk: Tuple[int, int, int],
|
|
1987
|
+
cta_tile_shape_mnk: Tuple[int, int, int],
|
|
1988
|
+
epi_tile: cute.Tile,
|
|
1755
1989
|
a_dtype: Type[cutlass.Numeric],
|
|
1756
1990
|
b_dtype: Type[cutlass.Numeric],
|
|
1757
|
-
epi_tile: cute.Tile,
|
|
1758
|
-
d_dtype: Type[cutlass.Numeric],
|
|
1759
|
-
c_dtype: Optional[Type[cutlass.Numeric]],
|
|
1760
|
-
d_layout: LayoutEnum,
|
|
1761
|
-
c_layout: Optional[LayoutEnum],
|
|
1762
1991
|
sf_dtype: Optional[Type[cutlass.Numeric]],
|
|
1763
1992
|
sf_vec_size: Optional[int],
|
|
1993
|
+
d_dtype: Optional[Type[cutlass.Numeric]],
|
|
1994
|
+
c_dtype: Optional[Type[cutlass.Numeric]],
|
|
1995
|
+
d_layout: Optional[LayoutEnum],
|
|
1996
|
+
c_layout: Optional[LayoutEnum],
|
|
1997
|
+
epilogue_args: EpilogueArguments,
|
|
1998
|
+
prefetch_A_idx: Literal[None, "varlen_m", "varlen_k"],
|
|
1764
1999
|
smem_capacity: int,
|
|
1765
2000
|
occupancy: int,
|
|
1766
2001
|
) -> Tuple[int, int, int]:
|
|
@@ -1778,7 +2013,7 @@ class GemmSm100(GemmSm90):
|
|
|
1778
2013
|
:type epi_tile: cute.Tile
|
|
1779
2014
|
:param d_dtype: Data type of operand C (output).
|
|
1780
2015
|
:type d_dtype: type[cutlass.Numeric]
|
|
1781
|
-
:param d_layout: Layout enum of operand
|
|
2016
|
+
:param d_layout: Layout enum of operand D.
|
|
1782
2017
|
:type d_layout: LayoutEnum
|
|
1783
2018
|
:param smem_capacity: Total available shared memory capacity in bytes.
|
|
1784
2019
|
:type smem_capacity: int
|
|
@@ -1797,8 +2032,8 @@ class GemmSm100(GemmSm90):
|
|
|
1797
2032
|
num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2
|
|
1798
2033
|
|
|
1799
2034
|
# Default D stages
|
|
1800
|
-
epi_stage = 2
|
|
1801
|
-
epi_c_stage =
|
|
2035
|
+
epi_stage = 4 if cute.size(epi_tile[1]) <= 16 else 2
|
|
2036
|
+
epi_c_stage = 0 if c_dtype is None else (4 if cute.size(epi_tile[1]) <= 16 else 2)
|
|
1802
2037
|
|
|
1803
2038
|
# Calculate smem layout and size for one stage of A, B, and C
|
|
1804
2039
|
a_smem_layout_staged_one = sm100_utils.make_smem_layout_a(
|
|
@@ -1813,7 +2048,11 @@ class GemmSm100(GemmSm90):
|
|
|
1813
2048
|
b_dtype,
|
|
1814
2049
|
1, # a tmp 1 stage is provided
|
|
1815
2050
|
)
|
|
1816
|
-
d_smem_layout_staged_one =
|
|
2051
|
+
d_smem_layout_staged_one = (
|
|
2052
|
+
sm100_utils.make_smem_layout_epi(d_dtype, d_layout, epi_tile, 1)
|
|
2053
|
+
if d_dtype is not None
|
|
2054
|
+
else None
|
|
2055
|
+
)
|
|
1817
2056
|
c_smem_layout_staged_one = (
|
|
1818
2057
|
sm100_utils.make_smem_layout_epi(c_dtype, c_layout, epi_tile, 1)
|
|
1819
2058
|
if c_dtype is not None
|
|
@@ -1836,13 +2075,22 @@ class GemmSm100(GemmSm90):
|
|
|
1836
2075
|
ab_bytes_per_stage = cute.size_in_bytes(
|
|
1837
2076
|
a_dtype, a_smem_layout_staged_one
|
|
1838
2077
|
) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one)
|
|
2078
|
+
if const_expr(prefetch_A_idx == "varlen_k"): # Need smem to prefetch A indices
|
|
2079
|
+
ab_bytes_per_stage += Int32.width // 8 * cta_tile_shape_mnk[2]
|
|
1839
2080
|
if const_expr(blockscaled):
|
|
1840
2081
|
ab_bytes_per_stage += cute.size_in_bytes(
|
|
1841
2082
|
sf_dtype, sfa_smem_layout_staged_one
|
|
1842
2083
|
) + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
|
|
1843
2084
|
mbar_helpers_bytes = 1024
|
|
1844
|
-
|
|
1845
|
-
|
|
2085
|
+
if const_expr(prefetch_A_idx == "varlen_m"):
|
|
2086
|
+
mbar_helpers_bytes += Int32.width // 8 * cta_tile_shape_mnk[0] * 2
|
|
2087
|
+
d_bytes_per_stage = (
|
|
2088
|
+
cute.size_in_bytes(d_dtype, d_smem_layout_staged_one) if d_dtype is not None else 0
|
|
2089
|
+
)
|
|
2090
|
+
epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
|
|
2091
|
+
epilogue_args, cta_tile_shape_mnk, epi_tile
|
|
2092
|
+
)
|
|
2093
|
+
epi_bytes = epi_bytes_per_stage * epi_stage
|
|
1846
2094
|
if const_expr(c_dtype is not None):
|
|
1847
2095
|
c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
|
|
1848
2096
|
epi_bytes += c_bytes_per_stage * epi_c_stage
|
|
@@ -1851,18 +2099,13 @@ class GemmSm100(GemmSm90):
|
|
|
1851
2099
|
# Start with total smem per CTA (capacity / occupancy)
|
|
1852
2100
|
# Subtract reserved bytes and initial C stages bytes
|
|
1853
2101
|
# Divide remaining by bytes needed per A/B/SFA/SFB stage
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
) // ab_bytes_per_stage
|
|
2102
|
+
remaining_bytes = smem_capacity // occupancy - mbar_helpers_bytes - epi_bytes
|
|
2103
|
+
ab_stage = remaining_bytes // ab_bytes_per_stage
|
|
1857
2104
|
|
|
1858
2105
|
# Refine epilogue stages:
|
|
1859
2106
|
# Calculate remaining smem after allocating for A/B stages and reserved bytes
|
|
1860
2107
|
# Add remaining unused smem to epilogue
|
|
1861
|
-
epi_stage += (
|
|
1862
|
-
smem_capacity
|
|
1863
|
-
- occupancy * ab_bytes_per_stage * ab_stage
|
|
1864
|
-
- occupancy * (mbar_helpers_bytes + epi_bytes)
|
|
1865
|
-
) // (occupancy * d_bytes_per_stage)
|
|
2108
|
+
epi_stage += (remaining_bytes - ab_bytes_per_stage * ab_stage) // (epi_bytes_per_stage)
|
|
1866
2109
|
return num_acc_stage, ab_stage, epi_stage, epi_c_stage
|
|
1867
2110
|
|
|
1868
2111
|
@staticmethod
|
|
@@ -1891,9 +2134,12 @@ class GemmSm100(GemmSm90):
|
|
|
1891
2134
|
|
|
1892
2135
|
@staticmethod
|
|
1893
2136
|
def is_valid_dtypes(
|
|
1894
|
-
|
|
2137
|
+
a_dtype: Type[cutlass.Numeric],
|
|
2138
|
+
b_dtype: Type[cutlass.Numeric],
|
|
1895
2139
|
acc_dtype: Type[cutlass.Numeric],
|
|
1896
|
-
d_dtype: Type[cutlass.Numeric],
|
|
2140
|
+
d_dtype: Optional[Type[cutlass.Numeric]],
|
|
2141
|
+
a_major: str,
|
|
2142
|
+
b_major: str,
|
|
1897
2143
|
) -> bool:
|
|
1898
2144
|
"""
|
|
1899
2145
|
Check if the dtypes are valid
|
|
@@ -1909,6 +2155,9 @@ class GemmSm100(GemmSm90):
|
|
|
1909
2155
|
:rtype: bool
|
|
1910
2156
|
"""
|
|
1911
2157
|
is_valid = True
|
|
2158
|
+
if b_dtype != a_dtype:
|
|
2159
|
+
is_valid = False
|
|
2160
|
+
ab_dtype = a_dtype
|
|
1912
2161
|
if ab_dtype not in {
|
|
1913
2162
|
cutlass.Float16,
|
|
1914
2163
|
cutlass.BFloat16,
|
|
@@ -1927,7 +2176,7 @@ class GemmSm100(GemmSm90):
|
|
|
1927
2176
|
and ab_dtype not in {cutlass.Uint8, cutlass.Int8}
|
|
1928
2177
|
):
|
|
1929
2178
|
is_valid = False
|
|
1930
|
-
if (
|
|
2179
|
+
if d_dtype is not None and (
|
|
1931
2180
|
acc_dtype == Float32
|
|
1932
2181
|
and d_dtype
|
|
1933
2182
|
not in {
|
|
@@ -1958,6 +2207,8 @@ class GemmSm100(GemmSm90):
|
|
|
1958
2207
|
}
|
|
1959
2208
|
):
|
|
1960
2209
|
is_valid = False
|
|
2210
|
+
if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
|
|
2211
|
+
is_valid = False
|
|
1961
2212
|
return is_valid
|
|
1962
2213
|
|
|
1963
2214
|
@staticmethod
|
|
@@ -2014,34 +2265,6 @@ class GemmSm100(GemmSm90):
|
|
|
2014
2265
|
|
|
2015
2266
|
return is_valid
|
|
2016
2267
|
|
|
2017
|
-
@staticmethod
|
|
2018
|
-
def is_valid_layouts(
|
|
2019
|
-
ab_dtype: Type[cutlass.Numeric],
|
|
2020
|
-
a_major: str,
|
|
2021
|
-
b_major: str,
|
|
2022
|
-
) -> bool:
|
|
2023
|
-
"""
|
|
2024
|
-
Check if the dtypes and sf_vec_size are valid combinations
|
|
2025
|
-
|
|
2026
|
-
:param ab_dtype: The data type of the A and B operands
|
|
2027
|
-
:type ab_dtype: Type[cutlass.Numeric]
|
|
2028
|
-
:param d_dtype: The data type of the output tensor
|
|
2029
|
-
:type d_dtype: Type[cutlass.Numeric]
|
|
2030
|
-
:param a_major: The major dimension of the A tensor
|
|
2031
|
-
:type a_major: str
|
|
2032
|
-
:param b_major: The major dimension of the B tensor
|
|
2033
|
-
:type b_major: str
|
|
2034
|
-
:param d_major: The major dimension of the C tensor
|
|
2035
|
-
:type d_major: str
|
|
2036
|
-
|
|
2037
|
-
:return: True if the layouts are valid, False otherwise
|
|
2038
|
-
:rtype: bool
|
|
2039
|
-
"""
|
|
2040
|
-
is_valid = True
|
|
2041
|
-
if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
|
|
2042
|
-
is_valid = False
|
|
2043
|
-
return is_valid
|
|
2044
|
-
|
|
2045
2268
|
@staticmethod
|
|
2046
2269
|
def is_valid_mma_tiler_and_cluster_shape(
|
|
2047
2270
|
mma_tiler_mn: Tuple[int, int],
|
|
@@ -2187,7 +2410,7 @@ class GemmSm100(GemmSm90):
|
|
|
2187
2410
|
"""
|
|
2188
2411
|
can_implement = True
|
|
2189
2412
|
# Skip unsupported types
|
|
2190
|
-
if not GemmSm100.is_valid_dtypes(ab_dtype, acc_dtype, d_dtype):
|
|
2413
|
+
if not GemmSm100.is_valid_dtypes(ab_dtype, ab_dtype, acc_dtype, d_dtype, a_major, b_major):
|
|
2191
2414
|
can_implement = False
|
|
2192
2415
|
# Skip invalid mma tile shape and cluster shape
|
|
2193
2416
|
if not GemmSm100.is_valid_mma_tiler_and_cluster_shape(
|
|
@@ -2362,7 +2585,7 @@ def run(
|
|
|
2362
2585
|
|
|
2363
2586
|
# Configure gemm kernel
|
|
2364
2587
|
cluster_shape_mnk = (*cluster_shape_mn, 1)
|
|
2365
|
-
gemm = GemmSm100(acc_dtype, mma_tiler_mn, cluster_shape_mnk)
|
|
2588
|
+
gemm = GemmSm100(acc_dtype, ab_dtype, mma_tiler_mn, cluster_shape_mnk)
|
|
2366
2589
|
|
|
2367
2590
|
# Compute max active clusters on current device
|
|
2368
2591
|
hardware_info = cutlass.utils.HardwareInfo()
|