quack-kernels 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/gemm_symmetric.py
@@ -0,0 +1,330 @@
+ from typing import Tuple, Optional, Callable
+ from functools import partial
+ from torch import Tensor
+ from quack.gemm_act import GemmActMixin, act_fn_map, gemm_act
+ from quack.gemm_sm90 import GemmSm90
+ from quack.gemm_sm100 import GemmSm100
+ from quack.tile_scheduler import TriangularTileScheduler
+ from quack.gemm_wrapper_utils import GemmWrapperBase
+ from quack.cute_dsl_utils import get_device_capacity, get_max_active_clusters
+ from quack.varlen_utils import VarlenManager
+ import quack.copy_utils as copy_utils
+ import cutlass
+ import cutlass.cute as cute
+ import cutlass.torch as cutlass_torch
+ from cutlass.cute.runtime import make_ptr
+ from cutlass import Int32, Float32, Boolean, const_expr
+ import cutlass.utils.hopper_helpers as sm90_utils_og
+ import cutlass.utils.blackwell_helpers as sm100_utils
+ from cutlass.cutlass_dsl import if_generate
+
+
+ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
+     def get_scheduler_class(self, varlen_m: bool = False):
+         return TriangularTileScheduler
+
+     @cute.jit
+     def epilogue(
+         self,
+         params: GemmActMixin.EpilogueParams,
+         epi_smem_tensors: Tuple[cute.Tensor, ...],
+         tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
+         epi_pipeline: cutlass.pipeline.PipelineAsync,
+         epi_store_pipeline: cutlass.pipeline.PipelineAsync,
+         epi_read_state: cutlass.pipeline.PipelineState,
+         epi_producer_state: cutlass.pipeline.PipelineState,
+         epi_tile: cute.Tile,
+         load_acc_subtile: Callable,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor],
+         tiled_copy_t2r: Optional[cute.TiledCopy],  # Only for Sm100
+         tiled_copy_r2s: cute.TiledCopy,
+         tRS_sD: cute.Tensor,
+         tiled_copy_s2r: Optional[cute.TiledCopy],
+         tSR_rC: Optional[cute.Tensor],
+         tSR_sC: Optional[cute.Tensor],
+         copy_D: Optional[Callable],
+         copy_C: Optional[Callable],
+         tile_coord_mnkl: cute.Coord,
+         varlen_manager: VarlenManager,
+         epilogue_barrier: cutlass.pipeline.NamedBarrier,
+         tile_scheduler,
+         tidx: Int32,
+         is_tma_warp: Boolean,
+     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
+         has_C = const_expr(tRS_rC is not None)
+         has_D = const_expr(copy_D is not None)
+
+         tma_atom_postact = params.tma_atom_postact
+         mPostAct_mnl = params.mPostAct_mnl
+         sRowVec, sColVec, sPostAct = epi_smem_tensors
+         get_smem_store_op = (
+             partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
+             if self.arch == 100
+             else sm90_utils_og.sm90_get_smem_store_op
+         )
+         copy_atom_postact_r2s = get_smem_store_op(
+             self.postact_layout, self.postact_dtype, self.acc_dtype
+         )
+         # tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
+         # tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
+         tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
+         tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
+         (tma_desc_postact_ptr,) = tma_desc_epi_ptrs
+         batch_idx = tile_coord_mnkl[3]
+         copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
+             tma_atom_postact,
+             varlen_manager.offset_batch_epi(mPostAct_mnl, batch_idx),
+             self.cta_tile_shape_postact_mn,
+             params.epi_tile_postact,
+             sPostAct,
+             tile_coord_mnkl,
+             tma_desc_ptr=tma_desc_postact_ptr,
+         )
+
+         # We iterate over epi tiles in the N dimension first before the M dimension
+         epi_tile_shape = cute.zipped_divide(
+             cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
+         ).shape[1]
+         epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
+         epi_tile_num = cute.size(epi_tile_shape)
+         num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
+
+         epi_tensors = self.epi_begin(
+             params,
+             epi_smem_tensors,
+             epi_tile,
+             tiled_copy_t2r,
+             tiled_copy_r2s,
+             tile_coord_mnkl,
+             varlen_manager,
+             epilogue_barrier,
+             tidx,
+         )
+
+         if const_expr(copy_C is not None):
+             for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
+                 gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
+                 if is_tma_warp:
+                     epi_pipeline.producer_acquire(epi_producer_state)
+                     copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
+                     epi_pipeline.producer_commit(epi_producer_state)
+                 epi_producer_state.advance()
+
+         def tma_store_fn(src_idx, dst_idx, tile_coord_mnkl):
+             pid_m = tile_coord_mnkl[0]
+             pid_n = tile_coord_mnkl[1]
+             # Fence and barrier to make sure shared memory store is visible to TMA store
+             cute.arch.fence_proxy(
+                 cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+             )
+             epilogue_barrier.arrive_and_wait()
+             # Copy from shared memory to global memory
+             if is_tma_warp:
+                 square_tile_m = pid_m // self.cluster_shape_mnk[0]
+                 square_tile_n = pid_n // self.cluster_shape_mnk[1]
+                 if const_expr(has_D):
+                     copy_D(src_idx=src_idx, dst_idx=dst_idx)
+                 if square_tile_m != square_tile_n:  # don't write twice to the same tile
+                     copy_postact(src_idx=src_idx, dst_idx=dst_idx)
+             # Can't use if statement here, epi_store_pipeline object isn't captured somehow
+             if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
+             if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
+             epilogue_barrier.arrive_and_wait()
+
+         delay_tma_store = True
+
+         src_idx_prev, dst_idx_prev = None, None
+         for epi_idx in cutlass.range_constexpr(epi_tile_num):
+             # The global memory coordinate for the current epi tile
+             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
+             # Copy from acc to D registers
+             load_acc_subtile(tRS_rD, epi_idx)
+             epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
+             if const_expr(has_C):
+                 epi_pipeline.consumer_wait(epi_read_state)
+                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
+                 # Fence to make sure shared memory read is visible to TMA load
+                 cute.arch.fence_proxy(
+                     cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+                 )
+                 cute.arch.sync_warp()
+                 with cute.arch.elect_one():
+                     epi_pipeline.consumer_release(epi_read_state)
+                 epi_read_state.advance()
+             if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
+                 gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
+                 if is_tma_warp:
+                     epi_pipeline.producer_acquire(epi_producer_state)
+                     copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
+                     epi_pipeline.producer_commit(epi_producer_state)
+                 epi_producer_state.advance()
+             tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
+             epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
+             if const_expr(delay_tma_store):
+                 if const_expr(epi_idx > 0):
+                     tma_store_fn(
+                         src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
+                     )
+                 src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
+             # Copy from D registers to shared memory
+             if const_expr(has_D):
+                 copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
+             cute.copy(
+                 tiled_copy_postact_r2s,
+                 tiled_copy_postact_r2s.retile(tRS_rPostAct),
+                 tRS_sPostAct[None, None, None, epi_buffer],
+             )
+             if const_expr(not delay_tma_store):
+                 tma_store_fn(
+                     src_idx=epi_buffer, dst_idx=gmem_coord, tile_coord_mnkl=tile_coord_mnkl
+                 )
+
+         if const_expr(delay_tma_store):
+             tma_store_fn(
+                 src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
+             )
+
+         self.epi_end(
+             params,
+             epi_tensors,
+             epi_tile,
+             tiled_copy_t2r,
+             tiled_copy_r2s,
+             tile_coord_mnkl,
+             varlen_manager,
+             tidx,
+         )
+
+         return epi_read_state, epi_producer_state
+
+
+ class GemmSymmetricSm90(GemmSymmetricMixin, GemmSm90):
+     pass
+
+
+ class GemmSymmetricSm100(GemmSymmetricMixin, GemmSm100):
+     pass
+
+
+ def gemm_symmetric(
+     A: Tensor,  # (l, m, k)
+     B: Tensor,  # (l, m, k)
+     D: Optional[Tensor],  # (l, m, m)
+     C: Optional[Tensor],  # (l, m, m)
+     tile_count_semaphore: Optional[Tensor],  # (1,)
+     tile_M: int,
+     tile_N: int,
+     cluster_M: int,
+     cluster_N: int,
+     pingpong: bool = False,
+     persistent: bool = True,
+     max_swizzle_size: int = 8,
+     alpha: float | Tensor = 1.0,
+     beta: float | Tensor = 1.0,
+ ) -> None:
+     # Transpose D so the "activation" is a write to the mirrored tile
+     PostAct = D.mT
+
+     L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+         A, B, D, C, additional_tensors={"PostAct": PostAct}
+     )
+     assert M == N, "M and N must be the same; symmetric gemm only supports square matrices"
+     GemmWrapperBase.permute_tensors(tensor_infos)
+     GemmWrapperBase.extract_dtypes(tensor_infos)
+     major_configs = {
+         "A": ("m", "k", "l"),
+         "B": ("n", "k", "l"),
+         "D": ("m", "n", "l"),
+         "C": ("m", "n", "l"),
+         "PostAct": ("m", "n", "l"),
+     }
+     GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+     device_capacity = get_device_capacity(A.device)
+     assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
+     GemmCls = GemmSymmetricSm90 if device_capacity[0] == 9 else GemmSymmetricSm100
+
+     acc_dtype = Float32
+     tile_shape_mn = (tile_M, tile_N)
+     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+     if not GemmCls.is_valid_dtypes(
+         tensor_infos["A"].dtype,
+         tensor_infos["B"].dtype,
+         acc_dtype,
+         tensor_infos["D"].dtype,
+         tensor_infos["A"].major,
+         tensor_infos["B"].major,
+     ):
+         raise TypeError("Skipping due to unsupported combination of types and majors")
+
+     max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+     GemmWrapperBase.create_cute_tensors({k: v for k, v in tensor_infos.items()}, major_configs)
+
+     def scalar_arg(scalar: float | Tensor):
+         if isinstance(scalar, float):
+             return Float32(scalar) if scalar != 1.0 else None
+         else:
+             assert isinstance(scalar, Tensor)
+             return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
+
+     activation = None  # Equivalent to identity
+     act_fn = act_fn_map[activation]
+     epi_args = GemmCls.EpilogueArguments(
+         tensor_infos["PostAct"].cute_tensor, act_fn, scalar_arg(alpha), scalar_arg(beta)
+     )
+     scheduler_args = GemmWrapperBase.create_scheduler_args(
+         max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
+     )
+     varlen_args = None
+
+     current_stream = cutlass_torch.current_stream()
+     compile_key = GemmWrapperBase.get_compile_key(
+         tensor_infos,
+         activation,
+         tile_shape_mn,
+         cluster_shape_mnk,
+         pingpong,
+         persistent,
+         tile_count_semaphore is not None,
+         device_capacity,
+         max_swizzle_size,
+         2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
+         2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
+         key_tensor_names=("A", "B", "D", "PostAct", "C"),
+     )
+     cache = gemm_act.compile_cache
+     if compile_key not in cache:
+         if device_capacity[0] == 9:
+             GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
+         gemm_obj = GemmCls(
+             acc_dtype,
+             tensor_infos["A"].dtype,
+             tile_shape_mn,
+             cluster_shape_mnk,
+             gather_A=False,
+         )
+         cache[compile_key] = cute.compile(
+             gemm_obj,
+             tensor_infos["A"].cute_tensor,
+             tensor_infos["B"].cute_tensor,
+             tensor_infos["D"].cute_tensor,
+             tensor_infos["C"].cute_tensor,
+             epi_args,
+             scheduler_args,
+             varlen_args,
+             current_stream,
+         )
+     cache[compile_key](
+         tensor_infos["A"].cute_tensor,
+         tensor_infos["B"].cute_tensor,
+         tensor_infos["D"].cute_tensor,
+         tensor_infos["C"].cute_tensor,
+         epi_args,
+         scheduler_args,
+         varlen_args,
+         current_stream,
+     )
+
+
+ gemm_act.compile_cache = {}
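
For orientation, here is a minimal, hypothetical call into the new gemm_symmetric entry point. It is not part of the diff: the tensor shapes follow the comments in the signature above, while the dtype, tile sizes, and cluster shape are illustrative assumptions rather than values taken from the package.

import torch
from quack.gemm_symmetric import gemm_symmetric

# Batched symmetric GEMM on an SM90/SM100 GPU (asserted in the wrapper above).
# Per batch, A and B are (m, k) and D is the (m, m) output; the transposed
# PostAct store above writes the mirrored tile (skipping diagonal tiles).
l, m, k = 2, 1024, 512
A = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
D = torch.empty(l, m, m, device="cuda", dtype=torch.bfloat16)

gemm_symmetric(
    A, B, D, C=None, tile_count_semaphore=None,
    tile_M=128, tile_N=128,    # assumed tile sizes
    cluster_M=1, cluster_N=1,  # assumed cluster shape
)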
quack/gemm_wrapper_utils.py
@@ -2,6 +2,7 @@
  from typing import Optional, Tuple, Dict, Any
  from dataclasses import dataclass

+ import torch
  from torch import Tensor

  import cutlass.cute as cute
@@ -9,7 +10,8 @@ from cutlass import Int32
  from cutlass.cute.runtime import from_dlpack, make_ptr

  from quack.cute_dsl_utils import torch2cute_dtype_map
- from quack.dense_gemm_sm90 import TileSchedulerOptions
+ from quack.varlen_utils import VarlenArguments
+ from quack.tile_scheduler import TileSchedulerOptions


  @dataclass
@@ -22,8 +24,8 @@ class GemmTensorInfo:

  class GemmWrapperBase:
      @staticmethod
-     def validate_tensor_3d(tensor: Tensor, name: str) -> None:
-         assert tensor.dim() == 3 and tensor.is_cuda, f"{name} must be a 3D CUDA tensor"
+     def validate_tensor(tensor: Tensor, name: str, ndim: int) -> None:
+         assert tensor.dim() == ndim and tensor.is_cuda, f"{name} must be a {ndim}D CUDA tensor"
          assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"

      @staticmethod
@@ -47,7 +49,7 @@ class GemmWrapperBase:
      ) -> Optional[cute.Tensor]:
          if tensor is None:
              return None
-         # Tensor is already permuted to (dims[0], dims[1], dims[2])
+         # Tensor is already permuted to (dims[0], dims[1], dims[2]) or (dim[0], dim[1])
          # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
          leading_dim = 1 if major == dims[1] else 0
          return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
@@ -61,43 +63,131 @@ class GemmWrapperBase:
          D: Optional[Tensor] = None,
          C: Optional[Tensor] = None,
          additional_tensors: Optional[Dict[str, Tensor]] = None,
+         cu_seqlens_m: Optional[Tensor] = None,
+         cu_seqlens_k: Optional[Tensor] = None,
+         A_idx: Optional[Tensor] = None,
      ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
-         GemmWrapperBase.validate_tensor_3d(A, "A")
-         L, M, K = A.shape
-         GemmWrapperBase.validate_tensor_3d(B, "B")
-         _, N, _ = B.shape
+         assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
+             "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
+         )
          assert B.dtype == A.dtype, "A and B must have the same dtype"
-         GemmWrapperBase.validate_shape(B, (L, N, K), "B")
+
+         # Validate A_idx if provided (for gather_A case)
+         gather_A = A_idx is not None
+         if gather_A:
+             assert cu_seqlens_m is not None or cu_seqlens_k is not None, (
+                 "gather_A requires either varlen_m or varlen_k"
+             )
+             assert A_idx.dtype == torch.int32, f"A_idx must be int32, got {A_idx.dtype}"
+             assert A_idx.dim() == 1, f"A_idx must be 1D, got {A_idx.dim()}D"
+
+         # Determine mode and extract dimensions
+         if cu_seqlens_m is not None:
+             # varlen_m: A is (total_m, k) or (whatever, k) if gather_A, B is (l, n, k), D/C are (total_m, n)
+             assert A.dim() == 2, f"A must be 2D when using varlen_m, got {A.dim()}D"
+             assert B.dim() == 3, f"B must be 3D with varlen_m, got {B.dim()}D"
+
+             if gather_A:
+                 # When gather_A, A can have any number of rows, we use A_idx.shape[0] as total_M
+                 total_M = A_idx.shape[0]
+                 _, K = A.shape
+             else:
+                 total_M, K = A.shape
+
+             L, N, K_B = B.shape
+             assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
+             assert cu_seqlens_m.shape == (L + 1,), (
+                 f"cu_seqlens_m must have shape ({L + 1},), got {cu_seqlens_m.shape}"
+             )
+             M = total_M
+             dc_shape = (total_M, N)
+             dc_ndim = 2
+         elif cu_seqlens_k is not None:
+             # varlen_k: A is (m, total_k) or (m, whatever) if gather_A, B is (n, total_k), D/C are (l, m, n)
+             assert A.dim() == 2, f"A must be 2D when using varlen_k, got {A.dim()}D"
+             assert B.dim() == 2, f"B must be 2D with varlen_k, got {B.dim()}D"
+
+             if gather_A:
+                 # When gather_A with varlen_k, A can have any number of columns, we use A_idx.shape[0] as total_K
+                 M, _ = A.shape
+                 total_K = A_idx.shape[0]
+             else:
+                 M, total_K = A.shape
+
+             N, K_B = B.shape
+             assert total_K == K_B, f"K dimension mismatch: expected {total_K}, B has {K_B}"
+             L = cu_seqlens_k.shape[0] - 1
+             assert cu_seqlens_k.shape == (L + 1,), (
+                 f"cu_seqlens_k must have shape ({L + 1},), got {cu_seqlens_k.shape}"
+             )
+             K = total_K
+             dc_shape = (L, M, N)
+             dc_ndim = 3
+         else:
+             # Normal case - all tensors must be 3D
+             GemmWrapperBase.validate_tensor(A, "A", 3)
+             GemmWrapperBase.validate_tensor(B, "B", 3)
+             L, M, K = A.shape
+             _, N, K_B = B.shape
+             assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
+             GemmWrapperBase.validate_shape(B, (L, N, K), "B")
+             dc_shape = (L, M, N)
+             dc_ndim = 3
+
+         # Validate D and C shapes uniformly
+         for tensor, name in [(D, "D"), (C, "C")]:
+             if tensor is not None:
+                 assert tensor.dim() == dc_ndim, (
+                     f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
+                 )
+                 assert tensor.shape == dc_shape, (
+                     f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
+                 )
+
          tensors = {
              "A": GemmTensorInfo(A),
              "B": GemmTensorInfo(B),
              "D": GemmTensorInfo(D),
              "C": GemmTensorInfo(C),
          }
-         if D is not None:
-             GemmWrapperBase.validate_tensor_3d(D, "D")
-             GemmWrapperBase.validate_shape(D, (L, M, N), "D")
-         if C is not None:
-             GemmWrapperBase.validate_tensor_3d(C, "C")
-             GemmWrapperBase.validate_shape(C, (L, M, N), "C")
+
          if additional_tensors:
              for name, tensor in additional_tensors.items():
                  if tensor is not None:
-                     GemmWrapperBase.validate_tensor_3d(tensor, name)
-                     GemmWrapperBase.validate_shape(tensor, (L, M, N), name)
+                     assert tensor.dim() == dc_ndim, (
+                         f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
+                     )
+                     assert tensor.shape == dc_shape, (
+                         f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
+                     )
                      tensors[name] = GemmTensorInfo(tensor)

          return L, M, K, N, tensors

      @staticmethod
-     def permute_tensors(tensors: Dict[str, GemmTensorInfo]) -> None:
-         for info in tensors.values():
-             if info.tensor is not None:
-                 info.tensor = info.tensor.permute(1, 2, 0)
+     def permute_tensors(
+         tensors: Dict[str, GemmTensorInfo], varlen_m: bool = False, varlen_k: bool = False
+     ) -> None:
+         # Determine which tensors need permutation
+         if varlen_m:
+             # Only B needs permutation (3D tensor)
+             tensors_to_permute = ["B"]
+         elif varlen_k:
+             # Only D and C need permutation (3D tensors)
+             tensors_to_permute = ["D", "C"]
+         else:
+             # All tensors need permutation
+             tensors_to_permute = None
+
+         # Apply permutation from (L, *, *) -> (*, *, L) for selected tensors
+         for name, info in tensors.items():
+             if info.tensor is not None and info.tensor.ndim == 3:
+                 if tensors_to_permute is None or name in tensors_to_permute:
+                     info.tensor = info.tensor.permute(1, 2, 0)

      @staticmethod
      def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
-         for info in tensors.values():
+         for name, info in tensors.items():
             if info.tensor is not None:
                 info.dtype = torch2cute_dtype_map[info.tensor.dtype]
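
To make the varlen_m shape conventions asserted above concrete, here is a small hypothetical setup; the sizes and dtype are invented for illustration and are not taken from the package.

import torch
from quack.gemm_wrapper_utils import GemmWrapperBase

# Three variable-length groups packed along M: seqlens 100, 40, 60 -> total_M = 200.
cu_seqlens_m = torch.tensor([0, 100, 140, 200], dtype=torch.int32, device="cuda")
L, N, K = 3, 256, 128
A = torch.randn(200, K, device="cuda", dtype=torch.bfloat16)   # A is 2D in varlen_m mode
B = torch.randn(L, N, K, device="cuda", dtype=torch.bfloat16)  # B stays 3D
D = torch.empty(200, N, device="cuda", dtype=torch.bfloat16)   # D/C collapse to (total_M, N)

L_, M_, K_, N_, infos = GemmWrapperBase.validate_and_prepare_tensors(
    A, B, D, None, cu_seqlens_m=cu_seqlens_m
)
# With varlen_m, permute_tensors only moves B from (L, N, K) to (N, K, L); A and D stay 2D.
GemmWrapperBase.permute_tensors(infos, varlen_m=True)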
 
@@ -121,7 +211,10 @@

      @staticmethod
      def create_scheduler_args(
-         max_active_clusters: int, tile_count_semaphore: Optional[Tensor] = None
+         max_active_clusters: int,
+         tile_count_semaphore: Optional[Tensor] = None,
+         batch_idx_permute: Optional[Tensor] = None,
+         max_swizzle_size: int = 8,
      ) -> TileSchedulerOptions:
          return TileSchedulerOptions(
              Int32(max_active_clusters),
@@ -130,6 +223,72 @@
              )
              if tile_count_semaphore is not None
              else None,
+             batch_idx_permute=(
+                 from_dlpack(batch_idx_permute, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+             )
+             if batch_idx_permute is not None
+             else None,
+             max_swizzle_size=Int32(max_swizzle_size),
+         )
+
+     @staticmethod
+     def create_varlen_args(
+         cu_seqlens_m: Optional[Tensor],
+         cu_seqlens_k: Optional[Tensor],
+         A_idx: Optional[Tensor],
+         max_active_clusters: int,
+         cluster_shape_mnk: Tuple[int, int, int],
+         tensors: Dict[str, GemmTensorInfo],
+         num_epi_tensormaps: int = 0,
+         pingpong: bool = False,
+     ) -> Optional[Any]:
+         if cu_seqlens_m is None and cu_seqlens_k is None:
+             return None
+         # When varlen_m, we assume persistent=True
+         # Grid size depends on num_active_clusters and cluster size
+         cluster_size = cluster_shape_mnk[0] * cluster_shape_mnk[1]
+         num_blocks = max_active_clusters * cluster_size
+         # Calculate number of tensormaps needed
+         if cu_seqlens_m is not None:
+             # For varlen_m: need tensormaps for D and epilogue tensors
+             num_tensormaps = num_epi_tensormaps * (1 if not pingpong else 2)
+             if tensors["D"].tensor is not None:
+                 num_tensormaps += 1 if not pingpong else 2  # D tensormap
+         else:
+             # For varlen_k: need tensormaps for A & B
+             num_tensormaps = 2 if A_idx is None else 1
+         # Create tensormap buffer (each tensormap is 128 bytes = 16 int64s)
+         tensormap_size = 128 // 8  # 16 int64s
+         if num_tensormaps > 0:
+             device = cu_seqlens_m.device if cu_seqlens_m is not None else cu_seqlens_k.device
+             tensormaps = torch.empty(
+                 (num_blocks, num_tensormaps, tensormap_size),
+                 dtype=torch.int64,
+                 device=device,
+             )
+             tensormaps_cute = from_dlpack(tensormaps, assumed_align=128).mark_compact_shape_dynamic(
+                 mode=0, stride_order=(0, 1, 2)
+             )
+         else:
+             tensormaps_cute = None
+
+         return VarlenArguments(
+             mCuSeqlensM=(
+                 from_dlpack(cu_seqlens_m, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                 if cu_seqlens_m is not None
+                 else None
+             ),
+             mCuSeqlensK=(
+                 from_dlpack(cu_seqlens_k, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                 if cu_seqlens_k is not None
+                 else None
+             ),
+             mTensormaps=tensormaps_cute,
+             mAIdx=(
+                 from_dlpack(A_idx, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                 if A_idx is not None
+                 else None
+             ),
          )

      @staticmethod
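
Continuing the hypothetical varlen_m sketch above, create_varlen_args would then build the VarlenArguments and tensormap workspace; max_active_clusters and the cluster shape below are assumed values, not defaults from the package.

varlen_args = GemmWrapperBase.create_varlen_args(
    cu_seqlens_m=cu_seqlens_m,
    cu_seqlens_k=None,
    A_idx=None,
    max_active_clusters=132,      # assumed; normally derived via get_max_active_clusters()
    cluster_shape_mnk=(2, 1, 1),
    tensors=infos,
    num_epi_tensormaps=0,
)
# With D present and pingpong=False this sizes one tensormap per CTA:
# num_blocks = 132 * (2 * 1) = 264, giving a (264, 1, 16) int64 buffer
# (16 int64s = 128 bytes per tensormap, as computed above).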