quack-kernels 0.1.11__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/fast_math.py CHANGED
@@ -1,6 +1,7 @@
  # Copyright (c) 2025, Tri Dao.
 
  from typing import Tuple
+ from dataclasses import dataclass
 
  import cutlass
  import cutlass.cute as cute
@@ -8,6 +9,8 @@ from cutlass import Int32, Uint32
  from cutlass.cutlass_dsl import T, dsl_user_op
  from cutlass._mlir.dialects import llvm
 
+ from quack.cute_dsl_utils import ParamsBase
+
 
  @cute.jit
  def clz(x: Int32) -> Int32:
@@ -45,18 +48,15 @@ def umulhi(a: Int32, b: Int32, *, loc=None, ip=None) -> Uint32:
      )
 
 
- class FastDivmod:
-     def __init__(
-         self, divisor: Int32, multipler: Uint32, shift_right: Uint32, *, loc=None, ip=None
-     ):
-         self.divisor = divisor
-         self.multiplier = multipler
-         self.shift_right = shift_right
-         self._loc = loc
+ @dataclass
+ class FastDivmod(ParamsBase):
+     divisor: Int32
+     multiplier: Uint32
+     shift_right: Uint32
 
      # called by host
      @staticmethod
-     def create(divisor: Int32, *, loc=None, ip=None) -> "FastDivmod":
+     def create(divisor: Int32) -> "FastDivmod":
          """Construct the FastDivmod object, in host code.
          This precomputes some values based on the divisor and is computationally expensive.
          """
@@ -64,7 +64,7 @@ class FastDivmod:
          divisor_u32 = Uint32(divisor)
          multiplier = Uint32(((cutlass.Uint64(1) << p) + divisor_u32 - 1) // divisor_u32)
          shift_right = Uint32(p - 32)
-         return FastDivmod(divisor, multiplier, shift_right, loc=loc, ip=ip)
+         return FastDivmod(divisor, multiplier, shift_right)
 
      @cute.jit
      def div(self, dividend: Int32) -> Int32:
@@ -78,20 +78,3 @@ class FastDivmod:
          quotient = self.div(dividend)
          remainder = dividend - quotient * self.divisor
          return quotient, remainder
-
-     def __extract_mlir_values__(self):
-         values, self._values_pos = [], []
-         for obj in [self.divisor, self.multiplier, self.shift_right]:
-             obj_values = cutlass.extract_mlir_values(obj)
-             values += obj_values
-             self._values_pos.append(len(obj_values))
-         return values
-
-     def __new_from_mlir_values__(self, values):
-         obj_list = []
-         for obj, n_items in zip(
-             [self.divisor, self.multiplier, self.shift_right], self._values_pos
-         ):
-             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
-             values = values[n_items:]
-         return FastDivmod(*(tuple(obj_list)), loc=self._loc)
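This refactor swaps the hand-written `__extract_mlir_values__`/`__new_from_mlir_values__` plumbing for the shared `ParamsBase` implementation; the multiply-high division trick itself is unchanged. A minimal plain-Python sketch of that trick follows (the hunk elides the line computing `p`, so the choice below follows CUTLASS's `find_divisor` convention and is an assumption):

```python
# Illustrative only: the kernel version works on cutlass Int32/Uint32 via umulhi.
def fast_divmod_create(divisor: int):
    # Pick p so that multiplier = ceil(2^p / divisor) makes the
    # multiply-high-then-shift below exact (CUTLASS find_divisor convention).
    p = 31 + divisor.bit_length()
    multiplier = ((1 << p) + divisor - 1) // divisor
    shift_right = p - 32
    return divisor, multiplier, shift_right

def fast_divmod(dividend: int, divisor: int, multiplier: int, shift_right: int):
    # (dividend * multiplier) >> 32 is umulhi: the high 32 bits of the 64-bit product.
    quotient = ((dividend * multiplier) >> 32) >> shift_right
    remainder = dividend - quotient * divisor
    return quotient, remainder

d, m, s = fast_divmod_create(7)
assert fast_divmod(45, d, m, s) == (6, 3)  # one multiply + shifts, no hardware divide
```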
quack/gemm_act_sm90.py ADDED
@@ -0,0 +1,368 @@
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Tuple, Optional, Callable
+ from dataclasses import dataclass
+
+ from torch import Tensor
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass.cute.nvgpu import warpgroup
+ import cutlass.utils.hopper_helpers as sm90_utils
+ from cutlass import Int32, Float32, Boolean, const_expr
+ import cutlass.torch as cutlass_torch
+
+ from quack.cute_dsl_utils import ArgumentsBase, ParamsBase
+ from quack.dense_gemm_sm90 import GemmSm90
+ from quack.cute_dsl_utils import get_max_active_clusters
+ from quack.gemm_wrapper_utils import GemmWrapperBase
+ import quack.activation
+
+
+ class GemmActSm90(GemmSm90):
+     @dataclass
+     class EpilogueArguments(ArgumentsBase):
+         mPostAct: cute.Tensor
+         act_fn: cutlass.Constexpr[Optional[Callable]] = None
+         alpha: Optional[Float32] = None
+         beta: Optional[Float32] = None
+
+     @dataclass
+     class EpilogueParams(ParamsBase):
+         tma_atom_postact: cute.CopyAtom
+         mPostAct_mnl: cute.Tensor
+         epi_postact_smem_layout_staged: cute.ComposedLayout
+         act_fn: cutlass.Constexpr[Optional[Callable]] = None
+         alpha: Optional[Float32] = None
+         beta: Optional[Float32] = None
+
+     def epi_to_underlying_arguments(
+         self, args: EpilogueArguments, *, loc=None, ip=None
+     ) -> EpilogueParams:
+         self.postact_dtype = args.mPostAct.element_type
+         self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
+
+         self.tile_shape_postact_mn = self.tile_shape_mnk[:2]
+         self.epi_tile_postact = self.epi_tile
+         postact_major_mode_size = (
+             self.epi_tile_postact[1]
+             if self.postact_layout.is_n_major_c()
+             else self.epi_tile_postact[0]
+         )
+         postact_smem_layout_atom = warpgroup.make_smem_layout_atom(
+             sm90_utils.get_smem_layout_atom(
+                 self.postact_layout, self.postact_dtype, postact_major_mode_size
+             ),
+             self.postact_dtype,
+         )
+         epi_postact_smem_layout_staged = cute.tile_to_shape(
+             postact_smem_layout_atom,
+             cute.append(self.epi_tile_postact, self.epi_stage),
+             order=(0, 1, 2),
+         )
+         tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
+             args.mPostAct,
+             epi_postact_smem_layout_staged,
+             self.epi_tile_postact,
+             store_or_load="store",
+         )
+         return GemmActSm90.EpilogueParams(
+             tma_atom_postact,
+             tma_tensor_postact,
+             epi_postact_smem_layout_staged,
+             args.act_fn,
+             args.alpha,
+             args.beta,
+         )
+
+     @staticmethod
+     def epi_smem_bytes_per_stage(
+         args: EpilogueArguments,
+         tile_shape_mnk: Tuple[int, int, int],
+         epi_tile: Tuple[int, int],
+     ) -> int:
+         postact_dtype = args.mPostAct.element_type
+         postact_bytes_per_stage = cute.size(epi_tile) * (postact_dtype.width // 8)
+         return postact_bytes_per_stage
+
+     def epi_get_smem_struct(self, params: EpilogueParams):
+         @cute.struct
+         class EpiSharedStorage:
+             sPostAct: cute.struct.Align[
+                 cute.struct.MemRange[
+                     self.postact_dtype, cute.cosize(params.epi_postact_smem_layout_staged)
+                 ],
+                 self.buffer_align_bytes,
+             ]
+
+         return EpiSharedStorage
+
+     def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
+         sPostAct = storage.epi.sPostAct.get_tensor(
+             params.epi_postact_smem_layout_staged.outer,
+             swizzle=params.epi_postact_smem_layout_staged.inner,
+         )
+         return (sPostAct,)
+
+     @cute.jit
+     def epilogue(
+         self,
+         params: EpilogueParams,
+         epi_smem_tensors: Tuple[cute.Tensor, ...],
+         epi_pipeline: cutlass.pipeline.PipelineAsync,
+         epi_read_state: cutlass.pipeline.PipelineState,
+         epi_producer_state: cutlass.pipeline.PipelineState,
+         tiled_mma: cute.TiledMma,
+         tRS_rAcc: cute.Tensor,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor],
+         tiled_copy_r2s: cute.core.ThrCopy,
+         tRS_sD: cute.Tensor,
+         tiled_copy_s2r: Optional[cute.core.ThrCopy],
+         tSR_rC: Optional[cute.Tensor],
+         tSR_sC: Optional[cute.Tensor],
+         copy_D: Optional[Callable],
+         bSG_sD: cute.Tensor,
+         bSG_gD: cute.Tensor,
+         epi_load_g2s: Optional[Callable],
+         tile_coord_mnkl: cute.Coord,
+         cu_seqlens_m: Optional[cute.Tensor],
+         epilogue_barrier: cutlass.pipeline.NamedBarrier,
+         tile_scheduler,
+         tidx: Int32,
+         is_tma_warp: Boolean,
+     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
+         has_C = const_expr(tRS_rC is not None)
+         has_D = const_expr(copy_D is not None)
+         assert cu_seqlens_m is None, "GemmActSm90 doesn't support varlen_m for now"
+
+         tma_atom_postact = params.tma_atom_postact
+         mPostAct_mnl = params.mPostAct_mnl
+         (sPostAct,) = epi_smem_tensors
+         tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
+         copy_atom_postact_r2s = sm90_utils.sm90_get_smem_store_op(
+             self.postact_layout, elem_ty_d=self.postact_dtype, elem_ty_acc=self.acc_dtype
+         )
+         tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
+         thr_copy_postact_r2s = tiled_copy_postact_r2s.get_slice(tidx)
+         tRS_sPostAct = thr_copy_postact_r2s.partition_D(sPostAct)
+         bSG_sPostAct, bSG_gPostAct = self.epilog_gmem_copy_and_partition(
+             tma_atom_postact,
+             mPostAct_mnl,
+             self.tile_shape_postact_mn,
+             self.epi_tile_postact,
+             sPostAct,
+             tile_coord_mnkl,
+             cu_seqlens_m,
+         )
+
+         # We iterate over epi tiles in the N dimension first before the M dimension
+         epi_tile_shape = cute.zipped_divide(
+             cute.make_layout(self.tile_shape_mnk[:2]), self.epi_tile
+         ).shape[1]
+         epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
+         epi_tile_num = cute.size(epi_tile_shape)
+         num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
+
+         if const_expr(epi_load_g2s is not None):
+             for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
+                 epi_producer_state = epi_load_g2s(epi_producer_state, epi_idx, is_tma_warp)
+
+         for epi_idx in cutlass.range_constexpr(epi_tile_num):
+             # Copy from acc to D registers
+             for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
+                 tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
+             if const_expr(has_C):
+                 epi_pipeline.consumer_wait(epi_read_state)
+                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
+                 # Fence to make sure shared memory read is visible to TMA load
+                 cute.arch.fence_proxy(
+                     cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+                 )
+                 cute.arch.sync_warp()
+                 with cute.arch.elect_one():
+                     epi_pipeline.consumer_release(epi_read_state)
+                 epi_read_state.advance()
+             if const_expr(epi_load_g2s is not None and epi_idx + self.epi_c_stage < epi_tile_num):
+                 epi_producer_state = epi_load_g2s(
+                     epi_producer_state, epi_idx + self.epi_c_stage, is_tma_warp
+                 )
+             tRS_rPostAct = self.epi_visit_acc_subtile(params, tRS_rD, tRS_rC)
+             epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
+             # Copy from D registers to shared memory
+             if const_expr(has_D):
+                 # Type conversion
+                 tRS_rD_out = cute.make_fragment_like(tRS_rD, self.d_dtype)
+                 tRS_rD_out.store(tRS_rD.load().to(self.d_dtype))
+                 cute.copy(tiled_copy_r2s, tRS_rD_out, tRS_sD[None, None, None, epi_buffer])
+             cute.copy(
+                 tiled_copy_postact_r2s,
+                 tiled_copy_postact_r2s.retile(tRS_rPostAct),
+                 tRS_sPostAct[None, None, None, epi_buffer],
+             )
+             # Fence and barrier to make sure shared memory store is visible to TMA store
+             cute.arch.fence_proxy(
+                 cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+             )
+             epilogue_barrier.arrive_and_wait()
+             # Get the global memory coordinate for the current epi tile
+             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
+             # Copy from shared memory to global memory
+             if is_tma_warp:
+                 if const_expr(has_D):
+                     copy_D(bSG_sD[None, epi_buffer], bSG_gD[None, gmem_coord])
+                 cute.copy(
+                     tma_atom_postact,
+                     bSG_sPostAct[None, epi_buffer],
+                     bSG_gPostAct[None, gmem_coord],
+                 )
+                 cute.arch.cp_async_bulk_commit_group()
+                 cute.arch.cp_async_bulk_wait_group(self.epi_stage - 1, read=True)
+             epilogue_barrier.arrive_and_wait()
+
+         return epi_read_state, epi_producer_state
+
+     @cute.jit
+     def epi_visit_acc_subtile(
+         self,
+         params: EpilogueParams,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor] = None,
+     ) -> Optional[cute.Tensor]:
+         # Apply alpha scaling to accumulator if alpha is provided (not None)
+         if const_expr(params.alpha is not None):
+             tRS_rD.store(tRS_rD.load() * params.alpha)
+         # Apply C with beta scaling
+         if const_expr(tRS_rC is not None):
+             if const_expr(params.beta is None):
+                 # beta is None, default behavior: add C (beta=1.0)
+                 tRS_rD.store(tRS_rD.load() + tRS_rC.load().to(tRS_rD.element_type))
+             else:
+                 tRS_rD.store(tRS_rD.load() + params.beta * tRS_rC.load().to(tRS_rD.element_type))
+         # Apply activation function if provided
+         # If we don't have .shape here, the compiler generates local stores and loads
+         if const_expr(params.act_fn is not None):
+             tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
+             for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                 tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
+         else:
+             tRS_rPostAct = tRS_rD
+         # Type conversion
+         tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
+         tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+         return tRS_rPostAct_out
+
+
+ act_fn_map = {
+     None: None,
+     "relu": quack.activation.relu,
+     "relu_sq": quack.activation.relu_sq,
+     "gelu_tanh_approx": quack.activation.gelu_tanh_approx,
+ }
+
+
+ def gemm_act_sm90(
+     A: Tensor,  # (l, m, k)
+     B: Tensor,  # (l, n, k)
+     D: Optional[Tensor],  # (l, m, n)
+     C: Optional[Tensor],  # (l, m, n)
+     PostAct: Tensor,  # (l, m, n)
+     activation: Optional[str],
+     tile_M: int,
+     tile_N: int,
+     cluster_M: int,
+     cluster_N: int,
+     pingpong: bool = False,
+     persistent: bool = True,
+     alpha: float = 1.0,
+     beta: float = 1.0,
+ ) -> None:
+     tile_count_semaphore = None
+     assert activation in act_fn_map, f"Unsupported activation {activation}"
+     L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+         A, B, D, C, additional_tensors={"PostAct": PostAct}
+     )
+     GemmWrapperBase.permute_tensors(tensor_infos)
+     GemmWrapperBase.extract_dtypes(tensor_infos)
+     major_configs = {
+         "A": ("m", "k", "l"),
+         "B": ("n", "k", "l"),
+         "D": ("m", "n", "l"),
+         "C": ("m", "n", "l"),
+         "PostAct": ("m", "n", "l"),
+     }
+     GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+     acc_dtype = cutlass.Float32
+     tile_shape_mn = (tile_M, tile_N)
+     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+     if not GemmActSm90.is_valid_dtypes(
+         tensor_infos["A"].dtype,
+         tensor_infos["B"].dtype,
+         acc_dtype,
+         tensor_infos["D"].dtype,
+         tensor_infos["A"].major,
+         tensor_infos["B"].major,
+     ):
+         raise TypeError("Skipping due to unsupported combination of types and majors")
+
+     max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+     GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+     act_fn = act_fn_map[activation]
+     epi_args = GemmActSm90.EpilogueArguments(
+         tensor_infos["PostAct"].cute_tensor,
+         act_fn,
+         alpha=Float32(alpha) if alpha != 1.0 else None,
+         beta=Float32(beta) if beta != 1.0 else None,
+     )
+     scheduler_args = GemmWrapperBase.create_scheduler_args(
+         max_active_clusters, tile_count_semaphore
+     )
+     current_stream = cutlass_torch.current_stream()
+     compile_key = GemmWrapperBase.get_compile_key(
+         tensor_infos,
+         activation,
+         tile_shape_mn,
+         cluster_shape_mnk,
+         pingpong,
+         persistent,
+         tile_count_semaphore is not None,
+         alpha != 1.0,
+         beta != 1.0,
+         key_tensor_names=("A", "B", "D", "PostAct", "C"),
+     )
+     cache = gemm_act_sm90.compile_cache
+     if compile_key not in cache:
+         gemm = GemmActSm90(
+             acc_dtype,
+             tensor_infos["A"].dtype,
+             tile_shape_mn,
+             cluster_shape_mnk,
+             pingpong=pingpong,
+             is_persistent=persistent,
+         )
+         cache[compile_key] = cute.compile(
+             gemm,
+             tensor_infos["A"].cute_tensor,
+             tensor_infos["B"].cute_tensor,
+             tensor_infos["D"].cute_tensor,
+             tensor_infos["C"].cute_tensor,
+             epi_args,
+             scheduler_args,
+             None,  # varlen_args
+             None,  # mAIdx
+             current_stream,
+         )
+     cache[compile_key](
+         tensor_infos["A"].cute_tensor,
+         tensor_infos["B"].cute_tensor,
+         tensor_infos["D"].cute_tensor,
+         tensor_infos["C"].cute_tensor,
+         epi_args,
+         scheduler_args,
+         None,
+         None,
+         current_stream,
+     )
+
+
+ gemm_act_sm90.compile_cache = {}
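The epilogue here effectively computes `PostAct = act_fn(alpha * acc + beta * C)` per subtile, with `D` optionally receiving the pre-activation value; `gemm_act_sm90` compiles one kernel per distinct key (dtypes, tile/cluster shape, pingpong, persistence, whether alpha/beta differ from 1.0) and replays it from `compile_cache` afterwards. A hypothetical call sketch, assuming an SM90 (Hopper) GPU and that bf16 with these layouts passes `is_valid_dtypes`; tile and cluster sizes below are illustrative, not tuned:

```python
import torch
from quack.gemm_act_sm90 import gemm_act_sm90

l, m, n, k = 1, 4096, 4096, 4096
A = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(l, n, k, device="cuda", dtype=torch.bfloat16)
D = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)        # pre-activation result
PostAct = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)  # act_fn applied to it

gemm_act_sm90(
    A, B, D, C=None, PostAct=PostAct,
    activation="relu",       # must be a key of act_fn_map
    tile_M=128, tile_N=192,
    cluster_M=2, cluster_N=1,
    pingpong=True,
)
```

The first call with a new key pays the `cute.compile` cost; later calls with the same shapes, dtypes, and config hit the cache.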
quack/gemm_config.py CHANGED
@@ -1,61 +1,69 @@
  # Copyright (C) 2025, Tri Dao.
  import itertools
- from typing import Optional
- from pydantic import BaseModel
+ from typing import Optional, List
+ from dataclasses import dataclass
 
 
- class GemmConfig(BaseModel, frozen=True):
-     tile_m: int = 256
-     tile_n: int = 128
+ @dataclass(frozen=True)
+ class GemmConfig:
+     tile_m: int = 128
+     tile_n: int = 192
+     pingpong: bool = True
      cluster_m: int = 2
      cluster_n: int = 1
      swap_ab: bool = False
-     pingpong: bool = False
-     raster_order: int = 2
-     max_swizzle_size: int = 1
+     # raster_order: int = 1
+     # max_swizzle_size: int = 8
 
 
  def get_all_configs(
-     epilogue: Optional[str],
-     tune_pingpong=True,
-     tune_raster_order=True,
- ) -> list[GemmConfig]:
+     epilogue: Optional[str] = None,
+     tune_coop: bool = True,
+     # tune_raster_order=True,
+ ) -> List[GemmConfig]:
      tile_n_vals = [128, 144, 160, 176, 192, 208]
-     tile_mn_vals = [(256, tile_n) for tile_n in tile_n_vals]
-     if epilogue in ["swiglu"]:
-         tile_mn_vals = [(m, n) for m, n in tile_mn_vals if n % 32 == 0]
-     cluster = [(1, 1), (1, 2), (2, 1)]
-     # cluster = [(1, 2), (2, 1)]
+     tile_mn_coop_vals = [(256, tile_n) for tile_n in tile_n_vals] + [
+         (128, 224),
+         (128, 256),
+         # (192, 256),  # Getting IOT instruction (core dumped) in the bwd
+     ]
+     tile_mn_pingpong_vals = [(128, tile_n) for tile_n in tile_n_vals] + [(192, 128)]
+     if epilogue in ["gated"]:
+         tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if n % 32 == 0 and m != 192]
+         tile_mn_pingpong_vals = [(m, n) for m, n in tile_mn_pingpong_vals if n % 32 == 0]
+     elif epilogue in ["lse"]:
+         tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if m != 192]
+     tile_mn_vals = []
+     if tune_coop:
+         tile_mn_vals += [(m, n, False) for m, n in tile_mn_coop_vals]
+     tile_mn_vals += [(m, n, True) for m, n in tile_mn_pingpong_vals]
+     cluster = [(1, 2), (2, 1)]
+     # cluster = [(1, 1), (1, 2), (2, 1)]
      if epilogue in ["lse"]:
          cluster = [(1, 2), (2, 1)]
      swap_ab_vals = [False, True]
-     if epilogue in ["lse", "swiglu"]:
+     if epilogue in ["lse", "gated"]:
          swap_ab_vals = [False]
-     pingpong_vals = [False, True] if tune_pingpong else [False]
-     raster_swizzle = (
-         [(0, 1)]
-         if not tune_raster_order
-         else [(1, 1), (1, 2), (1, 4), (1, 8), (2, 1), (2, 2), (2, 4), (2, 8)]
-     )
+     # raster_swizzle = (
+     #     [(0, 1)]
+     #     if not tune_raster_order
+     #     else [(1, 1), (1, 2), (1, 4), (1, 8), (2, 1), (2, 2), (2, 4), (2, 8)]
+     # )
      return [
          GemmConfig(
-             tile_m=tile_m if not pingpong else 128,
+             tile_m=tile_m,
              tile_n=tile_n,
+             pingpong=pingpong,
              cluster_m=cluster_m,
              cluster_n=cluster_n,
              swap_ab=swap_ab,
-             pingpong=pingpong,
-             raster_order=raster_order,
-             max_swizzle_size=max_swizzle_size,
+             # raster_order=raster_order,
+             # max_swizzle_size=max_swizzle_size,
          )
-         for (tile_m, tile_n), (cluster_m, cluster_n), swap_ab, pingpong, (
-             raster_order,
-             max_swizzle_size,
-         ) in itertools.product(
+         for (tile_m, tile_n, pingpong), (cluster_m, cluster_n), swap_ab in itertools.product(
              tile_mn_vals,
              cluster,
              swap_ab_vals,
-             pingpong_vals,
-             raster_swizzle,
+             # raster_swizzle,
          )
      ]
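`get_all_configs` now enumerates `(tile_m, tile_n, pingpong)` triples (cooperative tiles only when `tune_coop` is set, pingpong tiles always) crossed with cluster shapes and `swap_ab`; raster order and swizzle size have been dropped from the search space. A sketch of how a caller might sweep the list, where `run_and_time` is a hypothetical benchmark helper supplied by the caller:

```python
from quack.gemm_config import get_all_configs

best_cfg, best_ms = None, float("inf")
for cfg in get_all_configs(epilogue=None, tune_coop=True):
    # Each GemmConfig now carries pingpong directly instead of the caller
    # rewriting tile_m to 128 for pingpong schedules.
    ms = run_and_time(  # hypothetical: benchmarks one config, returns milliseconds
        tile_M=cfg.tile_m, tile_N=cfg.tile_n,
        cluster_M=cfg.cluster_m, cluster_N=cfg.cluster_n,
        swap_ab=cfg.swap_ab, pingpong=cfg.pingpong,
    )
    if ms < best_ms:
        best_cfg, best_ms = cfg, ms
```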
quack/gemm_dact_sm90.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Optional
+
+ from torch import Tensor
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass import const_expr
+ import cutlass.torch as cutlass_torch
+
+ from quack.gemm_act_sm90 import GemmActSm90
+ from quack.cute_dsl_utils import get_max_active_clusters
+ from quack.gemm_wrapper_utils import GemmWrapperBase
+ import quack.activation
+
+
+ class GemmDActSm90(GemmActSm90):
+     # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
+     # and return 2 arguments (dx, out)
+     EpilogueArguments = GemmActSm90.EpilogueArguments
+     EpilogueParams = GemmActSm90.EpilogueParams
+
+     @cute.jit
+     def epi_visit_acc_subtile(
+         self,
+         params: EpilogueParams,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor] = None,
+     ) -> Optional[cute.Tensor]:
+         assert tRS_rC is not None
+         tRS_rC_acc = cute.make_fragment_like(tRS_rC, self.acc_dtype)
+         tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
+         # If we don't have .shape here, the compiler generates local stores and loads
+         if const_expr(params.act_fn is not None):
+             tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
+             for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                 tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
+         else:
+             tRS_rPostAct = tRS_rC_acc
+         # Type conversion
+         tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
+         tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+         return tRS_rPostAct_out
+
+
+ dact_fn_map = {
+     None: None,
+     "relu": quack.activation.drelu,
+     "relu_sq": quack.activation.drelu_sq,
+     "gelu_tanh_approx": quack.activation.dgelu_tanh_approx,
+ }
+
+
+ def gemm_dact_sm90(
+     A: Tensor,  # (l, m, k)
+     B: Tensor,  # (l, n, k)
+     Out: Tensor,  # (l, m, n)
+     PreAct: Tensor,  # (l, m, n)
+     PostAct: Tensor,  # (l, m, n)
+     tile_count_semaphore: Optional[Tensor],  # (1,)
+     activation: Optional[str],
+     tile_M: int,
+     tile_N: int,
+     cluster_M: int,
+     cluster_N: int,
+     pingpong: bool = True,
+     persistent: bool = True,
+ ) -> None:
+     assert activation in dact_fn_map, f"Unsupported activation {activation}"
+     L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+         A, B, Out, PreAct, additional_tensors={"PostAct": PostAct}
+     )
+     GemmWrapperBase.permute_tensors(tensor_infos)
+     GemmWrapperBase.extract_dtypes(tensor_infos)
+     major_configs = {
+         "A": ("m", "k", "l"),
+         "B": ("n", "k", "l"),
+         "D": ("m", "n", "l"),
+         "C": ("m", "n", "l"),
+         "PostAct": ("m", "n", "l"),
+     }
+     GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+     acc_dtype = cutlass.Float32
+     tile_shape_mn = (tile_M, tile_N)
+     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+     if not GemmDActSm90.is_valid_dtypes(
+         tensor_infos["A"].dtype,
+         tensor_infos["B"].dtype,
+         acc_dtype,
+         tensor_infos["D"].dtype,
+         tensor_infos["A"].major,
+         tensor_infos["B"].major,
+     ):
+         raise TypeError("Skipping due to unsupported combination of types and majors")
+
+     max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+     GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+     act_fn = dact_fn_map[activation]
+     epi_args = GemmDActSm90.EpilogueArguments(tensor_infos["PostAct"].cute_tensor, act_fn)
+     scheduler_args = GemmWrapperBase.create_scheduler_args(
+         max_active_clusters, tile_count_semaphore
+     )
+     current_stream = cutlass_torch.current_stream()
+     compile_key = GemmWrapperBase.get_compile_key(
+         tensor_infos,
+         activation,
+         tile_shape_mn,
+         cluster_shape_mnk,
+         pingpong,
+         persistent,
+         tile_count_semaphore is not None,
+         key_tensor_names=("A", "B", "D", "PostAct", "C"),
+     )
+     cache = gemm_dact_sm90.compile_cache
+     if compile_key not in cache:
+         gemm = GemmDActSm90(
+             acc_dtype,
+             tensor_infos["A"].dtype,
+             tile_shape_mn,
+             cluster_shape_mnk,
+             pingpong=pingpong,
+             is_persistent=persistent,
+         )
+         cache[compile_key] = cute.compile(
+             gemm,
+             tensor_infos["A"].cute_tensor,
+             tensor_infos["B"].cute_tensor,
+             tensor_infos["D"].cute_tensor,
+             tensor_infos["C"].cute_tensor,
+             epi_args,
+             scheduler_args,
+             None,  # varlen_args
+             None,  # mAIdx
+             current_stream,
+         )
+     cache[compile_key](
+         tensor_infos["A"].cute_tensor,
+         tensor_infos["B"].cute_tensor,
+         tensor_infos["D"].cute_tensor,
+         tensor_infos["C"].cute_tensor,
+         epi_args,
+         scheduler_args,
+         None,
+         None,
+         current_stream,
+     )
+
+
+ gemm_dact_sm90.compile_cache = {}
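Per the class comment, the backward activation here is a two-in/two-out function, `act_fn(x, dout) -> (dx, out)`: `PreAct` is bound as `C` (the saved pre-activation `x`), the GEMM accumulator plays the role of `dout`, `Out` receives `dx`, and `PostAct` receives the recomputed `out`. A plain-PyTorch reference for the ReLU case (a hedged sketch of the contract; `quack.activation.drelu` is the fused element-wise device version):

```python
import torch

def drelu_reference(preact: torch.Tensor, dout: torch.Tensor):
    # out: recomputed forward activation; dx: upstream gradient gated by the ReLU mask
    out = torch.relu(preact)
    dx = torch.where(preact > 0, dout, torch.zeros_like(dout))
    return dx, out
```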