quack-kernels 0.1.10__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/fast_math.py ADDED
@@ -0,0 +1,80 @@
+ # Copyright (c) 2025, Tri Dao.
+
+ from typing import Tuple
+ from dataclasses import dataclass
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass import Int32, Uint32
+ from cutlass.cutlass_dsl import T, dsl_user_op
+ from cutlass._mlir.dialects import llvm
+
+ from quack.cute_dsl_utils import ParamsBase
+
+
+ @cute.jit
+ def clz(x: Int32) -> Int32:
+     # for i in cutlass.range_constexpr(32):
+     #     if (1 << (31 - i)) & x:
+     #         return Int32(i)
+     # return Int32(32)
+     # Early exit is not supported yet
+     res = Int32(32)
+     done = False
+     for i in cutlass.range(32):
+         if ((1 << (31 - i)) & x) and not done:
+             res = Int32(i)
+             done = True
+     return res
+
+
+ def find_log2(x: Int32) -> Int32:
+     a: Int32 = Int32(31 - clz(x))
+     return a + ((x & (x - 1)) != 0)  # Round up, add 1 if not a power of 2.
+
+
+ @dsl_user_op
+ def umulhi(a: Int32, b: Int32, *, loc=None, ip=None) -> Uint32:
+     return Uint32(
+         llvm.inline_asm(
+             T.i32(),
+             [Int32(a).ir_value(loc=loc, ip=ip), Int32(b).ir_value(loc=loc, ip=ip)],
+             "mul.hi.u32 $0, $1, $2;",
+             "=r,r,r",
+             has_side_effects=False,
+             is_align_stack=False,
+             asm_dialect=llvm.AsmDialect.AD_ATT,
+         )
+     )
+
+
+ @dataclass
+ class FastDivmod(ParamsBase):
+     divisor: Int32
+     multiplier: Uint32
+     shift_right: Uint32
+
+     # called by host
+     @staticmethod
+     def create(divisor: Int32) -> "FastDivmod":
+         """Construct the FastDivmod object, in host code.
+         This precomputes some values based on the divisor and is computationally expensive.
+         """
+         p = Uint32(31 + find_log2(divisor))
+         divisor_u32 = Uint32(divisor)
+         multiplier = Uint32(((cutlass.Uint64(1) << p) + divisor_u32 - 1) // divisor_u32)
+         shift_right = Uint32(p - 32)
+         return FastDivmod(divisor, multiplier, shift_right)
+
+     @cute.jit
+     def div(self, dividend: Int32) -> Int32:
+         return (
+             Int32(umulhi(dividend, self.multiplier) >> self.shift_right)
+             if self.divisor != 1
+             else dividend
+         )
+
+     def divmod(self, dividend: Int32) -> Tuple[Int32, Int32]:
+         quotient = self.div(dividend)
+         remainder = dividend - quotient * self.divisor
+         return quotient, remainder
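
FastDivmod is the classic magic-number division: the host precomputes m = ceil(2^p / d) with p = 31 + ceil(log2(d)), and the device computes the quotient as (x * m) >> p, where the ">> 32" part of the shift comes from mul.hi.u32 and the remaining (p - 32) bits from shift_right. The divisor == 1 case is special-cased in div() because shift_right would otherwise underflow. Below is a minimal pure-Python sketch of the same arithmetic for divisor >= 2, outside the CuTe DSL; the names here are illustrative and not part of the package.

def fast_divmod_ref(dividend: int, divisor: int) -> tuple[int, int]:
    """Reference for the magic-number division used by FastDivmod (divisor >= 2)."""
    log2_ceil = (divisor - 1).bit_length()  # same value as find_log2 for divisor >= 2
    p = 31 + log2_ceil
    multiplier = ((1 << p) + divisor - 1) // divisor  # ceil(2^p / divisor), fits in 32 bits
    # umulhi(x, m) >> (p - 32) equals (x * m) >> p in unbounded integer arithmetic
    quotient = (dividend * multiplier) >> p
    return quotient, dividend - quotient * divisor

for d in (2, 3, 7, 48, 1000):
    for x in (0, 1, d - 1, d, d + 1, 123456789, 2**31 - 1):
        assert fast_divmod_ref(x, d) == (x // d, x % d)
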
quack/gemm_act_sm90.py ADDED
@@ -0,0 +1,368 @@
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Tuple, Optional, Callable
+ from dataclasses import dataclass
+
+ from torch import Tensor
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass.cute.nvgpu import warpgroup
+ import cutlass.utils.hopper_helpers as sm90_utils
+ from cutlass import Int32, Float32, Boolean, const_expr
+ import cutlass.torch as cutlass_torch
+
+ from quack.cute_dsl_utils import ArgumentsBase, ParamsBase
+ from quack.dense_gemm_sm90 import GemmSm90
+ from quack.cute_dsl_utils import get_max_active_clusters
+ from quack.gemm_wrapper_utils import GemmWrapperBase
+ import quack.activation
+
+
+ class GemmActSm90(GemmSm90):
+     @dataclass
+     class EpilogueArguments(ArgumentsBase):
+         mPostAct: cute.Tensor
+         act_fn: cutlass.Constexpr[Optional[Callable]] = None
+         alpha: Optional[Float32] = None
+         beta: Optional[Float32] = None
+
+     @dataclass
+     class EpilogueParams(ParamsBase):
+         tma_atom_postact: cute.CopyAtom
+         mPostAct_mnl: cute.Tensor
+         epi_postact_smem_layout_staged: cute.ComposedLayout
+         act_fn: cutlass.Constexpr[Optional[Callable]] = None
+         alpha: Optional[Float32] = None
+         beta: Optional[Float32] = None
+
+     def epi_to_underlying_arguments(
+         self, args: EpilogueArguments, *, loc=None, ip=None
+     ) -> EpilogueParams:
+         self.postact_dtype = args.mPostAct.element_type
+         self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
+
+         self.tile_shape_postact_mn = self.tile_shape_mnk[:2]
+         self.epi_tile_postact = self.epi_tile
+         postact_major_mode_size = (
+             self.epi_tile_postact[1]
+             if self.postact_layout.is_n_major_c()
+             else self.epi_tile_postact[0]
+         )
+         postact_smem_layout_atom = warpgroup.make_smem_layout_atom(
+             sm90_utils.get_smem_layout_atom(
+                 self.postact_layout, self.postact_dtype, postact_major_mode_size
+             ),
+             self.postact_dtype,
+         )
+         epi_postact_smem_layout_staged = cute.tile_to_shape(
+             postact_smem_layout_atom,
+             cute.append(self.epi_tile_postact, self.epi_stage),
+             order=(0, 1, 2),
+         )
+         tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
+             args.mPostAct,
+             epi_postact_smem_layout_staged,
+             self.epi_tile_postact,
+             store_or_load="store",
+         )
+         return GemmActSm90.EpilogueParams(
+             tma_atom_postact,
+             tma_tensor_postact,
+             epi_postact_smem_layout_staged,
+             args.act_fn,
+             args.alpha,
+             args.beta,
+         )
+
+     @staticmethod
+     def epi_smem_bytes_per_stage(
+         args: EpilogueArguments,
+         tile_shape_mnk: Tuple[int, int, int],
+         epi_tile: Tuple[int, int],
+     ) -> int:
+         postact_dtype = args.mPostAct.element_type
+         postact_bytes_per_stage = cute.size(epi_tile) * (postact_dtype.width // 8)
+         return postact_bytes_per_stage
+
+     def epi_get_smem_struct(self, params: EpilogueParams):
+         @cute.struct
+         class EpiSharedStorage:
+             sPostAct: cute.struct.Align[
+                 cute.struct.MemRange[
+                     self.postact_dtype, cute.cosize(params.epi_postact_smem_layout_staged)
+                 ],
+                 self.buffer_align_bytes,
+             ]
+
+         return EpiSharedStorage
+
+     def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
+         sPostAct = storage.epi.sPostAct.get_tensor(
+             params.epi_postact_smem_layout_staged.outer,
+             swizzle=params.epi_postact_smem_layout_staged.inner,
+         )
+         return (sPostAct,)
+
+     @cute.jit
+     def epilogue(
+         self,
+         params: EpilogueParams,
+         epi_smem_tensors: Tuple[cute.Tensor, ...],
+         epi_pipeline: cutlass.pipeline.PipelineAsync,
+         epi_read_state: cutlass.pipeline.PipelineState,
+         epi_producer_state: cutlass.pipeline.PipelineState,
+         tiled_mma: cute.TiledMma,
+         tRS_rAcc: cute.Tensor,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor],
+         tiled_copy_r2s: cute.core.ThrCopy,
+         tRS_sD: cute.Tensor,
+         tiled_copy_s2r: Optional[cute.core.ThrCopy],
+         tSR_rC: Optional[cute.Tensor],
+         tSR_sC: Optional[cute.Tensor],
+         copy_D: Optional[Callable],
+         bSG_sD: cute.Tensor,
+         bSG_gD: cute.Tensor,
+         epi_load_g2s: Optional[Callable],
+         tile_coord_mnkl: cute.Coord,
+         cu_seqlens_m: Optional[cute.Tensor],
+         epilogue_barrier: cutlass.pipeline.NamedBarrier,
+         tile_scheduler,
+         tidx: Int32,
+         is_tma_warp: Boolean,
+     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
+         has_C = const_expr(tRS_rC is not None)
+         has_D = const_expr(copy_D is not None)
+         assert cu_seqlens_m is None, "GemmActSm90 doesn't support varlen_m for now"
+
+         tma_atom_postact = params.tma_atom_postact
+         mPostAct_mnl = params.mPostAct_mnl
+         (sPostAct,) = epi_smem_tensors
+         tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
+         copy_atom_postact_r2s = sm90_utils.sm90_get_smem_store_op(
+             self.postact_layout, elem_ty_d=self.postact_dtype, elem_ty_acc=self.acc_dtype
+         )
+         tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
+         thr_copy_postact_r2s = tiled_copy_postact_r2s.get_slice(tidx)
+         tRS_sPostAct = thr_copy_postact_r2s.partition_D(sPostAct)
+         bSG_sPostAct, bSG_gPostAct = self.epilog_gmem_copy_and_partition(
+             tma_atom_postact,
+             mPostAct_mnl,
+             self.tile_shape_postact_mn,
+             self.epi_tile_postact,
+             sPostAct,
+             tile_coord_mnkl,
+             cu_seqlens_m,
+         )
+
+         # We iterate over epi tiles in the N dimension first before the M dimension
+         epi_tile_shape = cute.zipped_divide(
+             cute.make_layout(self.tile_shape_mnk[:2]), self.epi_tile
+         ).shape[1]
+         epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
+         epi_tile_num = cute.size(epi_tile_shape)
+         num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
+
+         if const_expr(epi_load_g2s is not None):
+             for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
+                 epi_producer_state = epi_load_g2s(epi_producer_state, epi_idx, is_tma_warp)
+
+         for epi_idx in cutlass.range_constexpr(epi_tile_num):
+             # Copy from acc to D registers
+             for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
+                 tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
+             if const_expr(has_C):
+                 epi_pipeline.consumer_wait(epi_read_state)
+                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
+                 # Fence to make sure shared memory read is visible to TMA load
+                 cute.arch.fence_proxy(
+                     cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+                 )
+                 cute.arch.sync_warp()
+                 with cute.arch.elect_one():
+                     epi_pipeline.consumer_release(epi_read_state)
+                 epi_read_state.advance()
+             if const_expr(epi_load_g2s is not None and epi_idx + self.epi_c_stage < epi_tile_num):
+                 epi_producer_state = epi_load_g2s(
+                     epi_producer_state, epi_idx + self.epi_c_stage, is_tma_warp
+                 )
+             tRS_rPostAct = self.epi_visit_acc_subtile(params, tRS_rD, tRS_rC)
+             epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
+             # Copy from D registers to shared memory
+             if const_expr(has_D):
+                 # Type conversion
+                 tRS_rD_out = cute.make_fragment_like(tRS_rD, self.d_dtype)
+                 tRS_rD_out.store(tRS_rD.load().to(self.d_dtype))
+                 cute.copy(tiled_copy_r2s, tRS_rD_out, tRS_sD[None, None, None, epi_buffer])
+             cute.copy(
+                 tiled_copy_postact_r2s,
+                 tiled_copy_postact_r2s.retile(tRS_rPostAct),
+                 tRS_sPostAct[None, None, None, epi_buffer],
+             )
+             # Fence and barrier to make sure shared memory store is visible to TMA store
+             cute.arch.fence_proxy(
+                 cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+             )
+             epilogue_barrier.arrive_and_wait()
+             # Get the global memory coordinate for the current epi tile
+             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
+             # Copy from shared memory to global memory
+             if is_tma_warp:
+                 if const_expr(has_D):
+                     copy_D(bSG_sD[None, epi_buffer], bSG_gD[None, gmem_coord])
+                 cute.copy(
+                     tma_atom_postact,
+                     bSG_sPostAct[None, epi_buffer],
+                     bSG_gPostAct[None, gmem_coord],
+                 )
+                 cute.arch.cp_async_bulk_commit_group()
+                 cute.arch.cp_async_bulk_wait_group(self.epi_stage - 1, read=True)
+             epilogue_barrier.arrive_and_wait()
+
+         return epi_read_state, epi_producer_state
+
+     @cute.jit
+     def epi_visit_acc_subtile(
+         self,
+         params: EpilogueParams,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor] = None,
+     ) -> Optional[cute.Tensor]:
+         # Apply alpha scaling to accumulator if alpha is provided (not None)
+         if const_expr(params.alpha is not None):
+             tRS_rD.store(tRS_rD.load() * params.alpha)
+         # Apply C with beta scaling
+         if const_expr(tRS_rC is not None):
+             if const_expr(params.beta is None):
+                 # beta is None, default behavior: add C (beta=1.0)
+                 tRS_rD.store(tRS_rD.load() + tRS_rC.load().to(tRS_rD.element_type))
+             else:
+                 tRS_rD.store(tRS_rD.load() + params.beta * tRS_rC.load().to(tRS_rD.element_type))
+         # Apply activation function if provided
+         # If we don't have .shape here, the compiler generates local stores and loads
+         if const_expr(params.act_fn is not None):
+             tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
+             for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                 tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
+         else:
+             tRS_rPostAct = tRS_rD
+         # Type conversion
+         tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
+         tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+         return tRS_rPostAct_out
+
+
+ act_fn_map = {
+     None: None,
+     "relu": quack.activation.relu,
+     "relu_sq": quack.activation.relu_sq,
+     "gelu_tanh_approx": quack.activation.gelu_tanh_approx,
+ }
+
+
+ def gemm_act_sm90(
+     A: Tensor,  # (l, m, k)
+     B: Tensor,  # (l, n, k)
+     D: Optional[Tensor],  # (l, m, n)
+     C: Optional[Tensor],  # (l, m, n)
+     PostAct: Tensor,  # (l, m, n)
+     activation: Optional[str],
+     tile_M: int,
+     tile_N: int,
+     cluster_M: int,
+     cluster_N: int,
+     pingpong: bool = False,
+     persistent: bool = True,
+     alpha: float = 1.0,
+     beta: float = 1.0,
+ ) -> None:
+     tile_count_semaphore = None
+     assert activation in act_fn_map, f"Unsupported activation {activation}"
+     L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+         A, B, D, C, additional_tensors={"PostAct": PostAct}
+     )
+     GemmWrapperBase.permute_tensors(tensor_infos)
+     GemmWrapperBase.extract_dtypes(tensor_infos)
+     major_configs = {
+         "A": ("m", "k", "l"),
+         "B": ("n", "k", "l"),
+         "D": ("m", "n", "l"),
+         "C": ("m", "n", "l"),
+         "PostAct": ("m", "n", "l"),
+     }
+     GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+     acc_dtype = cutlass.Float32
+     tile_shape_mn = (tile_M, tile_N)
+     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+     if not GemmActSm90.is_valid_dtypes(
+         tensor_infos["A"].dtype,
+         tensor_infos["B"].dtype,
+         acc_dtype,
+         tensor_infos["D"].dtype,
+         tensor_infos["A"].major,
+         tensor_infos["B"].major,
+     ):
+         raise TypeError("Skipping due to unsupported combination of types and majors")
+
+     max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+     GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+     act_fn = act_fn_map[activation]
+     epi_args = GemmActSm90.EpilogueArguments(
+         tensor_infos["PostAct"].cute_tensor,
+         act_fn,
+         alpha=Float32(alpha) if alpha != 1.0 else None,
+         beta=Float32(beta) if beta != 1.0 else None,
+     )
+     scheduler_args = GemmWrapperBase.create_scheduler_args(
+         max_active_clusters, tile_count_semaphore
+     )
+     current_stream = cutlass_torch.current_stream()
+     compile_key = GemmWrapperBase.get_compile_key(
+         tensor_infos,
+         activation,
+         tile_shape_mn,
+         cluster_shape_mnk,
+         pingpong,
+         persistent,
+         tile_count_semaphore is not None,
+         alpha != 1.0,
+         beta != 1.0,
+         key_tensor_names=("A", "B", "D", "PostAct", "C"),
+     )
+     cache = gemm_act_sm90.compile_cache
+     if compile_key not in cache:
+         gemm = GemmActSm90(
+             acc_dtype,
+             tensor_infos["A"].dtype,
+             tile_shape_mn,
+             cluster_shape_mnk,
+             pingpong=pingpong,
+             is_persistent=persistent,
+         )
+         cache[compile_key] = cute.compile(
+             gemm,
+             tensor_infos["A"].cute_tensor,
+             tensor_infos["B"].cute_tensor,
+             tensor_infos["D"].cute_tensor,
+             tensor_infos["C"].cute_tensor,
+             epi_args,
+             scheduler_args,
+             None,  # varlen_args
+             None,  # mAIdx
+             current_stream,
+         )
+     cache[compile_key](
+         tensor_infos["A"].cute_tensor,
+         tensor_infos["B"].cute_tensor,
+         tensor_infos["D"].cute_tensor,
+         tensor_infos["C"].cute_tensor,
+         epi_args,
+         scheduler_args,
+         None,
+         None,
+         current_stream,
+     )
+
+
+ gemm_act_sm90.compile_cache = {}
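
A rough sketch of how this wrapper is meant to be called. This is illustrative only: it assumes an SM90 (Hopper) GPU, assumes bf16 A/B with fp32 accumulation passes is_valid_dtypes, and uses an arbitrary rather than tuned tile/cluster shape.

import torch
from quack.gemm_act_sm90 import gemm_act_sm90

l, m, n, k = 1, 4096, 4096, 4096
A = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(l, n, k, device="cuda", dtype=torch.bfloat16)
D = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)        # GEMM output (pre-activation)
PostAct = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)  # activation applied to D

# Per batch index l: D is the GEMM of A against B (k contracted), PostAct = relu(D); C is omitted.
gemm_act_sm90(
    A, B, D, None, PostAct, "relu",
    tile_M=128, tile_N=192, cluster_M=2, cluster_N=1,
    pingpong=True, persistent=True,
)
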
quack/gemm_config.py ADDED
@@ -0,0 +1,69 @@
+ # Copyright (c) 2025, Tri Dao.
+ import itertools
+ from typing import Optional, List
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True)
+ class GemmConfig:
+     tile_m: int = 128
+     tile_n: int = 192
+     pingpong: bool = True
+     cluster_m: int = 2
+     cluster_n: int = 1
+     swap_ab: bool = False
+     # raster_order: int = 1
+     # max_swizzle_size: int = 8
+
+
+ def get_all_configs(
+     epilogue: Optional[str] = None,
+     tune_coop: bool = True,
+     # tune_raster_order=True,
+ ) -> List[GemmConfig]:
+     tile_n_vals = [128, 144, 160, 176, 192, 208]
+     tile_mn_coop_vals = [(256, tile_n) for tile_n in tile_n_vals] + [
+         (128, 224),
+         (128, 256),
+         # (192, 256),  # Getting IOT instruction (core dumped) in the bwd
+     ]
+     tile_mn_pingpong_vals = [(128, tile_n) for tile_n in tile_n_vals] + [(192, 128)]
+     if epilogue in ["gated"]:
+         tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if n % 32 == 0 and m != 192]
+         tile_mn_pingpong_vals = [(m, n) for m, n in tile_mn_pingpong_vals if n % 32 == 0]
+     elif epilogue in ["lse"]:
+         tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if m != 192]
+     tile_mn_vals = []
+     if tune_coop:
+         tile_mn_vals += [(m, n, False) for m, n in tile_mn_coop_vals]
+     tile_mn_vals += [(m, n, True) for m, n in tile_mn_pingpong_vals]
+     cluster = [(1, 2), (2, 1)]
+     # cluster = [(1, 1), (1, 2), (2, 1)]
+     if epilogue in ["lse"]:
+         cluster = [(1, 2), (2, 1)]
+     swap_ab_vals = [False, True]
+     if epilogue in ["lse", "gated"]:
+         swap_ab_vals = [False]
+     # raster_swizzle = (
+     #     [(0, 1)]
+     #     if not tune_raster_order
+     #     else [(1, 1), (1, 2), (1, 4), (1, 8), (2, 1), (2, 2), (2, 4), (2, 8)]
+     # )
+     return [
+         GemmConfig(
+             tile_m=tile_m,
+             tile_n=tile_n,
+             pingpong=pingpong,
+             cluster_m=cluster_m,
+             cluster_n=cluster_n,
+             swap_ab=swap_ab,
+             # raster_order=raster_order,
+             # max_swizzle_size=max_swizzle_size,
+         )
+         for (tile_m, tile_n, pingpong), (cluster_m, cluster_n), swap_ab in itertools.product(
+             tile_mn_vals,
+             cluster,
+             swap_ab_vals,
+             # raster_swizzle,
+         )
+     ]
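
get_all_configs simply enumerates the cross product of tile shapes, cooperative vs. pingpong scheduling, cluster shapes, and swap_ab. A sketch of how the sweep might feed an autotuning loop; run_with_config is a hypothetical user-supplied benchmark callable, not part of the package:

from quack.gemm_config import get_all_configs

def autotune(run_with_config, epilogue=None):
    """Return the fastest GemmConfig; run_with_config(cfg) runs the kernel with
    cfg (tile_m, tile_n, pingpong, cluster_m, cluster_n, swap_ab) and returns
    a time in milliseconds."""
    best_cfg, best_ms = None, float("inf")
    for cfg in get_all_configs(epilogue=epilogue, tune_coop=True):
        ms = run_with_config(cfg)
        if ms < best_ms:
            best_cfg, best_ms = cfg, ms
    return best_cfg
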
quack/gemm_dact_sm90.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Optional
+
+ from torch import Tensor
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass import const_expr
+ import cutlass.torch as cutlass_torch
+
+ from quack.gemm_act_sm90 import GemmActSm90
+ from quack.cute_dsl_utils import get_max_active_clusters
+ from quack.gemm_wrapper_utils import GemmWrapperBase
+ import quack.activation
+
+
+ class GemmDActSm90(GemmActSm90):
+     # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
+     # and return 2 arguments (dx, out)
+     EpilogueArguments = GemmActSm90.EpilogueArguments
+     EpilogueParams = GemmActSm90.EpilogueParams
+
+     @cute.jit
+     def epi_visit_acc_subtile(
+         self,
+         params: EpilogueParams,
+         tRS_rD: cute.Tensor,
+         tRS_rC: Optional[cute.Tensor] = None,
+     ) -> Optional[cute.Tensor]:
+         assert tRS_rC is not None
+         tRS_rC_acc = cute.make_fragment_like(tRS_rC, self.acc_dtype)
+         tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
+         # If we don't have .shape here, the compiler generates local stores and loads
+         if const_expr(params.act_fn is not None):
+             tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
+             for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                 tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
+         else:
+             tRS_rPostAct = tRS_rC_acc
+         # Type conversion
+         tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
+         tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+         return tRS_rPostAct_out
+
+
+ dact_fn_map = {
+     None: None,
+     "relu": quack.activation.drelu,
+     "relu_sq": quack.activation.drelu_sq,
+     "gelu_tanh_approx": quack.activation.dgelu_tanh_approx,
+ }
+
+
+ def gemm_dact_sm90(
+     A: Tensor,  # (l, m, k)
+     B: Tensor,  # (l, n, k)
+     Out: Tensor,  # (l, m, n)
+     PreAct: Tensor,  # (l, m, n)
+     PostAct: Tensor,  # (l, m, n)
+     tile_count_semaphore: Optional[Tensor],  # (1,)
+     activation: Optional[str],
+     tile_M: int,
+     tile_N: int,
+     cluster_M: int,
+     cluster_N: int,
+     pingpong: bool = True,
+     persistent: bool = True,
+ ) -> None:
+     assert activation in dact_fn_map, f"Unsupported activation {activation}"
+     L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+         A, B, Out, PreAct, additional_tensors={"PostAct": PostAct}
+     )
+     GemmWrapperBase.permute_tensors(tensor_infos)
+     GemmWrapperBase.extract_dtypes(tensor_infos)
+     major_configs = {
+         "A": ("m", "k", "l"),
+         "B": ("n", "k", "l"),
+         "D": ("m", "n", "l"),
+         "C": ("m", "n", "l"),
+         "PostAct": ("m", "n", "l"),
+     }
+     GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+     acc_dtype = cutlass.Float32
+     tile_shape_mn = (tile_M, tile_N)
+     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+     if not GemmDActSm90.is_valid_dtypes(
+         tensor_infos["A"].dtype,
+         tensor_infos["B"].dtype,
+         acc_dtype,
+         tensor_infos["D"].dtype,
+         tensor_infos["A"].major,
+         tensor_infos["B"].major,
+     ):
+         raise TypeError("Skipping due to unsupported combination of types and majors")
+
+     max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+     GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+     act_fn = dact_fn_map[activation]
+     epi_args = GemmDActSm90.EpilogueArguments(tensor_infos["PostAct"].cute_tensor, act_fn)
+     scheduler_args = GemmWrapperBase.create_scheduler_args(
+         max_active_clusters, tile_count_semaphore
+     )
+     current_stream = cutlass_torch.current_stream()
+     compile_key = GemmWrapperBase.get_compile_key(
+         tensor_infos,
+         activation,
+         tile_shape_mn,
+         cluster_shape_mnk,
+         pingpong,
+         persistent,
+         tile_count_semaphore is not None,
+         key_tensor_names=("A", "B", "D", "PostAct", "C"),
+     )
+     cache = gemm_dact_sm90.compile_cache
+     if compile_key not in cache:
+         gemm = GemmDActSm90(
+             acc_dtype,
+             tensor_infos["A"].dtype,
+             tile_shape_mn,
+             cluster_shape_mnk,
+             pingpong=pingpong,
+             is_persistent=persistent,
+         )
+         cache[compile_key] = cute.compile(
+             gemm,
+             tensor_infos["A"].cute_tensor,
+             tensor_infos["B"].cute_tensor,
+             tensor_infos["D"].cute_tensor,
+             tensor_infos["C"].cute_tensor,
+             epi_args,
+             scheduler_args,
+             None,  # varlen_args
+             None,  # mAIdx
+             current_stream,
+         )
+     cache[compile_key](
+         tensor_infos["A"].cute_tensor,
+         tensor_infos["B"].cute_tensor,
+         tensor_infos["D"].cute_tensor,
+         tensor_infos["C"].cute_tensor,
+         epi_args,
+         scheduler_args,
+         None,
+         None,
+         current_stream,
+     )
+
+
+ gemm_dact_sm90.compile_cache = {}
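
A sketch of the intended backward-pass call, mirroring the forward wrapper: per the class comment, the epilogue feeds (PreAct, accumulator) through the activation-backward function and writes the gradient into Out and the recomputed activation into PostAct. Everything below is illustrative only (module name per the header above; an SM90 GPU and supported dtypes are assumed; the dOut/W naming reflects one typical backward use of a generic (l, m, k) x (l, n, k) GEMM, not a requirement of the API):

import torch
from quack.gemm_dact_sm90 import gemm_dact_sm90

l, m, n, k = 1, 4096, 4096, 4096
dOut = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)     # incoming gradient
W = torch.randn(l, n, k, device="cuda", dtype=torch.bfloat16)
PreAct = torch.randn(l, m, n, device="cuda", dtype=torch.bfloat16)   # saved pre-activation
dPreAct = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)  # Out: gradient w.r.t. PreAct
PostAct = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)  # recomputed activation output

gemm_dact_sm90(
    dOut, W, dPreAct, PreAct, PostAct, None, "relu",
    tile_M=128, tile_N=192, cluster_M=2, cluster_N=1,
    pingpong=True, persistent=True,
)
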