PyPI - quack-kernels - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

quack/__init__.py +1 -8
quack/activation.py +366 -121
quack/autotuner.py +64 -5
quack/broadcast_utils.py +29 -0
quack/compile_utils.py +19 -0
quack/copy_utils.py +487 -0
quack/cross_entropy.py +157 -233
quack/cute_dsl_utils.py +20 -35
quack/gemm.py +194 -0
quack/gemm_act.py +510 -0
quack/gemm_config.py +72 -46
quack/gemm_dact.py +215 -0
quack/gemm_default_epi.py +259 -0
quack/gemm_interface.py +615 -146
quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
quack/gemm_symmetric.py +330 -0
quack/gemm_wrapper_utils.py +182 -23
quack/layout_utils.py +287 -0
quack/linear.py +24 -16
quack/pipeline.py +158 -3
quack/reduce.py +88 -49
quack/reduction_base.py +25 -36
quack/rmsnorm.py +508 -624
quack/sm100_utils.py +62 -0
quack/sm90_utils.py +127 -0
quack/softmax.py +135 -203
quack/sort/bitonic_sort.py +13 -10
quack/sort/utils.py +6 -6
quack/tile_scheduler.py +55 -61
quack/topk.py +409 -85
quack/utils.py +37 -172
quack/varlen_utils.py +370 -6
{quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
quack_kernels-0.2.3.dist-info/RECORD +44 -0
quack/gemm_act_sm90.py +0 -368
quack/gemm_dact_sm90.py +0 -150
quack/layernorm.py +0 -353
quack/symmetric_dense_gemm_sm90.py +0 -2091
quack_kernels-0.2.1.dist-info/RECORD +0 -37
{quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
{quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0

quack_kernels-0.2.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,44 @@
+quack/__init__.py,sha256=iM_lvTpHS-Yxfxm8YP4MMfuP9esJpxI8karP2Dw7sFg,203
+quack/activation.py,sha256=-lZgojraqdyLjOzgOXBehoVeRBhBq30UX7kOkXsCpGI,20855
+quack/autotuner.py,sha256=atw0ntedi22RPwSdjWOoge4S56S8VFvRocJQcYhpAlo,13454
+quack/broadcast_utils.py,sha256=X5vWg2RtIIWU9Z7nEUW6m0EP0Cfd9XtCKxp4tSyp4Mg,1283
+quack/compile_utils.py,sha256=qJ3oTsDlbAiddrJHtEO7LPYVqn_s-neNfiw-_KvfXZU,591
+quack/copy_utils.py,sha256=J1Hcw18iNHHpOP2wNFhF8Lz16NEmXtoQMu59mmLrRCs,18761
+quack/cross_entropy.py,sha256=w6fjHC_vXt5ji2KfoLrSOdAvpLrQszrYU9rmRij2yY8,24899
+quack/cute_dsl_utils.py,sha256=4uQx5aYDG9UvVzbWwJTjjJLrnoympz70_CD8b37FQWo,3854
+quack/fast_math.py,sha256=E1XUqfUt0_n9BPZNggF-UDzZ6anso9bYUrwqafemWvQ,2297
+quack/gemm.py,sha256=8V23MPq49QbV3csv-_AxjfE9qf8R3NIqFK9Q9db6t2c,7417
+quack/gemm_act.py,sha256=Y8HJKfw3tCoFKecwhwhd5xpXd9jCQCGZT_V2xXf-CnU,20823
+quack/gemm_config.py,sha256=94o3g9x7H0wi7aBbsb7H67H8nSzTurwL2zgvKDtQUas,3575
+quack/gemm_dact.py,sha256=l__UhCrFbPjD9a1TAVgP7_C7p5lLfX5DkRcM6z0ofOw,7789
+quack/gemm_default_epi.py,sha256=6qO8Ovtcw8sQQ_kXTBTTQ5IHh1lS6RBCGZG0lgLHNrs,11916
+quack/gemm_interface.py,sha256=AF5PYTNgEHjb3MNXcNvvEpOcShAHtak0Xu12l1zrOAw,44804
+quack/gemm_sm100.py,sha256=U9jmzpST_d1W6CBFf1ZHhTtr0K8hENCsUz7dXvHaMZc,122344
+quack/gemm_sm90.py,sha256=u-Q3fN6DPm1fEdz0LcMecMbGTBcRunUCWopufwO8cHU,92015
+quack/gemm_symmetric.py,sha256=mqx7wgOCY6Dh9hjL6gR9PBstMD476GhpA_NkGeaEtik,13349
+quack/gemm_wrapper_utils.py,sha256=EaPyR3Lq19z_RkdB2_xxRj0IPSJMgyfpkrTXyvY3B6M,12775
+quack/layout_utils.py,sha256=QjFFlvDcLiyGGfA2FKWKI75twHIkOJ2AotE0cIpBAlI,11923
+quack/linear.py,sha256=mhN2A98w7H7X4MS63XCCK3gpOm1eS8H7a4WO9ovkt5U,9791
+quack/linear_cross_entropy.py,sha256=Zhy_gdMsKHOie-jntBaqIuiDJtkiq6qEBwnyuWwIRw4,10092
+quack/mlp.py,sha256=YjdwQRwEePA9KyidFXp5H1-lxiJc8dZ41vl8Fv8pgss,2259
+quack/pipeline.py,sha256=mMdIlpUaHdRDOkvQzgKdCdJydJq6C2eYrny5Bui4KFs,11311
+quack/reduce.py,sha256=ySKT2xh1_pIlbJX29BPmwH6yJ7MxIrRZyxHIPPYVpm0,12698
+quack/reduction_base.py,sha256=QqlPs5L2VCxwDrO4CHPq-KY6f_BAYRbvsR6k81LPzTU,3180
+quack/rmsnorm.py,sha256=esy18s5JtT7KBPRPhWf_anLRTrtromwqeJmg2yzOm60,44678
+quack/sm100_utils.py,sha256=-p5qj3Wi9n4WDLy2sl-fApYpGp5rH3JvZQb712OTxPs,1901
+quack/sm90_utils.py,sha256=hg8qq7S8NODZlUSaxNpdZcsnxcR0jM921rMn1VmBo7o,4278
+quack/softmax.py,sha256=ZqeVbnGfzwkro1LfWBHagbS7B7ug7b9SLZWuGx_Y3Kc,14367
+quack/tensormap_manager.py,sha256=Ts3Mxp0_es2RNA0ffvUjWMXN79lsfWEBZ0DQYhtbcnw,5338
+quack/tile_scheduler.py,sha256=vbKq0xp94eII0uJ63yY_3sgvJkQI7Irc8y1OttO6cRA,42514
+quack/topk.py,sha256=43xHpRGbwZCSRsulmfrG4WA_r2eLHc3sniaUFU7wn-o,22522
+quack/utils.py,sha256=WIttE1iiwyPIwR1NpaeO26Pn9YkZb361TDxFTUDH-IE,7354
+quack/varlen_utils.py,sha256=SOYkomxX2FoqjYlybg99CqNhS9IARM6F9ba2AkIVvT4,15811
+quack/sort/bitonic_sort.py,sha256=VJPVjPulW_jEr3myBE7AiBYGtsc5T9FEy3sjXFukF7s,4831
+quack/sort/generate_sorting_networks.py,sha256=vkJBOjTVEinQkWT4OtFqOWxFVdTIPoNAQocneKc9-rM,14477
+quack/sort/sorting_networks.py,sha256=l_26zi3gXD_z-tnm2eAczRrmE-mbaz00KmqH6ONivL8,9686
+quack/sort/utils.py,sha256=RbubEY1GcEpsjiz_6o5o2WB47IeMOzaajW6Jis0s444,1059
+quack_kernels-0.2.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+quack_kernels-0.2.3.dist-info/METADATA,sha256=-WFp4n_2_bB8KMrDsO2AStm5bx4Av8gZE2wWeEEfcwQ,361
+quack_kernels-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+quack_kernels-0.2.3.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+quack_kernels-0.2.3.dist-info/RECORD,,

quack/gemm_act_sm90.py DELETED Viewed

@@ -1,368 +0,0 @@
-# Copyright (c) 2025, Tri Dao.
-from typing import Tuple, Optional, Callable
-from dataclasses import dataclass
-from torch import Tensor
-import cutlass
-import cutlass.cute as cute
-from cutlass.cute.nvgpu import warpgroup
-import cutlass.utils.hopper_helpers as sm90_utils
-from cutlass import Int32, Float32, Boolean, const_expr
-import cutlass.torch as cutlass_torch
-from quack.cute_dsl_utils import ArgumentsBase, ParamsBase
-from quack.dense_gemm_sm90 import GemmSm90
-from quack.cute_dsl_utils import get_max_active_clusters
-from quack.gemm_wrapper_utils import GemmWrapperBase
-import quack.activation
-class GemmActSm90(GemmSm90):
-    @dataclass
-    class EpilogueArguments(ArgumentsBase):
-        mPostAct: cute.Tensor
-        act_fn: cutlass.Constexpr[Optional[Callable]] = None
-        alpha: Optional[Float32] = None
-        beta: Optional[Float32] = None
-    @dataclass
-    class EpilogueParams(ParamsBase):
-        tma_atom_postact: cute.CopyAtom
-        mPostAct_mnl: cute.Tensor
-        epi_postact_smem_layout_staged: cute.ComposedLayout
-        act_fn: cutlass.Constexpr[Optional[Callable]] = None
-        alpha: Optional[Float32] = None
-        beta: Optional[Float32] = None
-    def epi_to_underlying_arguments(
-        self, args: EpilogueArguments, *, loc=None, ip=None
-    ) -> EpilogueParams:
-        self.postact_dtype = args.mPostAct.element_type
-        self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
-        self.tile_shape_postact_mn = self.tile_shape_mnk[:2]
-        self.epi_tile_postact = self.epi_tile
-        postact_major_mode_size = (
-            self.epi_tile_postact[1]
-            if self.postact_layout.is_n_major_c()
-            else self.epi_tile_postact[0]
-        )
-        postact_smem_layout_atom = warpgroup.make_smem_layout_atom(
-            sm90_utils.get_smem_layout_atom(
-                self.postact_layout, self.postact_dtype, postact_major_mode_size
-            ),
-            self.postact_dtype,
-        )
-        epi_postact_smem_layout_staged = cute.tile_to_shape(
-            postact_smem_layout_atom,
-            cute.append(self.epi_tile_postact, self.epi_stage),
-            order=(0, 1, 2),
-        )
-        tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
-            args.mPostAct,
-            epi_postact_smem_layout_staged,
-            self.epi_tile_postact,
-            store_or_load="store",
-        )
-        return GemmActSm90.EpilogueParams(
-            tma_atom_postact,
-            tma_tensor_postact,
-            epi_postact_smem_layout_staged,
-            args.act_fn,
-            args.alpha,
-            args.beta,
-        )
-    @staticmethod
-    def epi_smem_bytes_per_stage(
-        args: EpilogueArguments,
-        tile_shape_mnk: Tuple[int, int, int],
-        epi_tile: Tuple[int, int],
-    ) -> int:
-        postact_dtype = args.mPostAct.element_type
-        postact_bytes_per_stage = cute.size(epi_tile) * (postact_dtype.width // 8)
-        return postact_bytes_per_stage
-    def epi_get_smem_struct(self, params: EpilogueParams):
-        @cute.struct
-        class EpiSharedStorage:
-            sPostAct: cute.struct.Align[
-                cute.struct.MemRange[
-                    self.postact_dtype, cute.cosize(params.epi_postact_smem_layout_staged)
-                ],
-                self.buffer_align_bytes,
-            ]
-        return EpiSharedStorage
-    def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
-        sPostAct = storage.epi.sPostAct.get_tensor(
-            params.epi_postact_smem_layout_staged.outer,
-            swizzle=params.epi_postact_smem_layout_staged.inner,
-        )
-        return (sPostAct,)
-    @cute.jit
-    def epilogue(
-        self,
-        params: EpilogueParams,
-        epi_smem_tensors: Tuple[cute.Tensor, ...],
-        epi_pipeline: cutlass.pipeline.PipelineAsync,
-        epi_read_state: cutlass.pipeline.PipelineState,
-        epi_producer_state: cutlass.pipeline.PipelineState,
-        tiled_mma: cute.TiledMma,
-        tRS_rAcc: cute.Tensor,
-        tRS_rD: cute.Tensor,
-        tRS_rC: Optional[cute.Tensor],
-        tiled_copy_r2s: cute.core.ThrCopy,
-        tRS_sD: cute.Tensor,
-        tiled_copy_s2r: Optional[cute.core.ThrCopy],
-        tSR_rC: Optional[cute.Tensor],
-        tSR_sC: Optional[cute.Tensor],
-        copy_D: Optional[Callable],
-        bSG_sD: cute.Tensor,
-        bSG_gD: cute.Tensor,
-        epi_load_g2s: Optional[Callable],
-        tile_coord_mnkl: cute.Coord,
-        cu_seqlens_m: Optional[cute.Tensor],
-        epilogue_barrier: cutlass.pipeline.NamedBarrier,
-        tile_scheduler,
-        tidx: Int32,
-        is_tma_warp: Boolean,
-    ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
-        has_C = const_expr(tRS_rC is not None)
-        has_D = const_expr(copy_D is not None)
-        assert cu_seqlens_m is None, "GemmActSm90 doesn't support varlen_m for now"
-        tma_atom_postact = params.tma_atom_postact
-        mPostAct_mnl = params.mPostAct_mnl
-        (sPostAct,) = epi_smem_tensors
-        tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
-        copy_atom_postact_r2s = sm90_utils.sm90_get_smem_store_op(
-            self.postact_layout, elem_ty_d=self.postact_dtype, elem_ty_acc=self.acc_dtype
-        )
-        tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
-        thr_copy_postact_r2s = tiled_copy_postact_r2s.get_slice(tidx)
-        tRS_sPostAct = thr_copy_postact_r2s.partition_D(sPostAct)
-        bSG_sPostAct, bSG_gPostAct = self.epilog_gmem_copy_and_partition(
-            tma_atom_postact,
-            mPostAct_mnl,
-            self.tile_shape_postact_mn,
-            self.epi_tile_postact,
-            sPostAct,
-            tile_coord_mnkl,
-            cu_seqlens_m,
-        )
-        # We iterate over epi tiles in the N dimension first before the M dimension
-        epi_tile_shape = cute.zipped_divide(
-            cute.make_layout(self.tile_shape_mnk[:2]), self.epi_tile
-        ).shape[1]
-        epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
-        epi_tile_num = cute.size(epi_tile_shape)
-        num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
-        if const_expr(epi_load_g2s is not None):
-            for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
-                epi_producer_state = epi_load_g2s(epi_producer_state, epi_idx, is_tma_warp)
-        for epi_idx in cutlass.range_constexpr(epi_tile_num):
-            # Copy from acc to D registers
-            for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
-                tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
-            if const_expr(has_C):
-                epi_pipeline.consumer_wait(epi_read_state)
-                cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
-                # Fence to make sure shared memory read is visible to TMA load
-                cute.arch.fence_proxy(
-                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-                )
-                cute.arch.sync_warp()
-                with cute.arch.elect_one():
-                    epi_pipeline.consumer_release(epi_read_state)
-                epi_read_state.advance()
-            if const_expr(epi_load_g2s is not None and epi_idx + self.epi_c_stage < epi_tile_num):
-                epi_producer_state = epi_load_g2s(
-                    epi_producer_state, epi_idx + self.epi_c_stage, is_tma_warp
-                )
-            tRS_rPostAct = self.epi_visit_acc_subtile(params, tRS_rD, tRS_rC)
-            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
-            # Copy from D registers to shared memory
-            if const_expr(has_D):
-                # Type conversion
-                tRS_rD_out = cute.make_fragment_like(tRS_rD, self.d_dtype)
-                tRS_rD_out.store(tRS_rD.load().to(self.d_dtype))
-                cute.copy(tiled_copy_r2s, tRS_rD_out, tRS_sD[None, None, None, epi_buffer])
-            cute.copy(
-                tiled_copy_postact_r2s,
-                tiled_copy_postact_r2s.retile(tRS_rPostAct),
-                tRS_sPostAct[None, None, None, epi_buffer],
-            )
-            # Fence and barrier to make sure shared memory store is visible to TMA store
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
-            epilogue_barrier.arrive_and_wait()
-            # Get the global memory coordinate for the current epi tile
-            gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
-            # Copy from shared memory to global memory
-            if is_tma_warp:
-                if const_expr(has_D):
-                    copy_D(bSG_sD[None, epi_buffer], bSG_gD[None, gmem_coord])
-                cute.copy(
-                    tma_atom_postact,
-                    bSG_sPostAct[None, epi_buffer],
-                    bSG_gPostAct[None, gmem_coord],
-                )
-                cute.arch.cp_async_bulk_commit_group()
-                cute.arch.cp_async_bulk_wait_group(self.epi_stage - 1, read=True)
-            epilogue_barrier.arrive_and_wait()
-        return epi_read_state, epi_producer_state
-    @cute.jit
-    def epi_visit_acc_subtile(
-        self,
-        params: EpilogueParams,
-        tRS_rD: cute.Tensor,
-        tRS_rC: Optional[cute.Tensor] = None,
-    ) -> Optional[cute.Tensor]:
-        # Apply alpha scaling to accumulator if alpha is provided (not None)
-        if const_expr(params.alpha is not None):
-            tRS_rD.store(tRS_rD.load() * params.alpha)
-        # Apply C with beta scaling
-        if const_expr(tRS_rC is not None):
-            if const_expr(params.beta is None):
-                # beta is None, default behavior: add C (beta=1.0)
-                tRS_rD.store(tRS_rD.load() + tRS_rC.load().to(tRS_rD.element_type))
-            else:
-                tRS_rD.store(tRS_rD.load() + params.beta * tRS_rC.load().to(tRS_rD.element_type))
-        # Apply activation function if provided
-        # If we don't have .shape here, the compiler generates local stores and loads
-        if const_expr(params.act_fn is not None):
-            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
-            for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
-                tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
-        else:
-            tRS_rPostAct = tRS_rD
-        # Type conversion
-        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
-        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
-        return tRS_rPostAct_out
-act_fn_map = {
-    None: None,
-    "relu": quack.activation.relu,
-    "relu_sq": quack.activation.relu_sq,
-    "gelu_tanh_approx": quack.activation.gelu_tanh_approx,
-}
-def gemm_act_sm90(
-    A: Tensor,  # (l, m, k)
-    B: Tensor,  # (l, n, k)
-    D: Optional[Tensor],  # (l, m, n)
-    C: Optional[Tensor],  # (l, m, n)
-    PostAct: Tensor,  # (l, m, n)
-    activation: Optional[str],
-    tile_M: int,
-    tile_N: int,
-    cluster_M: int,
-    cluster_N: int,
-    pingpong: bool = False,
-    persistent: bool = True,
-    alpha: float = 1.0,
-    beta: float = 1.0,
-) -> None:
-    tile_count_semaphore = None
-    assert activation in act_fn_map, f"Unsupported activation {activation}"
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, D, C, additional_tensors={"PostAct": PostAct}
-    )
-    GemmWrapperBase.permute_tensors(tensor_infos)
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-        "PostAct": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-    acc_dtype = cutlass.Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmActSm90.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-    act_fn = act_fn_map[activation]
-    epi_args = GemmActSm90.EpilogueArguments(
-        tensor_infos["PostAct"].cute_tensor,
-        act_fn,
-        alpha=Float32(alpha) if alpha != 1.0 else None,
-        beta=Float32(beta) if beta != 1.0 else None,
-    )
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters, tile_count_semaphore
-    )
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        activation,
-        tile_shape_mn,
-        cluster_shape_mnk,
-        pingpong,
-        persistent,
-        tile_count_semaphore is not None,
-        alpha != 1.0,
-        beta != 1.0,
-        key_tensor_names=("A", "B", "D", "PostAct", "C"),
-    )
-    cache = gemm_act_sm90.compile_cache
-    if compile_key not in cache:
-        gemm = GemmActSm90(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            pingpong=pingpong,
-            is_persistent=persistent,
-        )
-        cache[compile_key] = cute.compile(
-            gemm,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            None,  # varlen_args
-            None,  # mAIdx
-            current_stream,
-        )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        None,
-        None,
-        current_stream,
-    )
-gemm_act_sm90.compile_cache = {}

quack/gemm_dact_sm90.py DELETED Viewed

@@ -1,150 +0,0 @@
-# Copyright (c) 2025, Tri Dao.
-from typing import Optional
-from torch import Tensor
-import cutlass
-import cutlass.cute as cute
-from cutlass import const_expr
-import cutlass.torch as cutlass_torch
-from quack.gemm_act_sm90 import GemmActSm90
-from quack.cute_dsl_utils import get_max_active_clusters
-from quack.gemm_wrapper_utils import GemmWrapperBase
-import quack.activation
-class GemmDActSm90(GemmActSm90):
-    # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
-    # and return 2 arguments (dx, out)
-    EpilogueArguments = GemmActSm90.EpilogueArguments
-    EpilogueParams = GemmActSm90.EpilogueParams
-    @cute.jit
-    def epi_visit_acc_subtile(
-        self,
-        params: EpilogueParams,
-        tRS_rD: cute.Tensor,
-        tRS_rC: Optional[cute.Tensor] = None,
-    ) -> Optional[cute.Tensor]:
-        assert tRS_rC is not None
-        tRS_rC_acc = cute.make_fragment_like(tRS_rC, self.acc_dtype)
-        tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
-        # If we don't have .shape here, the compiler generates local stores and loads
-        if const_expr(params.act_fn is not None):
-            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
-            for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
-                tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
-        else:
-            tRS_rPostAct = tRS_rC_acc
-        # Type conversion
-        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
-        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
-        return tRS_rPostAct_out
-dact_fn_map = {
-    None: None,
-    "relu": quack.activation.drelu,
-    "relu_sq": quack.activation.drelu_sq,
-    "gelu_tanh_approx": quack.activation.dgelu_tanh_approx,
-}
-def gemm_dact_sm90(
-    A: Tensor,  # (l, m, k)
-    B: Tensor,  # (l, n, k)
-    Out: Tensor,  # (l, m, n)
-    PreAct: Tensor,  # (l, m, n)
-    PostAct: Tensor,  # (l, m, n)
-    tile_count_semaphore: Optional[Tensor],  # (1,)
-    activation: Optional[str],
-    tile_M: int,
-    tile_N: int,
-    cluster_M: int,
-    cluster_N: int,
-    pingpong: bool = True,
-    persistent: bool = True,
-) -> None:
-    assert activation in dact_fn_map, f"Unsupported activation {activation}"
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, Out, PreAct, additional_tensors={"PostAct": PostAct}
-    )
-    GemmWrapperBase.permute_tensors(tensor_infos)
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-        "PostAct": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-    acc_dtype = cutlass.Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmDActSm90.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-    act_fn = dact_fn_map[activation]
-    epi_args = GemmDActSm90.EpilogueArguments(tensor_infos["PostAct"].cute_tensor, act_fn)
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters, tile_count_semaphore
-    )
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        activation,
-        tile_shape_mn,
-        cluster_shape_mnk,
-        pingpong,
-        persistent,
-        tile_count_semaphore is not None,
-        key_tensor_names=("A", "B", "D", "PostAct", "C"),
-    )
-    cache = gemm_dact_sm90.compile_cache
-    if compile_key not in cache:
-        gemm = GemmDActSm90(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            pingpong=pingpong,
-            is_persistent=persistent,
-        )
-        cache[compile_key] = cute.compile(
-            gemm,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            None,  # varlen_args
-            None,  # mAIdx
-            current_stream,
-        )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        None,
-        None,
-        current_stream,
-    )
-gemm_dact_sm90.compile_cache = {}

quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl