quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/gemm_act.py CHANGED
@@ -20,7 +20,8 @@ from quack.gemm_sm90 import GemmSm90
 from quack.gemm_sm100 import GemmSm100
 from quack.gemm_default_epi import GemmDefaultEpiMixin
 from quack.cute_dsl_utils import get_device_capacity, get_max_active_clusters
-from quack.gemm_wrapper_utils import GemmWrapperBase
+from quack.gemm_wrapper_utils import GemmTensorInfo, GemmWrapperBase
+from quack.layout_utils import permute_gated_Cregs_b16
 import quack.sm90_utils as sm90_utils
 import quack.copy_utils as copy_utils
 import quack.activation
@@ -241,9 +242,7 @@ class GemmActMixin(GemmDefaultEpiMixin):
 
         def tma_store_fn(src_idx, dst_idx):
             # Fence and barrier to make sure shared memory store is visible to TMA store
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
+            cute.arch.fence_view_async_shared()
             epilogue_barrier.arrive_and_wait()
             # Copy from shared memory to global memory
             if is_tma_warp:
@@ -268,9 +267,7 @@ class GemmActMixin(GemmDefaultEpiMixin):
             epi_pipeline.consumer_wait(epi_read_state)
             cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
             # Fence to make sure shared memory read is visible to TMA load
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
+            cute.arch.fence_view_async_shared()
             cute.arch.sync_warp()
             with cute.arch.elect_one():
                 epi_pipeline.consumer_release(epi_read_state)
@@ -327,7 +324,7 @@ class GemmActMixin(GemmDefaultEpiMixin):
         # Apply activation function if provided
         # If we don't have .shape here, the compiler generates local stores and loads
         if const_expr(params.act_fn is not None):
-            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
+            tRS_rPostAct = cute.make_rmem_tensor(tRS_rD.layout.shape, self.acc_dtype)
             if const_expr(self.arch < 100):
                 for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                     tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
@@ -508,3 +505,294 @@ def gemm_act(
 
 
 gemm_act.compile_cache = {}
+
+
+class GemmGatedMixin(GemmActMixin):
+    def epi_to_underlying_arguments(
+        self, args: GemmActMixin.EpilogueArguments, *, loc=None, ip=None
+    ) -> GemmActMixin.EpilogueParams:
+        self.postact_dtype = args.mPostAct.element_type
+        self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
+        assert self.postact_dtype.width == 16, "GemmGated only supports 16bit postact for now"
+        assert self.d_layout is None or self.d_layout.is_n_major_c()
+        assert self.postact_layout.is_n_major_c()
+        if self.arch == 90:
+            assert self.cta_tile_shape_mnk[1] % 32 == 0, (
+                "GemmGatedSm90 requires tileN to be divisible by 32"
+            )
+
+        self.cta_tile_shape_postact_mn = (
+            self.cta_tile_shape_mnk[0],
+            self.cta_tile_shape_mnk[1] // 2,
+        )
+        if isinstance(self.epi_tile[1], cute.Layout):
+            epi_tile_postact_1 = cute.recast_layout(2, 1, self.epi_tile[1])
+        else:
+            epi_tile_postact_1 = self.epi_tile[1] // 2
+        epi_tile_postact = (self.epi_tile[0], epi_tile_postact_1)
+        utils_cls = sm100_utils if self.arch == 100 else sm90_utils
+        epi_postact_smem_layout_staged = utils_cls.make_smem_layout_epi(
+            self.postact_dtype, self.postact_layout, epi_tile_postact, self.epi_stage
+        )
+        tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
+            args.mPostAct,
+            epi_postact_smem_layout_staged,
+            epi_tile_postact,
+            op_type="store",
+        )
+        # Assume all strides are divisible by 32 bits except the last stride
+        new_stride = lambda t: tuple(
+            cute.assume(s, divby=32 // t.element_type.width) if not cute.is_static(s) else s
+            for s in t.stride
+        )
+        mRowVecBroadcast, mColVecBroadcast = [
+            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
+            if t is not None
+            else None
+            for t in (args.mRowVecBroadcast, args.mColVecBroadcast)
+        ]
+        return self.EpilogueParams(
+            tma_atom_postact,
+            tma_tensor_postact,
+            epi_postact_smem_layout_staged,
+            epi_tile_postact,
+            args.act_fn,
+            alpha=args.alpha,
+            beta=args.beta,
+            mRowVecBroadcast=mRowVecBroadcast,
+            mColVecBroadcast=mColVecBroadcast,
+        )
+
+    @staticmethod
+    def epi_smem_bytes_per_stage(
+        args: GemmActMixin.EpilogueArguments,
+        cta_tile_shape_mnk: Tuple[int, int, int],
+        epi_tile: cute.Tile,
+    ) -> int:
+        postact_dtype = args.mPostAct.element_type
+        postact_bytes_per_stage = (cute.size(cute.shape(epi_tile)) // 2) * (
+            postact_dtype.width // 8
+        )
+        rowvec_colvec_bytes = GemmDefaultEpiMixin.epi_smem_bytes_per_stage(
+            args, cta_tile_shape_mnk, epi_tile
+        )
+        return postact_bytes_per_stage + rowvec_colvec_bytes
+
+    @cute.jit
+    def epi_visit_subtile(
+        self,
+        params: GemmActMixin.EpilogueParams,
+        epi_loop_tensors: Tuple[cute.Tensor, ...],
+        tRS_rD: cute.Tensor,
+        tRS_rC: Optional[cute.Tensor] = None,
+    ) -> Optional[cute.Tensor]:
+        GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC)
+        tRS_rPostAct_layout = cute.recast_layout(2, 1, tRS_rD.layout)
+        # If we don't have .shape here, the compiler generates local stores and loads
+        tRS_rPostAct = cute.make_rmem_tensor(tRS_rPostAct_layout.shape, self.acc_dtype)
+        if const_expr(self.arch < 100):
+            for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                tRS_rPostAct[i] = params.act_fn(tRS_rD[2 * i], tRS_rD[2 * i + 1])
+        else:
+            for i in cutlass.range(cute.size(tRS_rPostAct) // 2, unroll_full=True):
+                tRS_rPostAct[2 * i], tRS_rPostAct[2 * i + 1] = params.act_fn(
+                    (tRS_rD[4 * i], tRS_rD[4 * i + 2]), (tRS_rD[4 * i + 1], tRS_rD[4 * i + 3])
+                )
+        # Type conversion
+        tRS_rPostAct_out = cute.make_rmem_tensor_like(tRS_rPostAct, self.postact_dtype)
+        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+        if const_expr(self.arch == 90):
+            # Only need this if we're using STSM
+            permute_gated_Cregs_b16(tRS_rPostAct_out)
+        return tRS_rPostAct_out
+
+
+class GemmGatedSm90(GemmGatedMixin, GemmSm90):
+    pass
+
+
+class GemmGatedSm100(GemmGatedMixin, GemmSm100):
+    pass
+
+
+gate_fn_map = {
+    "swiglu": quack.activation.swiglu,
+    "swiglu_oai": quack.activation.swiglu_oai,
+    "reglu": quack.activation.reglu,
+    "geglu": quack.activation.geglu,
+    "glu": quack.activation.glu,
+}
+
+
+def gemm_gated(
+    A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m or (whatever, k) if gather_A with varlen_m
+    B: Tensor,  # (l, n, k)
+    D: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    PostAct: Tensor,  # (l, m, n//2) or (total_m, n//2) if varlen_m
+    tile_count_semaphore: Optional[Tensor],  # (1,)
+    activation: Optional[str],
+    tile_M: int,
+    tile_N: int,
+    cluster_M: int,
+    cluster_N: int,
+    pingpong: bool = False,
+    persistent: bool = True,
+    max_swizzle_size: int = 8,
+    rowvec_bias: Optional[Tensor] = None,  # (l, n)
+    colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
+    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
+    A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
+) -> None:
+    if cu_seqlens_m is not None:
+        assert persistent, "varlen_m requires persistent=True"
+        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
+        if D is not None:
+            assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
+        assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
+    gather_A = A_idx is not None
+    if gather_A:
+        assert cu_seqlens_m is not None, "gather_A requires varlen (cu_seqlens_m must be specified)"
+        assert cluster_N == 1, "gather_A requires cluster_N=1"
+    assert activation in gate_fn_map, f"Unsupported activation {activation}"
+
+    # Special validation for PostAct shape
+    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+        A, B, D, C, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx
+    )
+
+    # PostAct shape validation depends on varlen_m
+    if cu_seqlens_m is not None:
+        # varlen_m case: PostAct is 2D (total_m, n//2)
+        assert PostAct.dim() == 2 and PostAct.is_cuda, (
+            "PostAct must be a 2D CUDA tensor for varlen_m"
+        )
+        assert PostAct.shape == (
+            M,
+            N // 2,
+        ), f"PostAct must have shape {(M, N // 2)}, got {PostAct.shape}"
+    else:
+        # Normal case: PostAct is 3D (l, m, n//2)
+        assert PostAct.dim() == 3 and PostAct.is_cuda, "PostAct must be a 3D CUDA tensor"
+        assert PostAct.shape == (
+            L,
+            M,
+            N // 2,
+        ), f"PostAct must have shape {(L, M, N // 2)}, got {PostAct.shape}"
+
+    tensor_infos["PostAct"] = GemmTensorInfo(PostAct)
+    GemmWrapperBase.permute_tensors(tensor_infos, varlen_m=cu_seqlens_m is not None)
+    GemmWrapperBase.extract_dtypes(tensor_infos)
+    major_configs = {
+        "A": ("m", "k", "l"),
+        "B": ("n", "k", "l"),
+        "D": ("m", "n", "l"),
+        "C": ("m", "n", "l"),
+        "PostAct": ("m", "n", "l"),  # PostAct has shape (m, n//2, l) after permute
+    }
+    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+    device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
+    GemmCls = GemmGatedSm100 if device_capacity[0] > 9 else GemmGatedSm90
+
+    acc_dtype = cutlass.Float32
+    tile_shape_mn = (tile_M, tile_N)
+    cluster_shape_mnk = (cluster_M, cluster_N, 1)
+    if not GemmCls.is_valid_dtypes(
+        tensor_infos["A"].dtype,
+        tensor_infos["B"].dtype,
+        acc_dtype,
+        tensor_infos["D"].dtype,
+        tensor_infos["A"].major,
+        tensor_infos["B"].major,
+    ):
+        raise TypeError("Skipping due to unsupported combination of types and majors")
+
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+    act_fn = gate_fn_map[activation]
+    epi_args = GemmCls.EpilogueArguments(
+        tensor_infos["PostAct"].cute_tensor,
+        act_fn,
+        mRowVecBroadcast=(
+            from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=1)
+            if rowvec_bias is not None
+            else None
+        ),
+        mColVecBroadcast=(
+            from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
+                leading_dim=1 if cu_seqlens_m is None else 0
+            )
+            if colvec_bias is not None
+            else None
+        ),
+    )
+    scheduler_args = GemmWrapperBase.create_scheduler_args(
+        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
+    )
+
+    # Create varlen arguments if needed (assumes persistent=True when varlen_m)
+    varlen_args = GemmWrapperBase.create_varlen_args(
+        cu_seqlens_m,
+        None,  # cu_seqlens_k
+        A_idx,
+        max_active_clusters,
+        cluster_shape_mnk,
+        tensor_infos,
+        GemmCls.num_epi_tensormaps,
+        pingpong,
+    )
+
+    current_stream = cutlass_torch.current_stream()
+    compile_key = GemmWrapperBase.get_compile_key(
+        tensor_infos,
+        activation,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        tile_count_semaphore is not None,
+        device_capacity,
+        max_swizzle_size,
+        rowvec_bias.dtype if rowvec_bias is not None else None,
+        colvec_bias.dtype if colvec_bias is not None else None,
+        cu_seqlens_m is not None,
+        A_idx is not None,
+        key_tensor_names=("A", "B", "D", "PostAct", "C"),
+    )
+    cache = gemm_gated.compile_cache
+    if compile_key not in cache:
+        if device_capacity[0] == 9:
+            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
+        gemm_obj = GemmCls(
+            acc_dtype,
+            tensor_infos["A"].dtype,
+            tile_shape_mn,
+            cluster_shape_mnk,
+            gather_A=gather_A,
+        )
+        cache[compile_key] = cute.compile(
+            gemm_obj,
+            tensor_infos["A"].cute_tensor,
+            tensor_infos["B"].cute_tensor,
+            tensor_infos["D"].cute_tensor,
+            tensor_infos["C"].cute_tensor,
+            epi_args,
+            scheduler_args,
+            varlen_args,
+            current_stream,
+        )
+    cache[compile_key](
+        tensor_infos["A"].cute_tensor,
+        tensor_infos["B"].cute_tensor,
+        tensor_infos["D"].cute_tensor,
+        tensor_infos["C"].cute_tensor,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+        current_stream,
+    )
+
+
+gemm_gated.compile_cache = {}
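
For orientation, a minimal usage sketch of the new gemm_gated entry point in the plain (non-varlen) case. The argument order follows the signature added above; the shapes, dtypes, and tile/cluster sizes below are illustrative assumptions, not values prescribed by the package.

import torch
from quack.gemm_act import gemm_gated

l, m, n, k = 1, 4096, 8192, 4096  # n must be even; PostAct holds n // 2 columns
A = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(l, n, k, device="cuda", dtype=torch.bfloat16)
D = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)
PostAct = torch.empty(l, m, n // 2, device="cuda", dtype=torch.bfloat16)  # 16-bit, n-major

gemm_gated(
    A, B, D, C=None, PostAct=PostAct,
    tile_count_semaphore=None,
    activation="swiglu",  # one of: swiglu, swiglu_oai, reglu, geglu, glu
    tile_M=128, tile_N=256,  # assumed tile sizes; SM90 requires tile_N % 32 == 0
    cluster_M=2, cluster_N=1,
)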