PyPI - quack-kernels - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

quack-kernels 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

quack/__init__.py +1 -1
quack/activation.py +16 -25
quack/autotuner.py +64 -5
quack/cross_entropy.py +6 -10
quack/cute_dsl_utils.py +6 -7
quack/dense_gemm_sm90.py +582 -287
quack/gemm_act_sm90.py +70 -29
quack/gemm_dact_sm90.py +43 -10
quack/gemm_interface.py +453 -130
quack/{dense_gemm_sm100.py → gemm_sm100.py} +443 -419
quack/gemm_wrapper_utils.py +179 -22
quack/layernorm.py +1 -1
quack/reduce.py +6 -7
quack/rmsnorm.py +126 -158
quack/softmax.py +1 -1
quack/tile_scheduler.py +37 -49
quack/utils.py +61 -71
quack/varlen_utils.py +1 -6
{quack_kernels-0.2.0.dist-info → quack_kernels-0.2.2.dist-info}/METADATA +3 -3
quack_kernels-0.2.2.dist-info/RECORD +37 -0
quack_kernels-0.2.0.dist-info/RECORD +0 -37
{quack_kernels-0.2.0.dist-info → quack_kernels-0.2.2.dist-info}/WHEEL +0 -0
{quack_kernels-0.2.0.dist-info → quack_kernels-0.2.2.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.2.0.dist-info → quack_kernels-0.2.2.dist-info}/top_level.txt +0 -0

quack/gemm_wrapper_utils.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from typing import Optional, Tuple, Dict, Any
 from dataclasses import dataclass
+import torch
 from torch import Tensor
 import cutlass.cute as cute
@@ -9,6 +10,7 @@ from cutlass import Int32
 from cutlass.cute.runtime import from_dlpack, make_ptr
 from quack.cute_dsl_utils import torch2cute_dtype_map
+from quack.varlen_utils import VarlenArguments
 from quack.dense_gemm_sm90 import TileSchedulerOptions
@@ -22,8 +24,8 @@ class GemmTensorInfo:
 class GemmWrapperBase:
     @staticmethod
-    def validate_tensor_3d(tensor: Tensor, name: str) -> None:
-        assert tensor.dim() == 3 and tensor.is_cuda, f"{name} must be a 3D CUDA tensor"
+    def validate_tensor(tensor: Tensor, name: str, ndim: int) -> None:
+        assert tensor.dim() == ndim and tensor.is_cuda, f"{name} must be a {ndim}D CUDA tensor"
         assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"
     @staticmethod
@@ -47,7 +49,7 @@ class GemmWrapperBase:
     ) -> Optional[cute.Tensor]:
         if tensor is None:
             return None
-        # Tensor is already permuted to (dims[0], dims[1], dims[2])
+        # Tensor is already permuted to (dims[0], dims[1], dims[2]) or (dim[0], dim[1])
         # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
         leading_dim = 1 if major == dims[1] else 0
         return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
@@ -61,43 +63,131 @@ class GemmWrapperBase:
         D: Optional[Tensor] = None,
         C: Optional[Tensor] = None,
         additional_tensors: Optional[Dict[str, Tensor]] = None,
+        cu_seqlens_m: Optional[Tensor] = None,
+        cu_seqlens_k: Optional[Tensor] = None,
+        A_idx: Optional[Tensor] = None,
     ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
-        GemmWrapperBase.validate_tensor_3d(A, "A")
-        L, M, K = A.shape
-        GemmWrapperBase.validate_tensor_3d(B, "B")
-        _, N, _ = B.shape
+        assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
+            "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
+        )
         assert B.dtype == A.dtype, "A and B must have the same dtype"
-        GemmWrapperBase.validate_shape(B, (L, N, K), "B")
+        # Validate A_idx if provided (for gather_A case)
+        gather_A = A_idx is not None
+        if gather_A:
+            assert cu_seqlens_m is not None or cu_seqlens_k is not None, (
+                "gather_A requires either varlen_m or varlen_k"
+            )
+            assert A_idx.dtype == torch.int32, f"A_idx must be int32, got {A_idx.dtype}"
+            assert A_idx.dim() == 1, f"A_idx must be 1D, got {A_idx.dim()}D"
+        # Determine mode and extract dimensions
+        if cu_seqlens_m is not None:
+            # varlen_m: A is (total_m, k) or (whatever, k) if gather_A, B is (l, n, k), D/C are (total_m, n)
+            assert A.dim() == 2, f"A must be 2D when using varlen_m, got {A.dim()}D"
+            assert B.dim() == 3, f"B must be 3D with varlen_m, got {B.dim()}D"
+            if gather_A:
+                # When gather_A, A can have any number of rows, we use A_idx.shape[0] as total_M
+                total_M = A_idx.shape[0]
+                _, K = A.shape
+            else:
+                total_M, K = A.shape
+            L, N, K_B = B.shape
+            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
+            assert cu_seqlens_m.shape == (L + 1,), (
+                f"cu_seqlens_m must have shape ({L + 1},), got {cu_seqlens_m.shape}"
+            )
+            M = total_M
+            dc_shape = (total_M, N)
+            dc_ndim = 2
+        elif cu_seqlens_k is not None:
+            # varlen_k: A is (m, total_k) or (m, whatever) if gather_A, B is (n, total_k), D/C are (l, m, n)
+            assert A.dim() == 2, f"A must be 2D when using varlen_k, got {A.dim()}D"
+            assert B.dim() == 2, f"B must be 2D with varlen_k, got {B.dim()}D"
+            if gather_A:
+                # When gather_A with varlen_k, A can have any number of columns, we use A_idx.shape[0] as total_K
+                M, _ = A.shape
+                total_K = A_idx.shape[0]
+            else:
+                M, total_K = A.shape
+            N, K_B = B.shape
+            assert total_K == K_B, f"K dimension mismatch: expected {total_K}, B has {K_B}"
+            L = cu_seqlens_k.shape[0] - 1
+            assert cu_seqlens_k.shape == (L + 1,), (
+                f"cu_seqlens_k must have shape ({L + 1},), got {cu_seqlens_k.shape}"
+            )
+            K = total_K
+            dc_shape = (L, M, N)
+            dc_ndim = 3
+        else:
+            # Normal case - all tensors must be 3D
+            GemmWrapperBase.validate_tensor(A, "A", 3)
+            GemmWrapperBase.validate_tensor(B, "B", 3)
+            L, M, K = A.shape
+            _, N, K_B = B.shape
+            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
+            GemmWrapperBase.validate_shape(B, (L, N, K), "B")
+            dc_shape = (L, M, N)
+            dc_ndim = 3
+        # Validate D and C shapes uniformly
+        for tensor, name in [(D, "D"), (C, "C")]:
+            if tensor is not None:
+                assert tensor.dim() == dc_ndim, (
+                    f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
+                )
+                assert tensor.shape == dc_shape, (
+                    f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
+                )
         tensors = {
             "A": GemmTensorInfo(A),
             "B": GemmTensorInfo(B),
             "D": GemmTensorInfo(D),
             "C": GemmTensorInfo(C),
         }
-        if D is not None:
-            GemmWrapperBase.validate_tensor_3d(D, "D")
-            GemmWrapperBase.validate_shape(D, (L, M, N), "D")
-        if C is not None:
-            GemmWrapperBase.validate_tensor_3d(C, "C")
-            GemmWrapperBase.validate_shape(C, (L, M, N), "C")
         if additional_tensors:
             for name, tensor in additional_tensors.items():
                 if tensor is not None:
-                    GemmWrapperBase.validate_tensor_3d(tensor, name)
-                    GemmWrapperBase.validate_shape(tensor, (L, M, N), name)
+                    assert tensor.dim() == dc_ndim, (
+                        f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
+                    )
+                    assert tensor.shape == dc_shape, (
+                        f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
+                    )
                 tensors[name] = GemmTensorInfo(tensor)
         return L, M, K, N, tensors
     @staticmethod
-    def permute_tensors(tensors: Dict[str, GemmTensorInfo]) -> None:
-        for info in tensors.values():
-            if info.tensor is not None:
-                info.tensor = info.tensor.permute(1, 2, 0)
+    def permute_tensors(
+        tensors: Dict[str, GemmTensorInfo], varlen_m: bool = False, varlen_k: bool = False
+    ) -> None:
+        # Determine which tensors need permutation
+        if varlen_m:
+            # Only B needs permutation (3D tensor)
+            tensors_to_permute = ["B"]
+        elif varlen_k:
+            # Only D and C need permutation (3D tensors)
+            tensors_to_permute = ["D", "C"]
+        else:
+            # All tensors need permutation
+            tensors_to_permute = None
+        # Apply permutation from (L, *, *) -> (*, *, L) for selected tensors
+        for name, info in tensors.items():
+            if info.tensor is not None and info.tensor.ndim == 3:
+                if tensors_to_permute is None or name in tensors_to_permute:
+                    info.tensor = info.tensor.permute(1, 2, 0)
     @staticmethod
     def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
-        for info in tensors.values():
+        for name, info in tensors.items():
             if info.tensor is not None:
                 info.dtype = torch2cute_dtype_map[info.tensor.dtype]
@@ -121,7 +211,9 @@ class GemmWrapperBase:
     @staticmethod
     def create_scheduler_args(
-        max_active_clusters: int, tile_count_semaphore: Optional[Tensor] = None
+        max_active_clusters: int,
+        tile_count_semaphore: Optional[Tensor] = None,
+        batch_idx_permute: Optional[Tensor] = None,
     ) -> TileSchedulerOptions:
         return TileSchedulerOptions(
             Int32(max_active_clusters),
@@ -130,6 +222,71 @@ class GemmWrapperBase:
             )
             if tile_count_semaphore is not None
             else None,
+            batch_idx_permute=(
+                from_dlpack(batch_idx_permute, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+            )
+            if batch_idx_permute is not None
+            else None,
+        )
+    @staticmethod
+    def create_varlen_args(
+        cu_seqlens_m: Optional[Tensor],
+        cu_seqlens_k: Optional[Tensor],
+        A_idx: Optional[Tensor],
+        max_active_clusters: int,
+        cluster_shape_mnk: Tuple[int, int, int],
+        tensors: Dict[str, GemmTensorInfo],
+        num_epi_tensormaps: int = 0,
+        pingpong: bool = False,
+    ) -> Optional[Any]:
+        if cu_seqlens_m is None and cu_seqlens_k is None:
+            return None
+        # When varlen_m, we assume persistent=True
+        # Grid size depends on num_active_clusters and cluster size
+        cluster_size = cluster_shape_mnk[0] * cluster_shape_mnk[1]
+        num_blocks = max_active_clusters * cluster_size
+        # Calculate number of tensormaps needed
+        if cu_seqlens_m is not None:
+            # For varlen_m: need tensormaps for D and epilogue tensors
+            num_tensormaps = num_epi_tensormaps * (1 if not pingpong else 2)
+            if tensors["D"].tensor is not None:
+                num_tensormaps += 1 if not pingpong else 2  # D tensormap
+        else:
+            # For varlen_k: need tensormaps for A & B
+            num_tensormaps = 2 if A_idx is None else 1
+        # Create tensormap buffer (each tensormap is 128 bytes = 16 int64s)
+        tensormap_size = 128 // 8  # 16 int64s
+        if num_tensormaps > 0:
+            device = cu_seqlens_m.device if cu_seqlens_m is not None else cu_seqlens_k.device
+            tensormaps = torch.empty(
+                (num_blocks, num_tensormaps, tensormap_size),
+                dtype=torch.int64,
+                device=device,
+            )
+            tensormaps_cute = from_dlpack(tensormaps, assumed_align=128).mark_compact_shape_dynamic(
+                mode=0, stride_order=(0, 1, 2)
+            )
+        else:
+            tensormaps_cute = None
+        return VarlenArguments(
+            mCuSeqlensM=(
+                from_dlpack(cu_seqlens_m, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                if cu_seqlens_m is not None
+                else None
+            ),
+            mCuSeqlensK=(
+                from_dlpack(cu_seqlens_k, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                if cu_seqlens_k is not None
+                else None
+            ),
+            mTensormaps=tensormaps_cute,
+            mAIdx=(
+                from_dlpack(A_idx, assumed_align=4).mark_layout_dynamic(leading_dim=0)
+                if A_idx is not None
+                else None
+            ),
         )
     @staticmethod

quack/layernorm.py CHANGED Viewed

@@ -217,7 +217,7 @@ class LayerNorm(ReductionBase):
             mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
             init_val=0.0,
         )
-        rstd = utils.rsqrt(sum_sq_x_sub_mean / shape[1] + eps)
+        rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
         if cutlass.const_expr(mRstd is not None):
             # Only the thread corresponding to column 0 writes out the rstd to gmem
             if (

quack/reduce.py CHANGED Viewed

@@ -159,8 +159,7 @@ def online_softmax_reduce(
         width=min(threads_per_row, cute.arch.WARP_SIZE),
     )
     log2_e = math.log2(math.e)
-    exp_x = utils.exp2f(x * log2_e - (max_x * log2_e))
-    # exp_x = exp2f((x - max_x) * log2_e)
+    exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
     sum_exp_x = warp_reduce(
         exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
         operator.add,
@@ -190,10 +189,10 @@ def online_softmax_reduce(
                         reduction_buffer[row_idx, lane_idx]
                     )
                 max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
-                sum_exp_x *= utils.exp2f((max_x_single_warp - max_x_final) * log2_e)
+                sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
                 sum_exp_x = warp_reduce(sum_exp_x, operator.add)
                 if cutlass.const_expr(return_exp_x):
-                    exp_x *= utils.exp2f((max_x - max_x_final) * log2_e)
+                    exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
                 max_x = max_x_final
             else:
                 cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
@@ -231,11 +230,11 @@ def online_softmax_reduce(
                 max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
                 sum_exp_x = 0.0
                 for i in cutlass.range_constexpr(num_iter):
-                    sum_exp_x += sum_exp_x_single_warp[i] * utils.exp2f(
-                        (max_x_single_warp[i] - max_x_final) * log2_e
+                    sum_exp_x += sum_exp_x_single_warp[i] * cute.math.exp(
+                        max_x_single_warp[i] - max_x_final, fastmath=True
                     )
                 sum_exp_x = warp_reduce(sum_exp_x, operator.add)
                 if cutlass.const_expr(return_exp_x):
-                    exp_x *= utils.exp2f((max_x - max_x_final) * log2_e)
+                    exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
                 max_x = max_x_final
     return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)

quack-kernels 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

quack-kernels 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl