quack-kernels 0.1.11-py3-none-any.whl → 0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +7 -3
- quack/activation.py +279 -0
- quack/autotuner.py +2 -1
- quack/cross_entropy.py +330 -184
- quack/cute_dsl_utils.py +83 -4
- quack/dense_gemm_sm100.py +1 -1
- quack/dense_gemm_sm90.py +911 -1140
- quack/fast_math.py +10 -27
- quack/gemm_act_sm90.py +368 -0
- quack/gemm_config.py +43 -35
- quack/gemm_dact_sm90.py +150 -0
- quack/gemm_interface.py +491 -243
- quack/gemm_wrapper_utils.py +158 -0
- quack/layernorm.py +6 -4
- quack/linear.py +128 -64
- quack/linear_cross_entropy.py +275 -0
- quack/mlp.py +30 -160
- quack/pipeline.py +2 -17
- quack/reduce.py +240 -0
- quack/reduction_base.py +2 -11
- quack/rmsnorm.py +614 -228
- quack/softmax.py +28 -16
- quack/symmetric_dense_gemm_sm90.py +6 -3
- quack/tensormap_manager.py +1 -0
- quack/tile_scheduler.py +64 -61
- quack/topk.py +14 -8
- quack/utils.py +14 -322
- quack/varlen_utils.py +22 -0
- {quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/METADATA +3 -3
- quack_kernels-0.2.1.dist-info/RECORD +37 -0
- quack/lse.py +0 -62
- quack_kernels-0.1.11.dist-info/RECORD +0 -31
- {quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/WHEEL +0 -0
- {quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/top_level.txt +0 -0
quack/gemm_wrapper_utils.py
ADDED
@@ -0,0 +1,158 @@
+# Copyright (c) 2025, Tri Dao.
+from typing import Optional, Tuple, Dict, Any
+from dataclasses import dataclass
+
+from torch import Tensor
+
+import cutlass.cute as cute
+from cutlass import Int32
+from cutlass.cute.runtime import from_dlpack, make_ptr
+
+from quack.cute_dsl_utils import torch2cute_dtype_map
+from quack.dense_gemm_sm90 import TileSchedulerOptions
+
+
+@dataclass
+class GemmTensorInfo:
+    tensor: Optional[Tensor]
+    dtype: Optional[Any] = None
+    major: Optional[str] = None
+    cute_tensor: Optional[cute.Tensor] = None
+
+
+class GemmWrapperBase:
+    @staticmethod
+    def validate_tensor_3d(tensor: Tensor, name: str) -> None:
+        assert tensor.dim() == 3 and tensor.is_cuda, f"{name} must be a 3D CUDA tensor"
+        assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"
+
+    @staticmethod
+    def validate_shape(tensor: Tensor, expected_shape: Tuple[int, ...], name: str) -> None:
+        assert tensor.shape == expected_shape, (
+            f"{name} must have shape {expected_shape}, got {tensor.shape}"
+        )
+
+    @staticmethod
+    def get_major_order(tensor: Tensor, dims: Tuple[str, str, str]) -> str:
+        # Tensor is already permuted to (dims[0], dims[1], dims[2])
+        # stride(1) == 1 means dims[1] is contiguous (innermost)
+        return dims[1] if tensor.stride(1) == 1 else dims[0]
+
+    @staticmethod
+    def create_cute_tensor(
+        tensor: Optional[Tensor],
+        major: Optional[str],
+        dims: Tuple[str, str, str],
+        assumed_align: int = 16,
+    ) -> Optional[cute.Tensor]:
+        if tensor is None:
+            return None
+        # Tensor is already permuted to (dims[0], dims[1], dims[2])
+        # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
+        leading_dim = 1 if major == dims[1] else 0
+        return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
+            leading_dim=leading_dim
+        )
+
+    @staticmethod
+    def validate_and_prepare_tensors(
+        A: Tensor,
+        B: Tensor,
+        D: Optional[Tensor] = None,
+        C: Optional[Tensor] = None,
+        additional_tensors: Optional[Dict[str, Tensor]] = None,
+    ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
+        GemmWrapperBase.validate_tensor_3d(A, "A")
+        L, M, K = A.shape
+        GemmWrapperBase.validate_tensor_3d(B, "B")
+        _, N, _ = B.shape
+        assert B.dtype == A.dtype, "A and B must have the same dtype"
+        GemmWrapperBase.validate_shape(B, (L, N, K), "B")
+        tensors = {
+            "A": GemmTensorInfo(A),
+            "B": GemmTensorInfo(B),
+            "D": GemmTensorInfo(D),
+            "C": GemmTensorInfo(C),
+        }
+        if D is not None:
+            GemmWrapperBase.validate_tensor_3d(D, "D")
+            GemmWrapperBase.validate_shape(D, (L, M, N), "D")
+        if C is not None:
+            GemmWrapperBase.validate_tensor_3d(C, "C")
+            GemmWrapperBase.validate_shape(C, (L, M, N), "C")
+        if additional_tensors:
+            for name, tensor in additional_tensors.items():
+                if tensor is not None:
+                    GemmWrapperBase.validate_tensor_3d(tensor, name)
+                    GemmWrapperBase.validate_shape(tensor, (L, M, N), name)
+                tensors[name] = GemmTensorInfo(tensor)
+
+        return L, M, K, N, tensors
+
+    @staticmethod
+    def permute_tensors(tensors: Dict[str, GemmTensorInfo]) -> None:
+        for info in tensors.values():
+            if info.tensor is not None:
+                info.tensor = info.tensor.permute(1, 2, 0)
+
+    @staticmethod
+    def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
+        for info in tensors.values():
+            if info.tensor is not None:
+                info.dtype = torch2cute_dtype_map[info.tensor.dtype]
+
+    @staticmethod
+    def determine_major_orders(
+        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
+    ) -> None:
+        for name, dims in major_configs.items():
+            if name in tensors and tensors[name].tensor is not None:
+                tensors[name].major = GemmWrapperBase.get_major_order(tensors[name].tensor, dims)
+
+    @staticmethod
+    def create_cute_tensors(
+        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
+    ) -> None:
+        for name, info in tensors.items():
+            if info.tensor is not None and name in major_configs:
+                info.cute_tensor = GemmWrapperBase.create_cute_tensor(
+                    info.tensor, info.major, major_configs[name]
+                )
+
+    @staticmethod
+    def create_scheduler_args(
+        max_active_clusters: int, tile_count_semaphore: Optional[Tensor] = None
+    ) -> TileSchedulerOptions:
+        return TileSchedulerOptions(
+            Int32(max_active_clusters),
+            tile_count_semaphore=make_ptr(
+                Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
+            )
+            if tile_count_semaphore is not None
+            else None,
+        )
+
+    @staticmethod
+    def get_compile_key(
+        tensors: Dict[str, GemmTensorInfo],
+        activation: Optional[str],
+        tile_shape_mn: Tuple[int, int],
+        cluster_shape_mnk: Tuple[int, int, int],
+        pingpong: bool,
+        persistent: bool,
+        has_semaphore: bool,
+        *args,
+        key_tensor_names: Tuple[str, ...] = ("A", "B", "D", "C"),
+    ) -> Tuple:
+        key_parts = []
+        for name in key_tensor_names:
+            if name in tensors:
+                key_parts.append(tensors[name].dtype)
+        key_parts.append(activation)
+        key_parts.extend([tile_shape_mn, cluster_shape_mnk])
+        for name in key_tensor_names:
+            if name in tensors:
+                key_parts.append(tensors[name].major)
+        key_parts.extend([pingpong, persistent, has_semaphore])
+        key_parts.extend(args)
+        return tuple(key_parts)
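
The helpers above are consumed by the refactored GEMM wrappers (see quack/gemm_interface.py in this release). As a rough orientation, the sketch below chains them in the order their docstrings imply; the (m, k, l)/(n, k, l) dimension labels in major_configs and the batch-first (L, M, K) input layout are assumptions inferred from validate_and_prepare_tensors, not a documented API.

# Hypothetical usage sketch of GemmWrapperBase; dim labels and shapes are assumptions.
import torch
from quack.gemm_wrapper_utils import GemmWrapperBase

L, M, N, K = 1, 4096, 4096, 1024
A = torch.randn(L, M, K, device="cuda", dtype=torch.bfloat16)   # (l, m, k)
B = torch.randn(L, N, K, device="cuda", dtype=torch.bfloat16)   # (l, n, k)
D = torch.empty(L, M, N, device="cuda", dtype=torch.bfloat16)   # (l, m, n)

L, M, K, N, tensors = GemmWrapperBase.validate_and_prepare_tensors(A, B, D)
GemmWrapperBase.permute_tensors(tensors)   # (l, m, k) -> (m, k, l), etc.
GemmWrapperBase.extract_dtypes(tensors)

# Assumed labels after permutation: A is (m, k, l), B is (n, k, l), D/C are (m, n, l).
major_configs = {
    "A": ("m", "k", "l"),
    "B": ("n", "k", "l"),
    "D": ("m", "n", "l"),
    "C": ("m", "n", "l"),
}
GemmWrapperBase.determine_major_orders(tensors, major_configs)
GemmWrapperBase.create_cute_tensors(tensors, major_configs)

# A compile-cache key a wrapper could use to memoize kernels per dtype/layout/config.
key = GemmWrapperBase.get_compile_key(
    tensors, None, (128, 256), (1, 1, 1),
    pingpong=False, persistent=True, has_semaphore=False,
)
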
quack/layernorm.py
CHANGED
@@ -10,7 +10,9 @@ import cutlass
 import cutlass.cute as cute
 from cutlass.cute.runtime import from_dlpack
 import quack.utils as utils
-from quack.
+from quack.reduce import row_reduce
+from quack.reduction_base import ReductionBase
+from quack.cute_dsl_utils import torch2cute_dtype_map


 class LayerNorm(ReductionBase):
@@ -190,7 +192,7 @@ class LayerNorm(ReductionBase):
         cute.autovec_copy(tXsX, tXrX)
         x = tXrX.load().to(cute.Float32)
         threads_per_row = tv_layout.shape[0][0]
-        sum_x =
+        sum_x = row_reduce(
             x,
             cute.ReductionOp.ADD,
             threads_per_row,
@@ -207,7 +209,7 @@ class LayerNorm(ReductionBase):
         cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
         x = tXrX.load().to(cute.Float32)

-        sum_sq_x_sub_mean =
+        sum_sq_x_sub_mean = row_reduce(
             (x - mean) * (x - mean),
             cute.ReductionOp.ADD,
             threads_per_row,
@@ -215,7 +217,7 @@ class LayerNorm(ReductionBase):
             mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
             init_val=0.0,
         )
-        rstd =
+        rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
         if cutlass.const_expr(mRstd is not None):
             # Only the thread corresponding to column 0 writes out the rstd to gmem
             if (
quack/linear.py
CHANGED
@@ -1,4 +1,6 @@
 # Copyright (c) 2025, Tri Dao
+from functools import partial
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,10 +8,7 @@ from torch import Tensor
 from torch.amp import custom_fwd, custom_bwd


-from
-# from gemm_cublas.interface import gemm_tuned as gemm_cb, gemm_add_tuned_ as gemm_add_cb_
-
-from quack import gemm, gemm_lse  # TODO: implement these
+from quack.gemm_interface import gemm, gemm_add_inplace, gemm_act, gemm_dact


 def linear_fwd_convert_type(*tensors):
@@ -19,7 +18,8 @@ def linear_fwd_convert_type(*tensors):
     return tensors


-def linear_fwd_postprocess(ctx, x, weight, weight_og,
+def linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad):
+    needs_input_grad, needs_weight_grad = needs_x_w_grad
     if not needs_input_grad:
         weight, weight_og = None, None
     if not needs_weight_grad:
@@ -27,29 +27,24 @@ def linear_fwd_postprocess(ctx, x, weight, weight_og, needs_input_grad, needs_we
     ctx.save_for_backward(x, weight, weight_og if ctx.fuse_grad_accum else None)


-def linear_bwd_compute_input_grad(ctx, dout, weight,
+def linear_bwd_compute_input_grad(ctx, dout, weight, matmul_fn):
     if ctx.needs_input_grad[0]:
         assert weight is not None
-
-        return (
-            gemm(dout, weight, sm_carveout=sm_carveout)
-            if use_tuned_gemm
-            else gemm_cb(dout, weight, sm_carveout=sm_carveout)
-        )
+        return matmul_fn(dout, weight)
     else:
         return None


-def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og,
+def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og, matmul_fn, matmul_inplace_fn):
     if ctx.needs_input_grad[1]:
         assert x is not None
         x = x.reshape(-1, x.shape[-1])
         # fuse_grad_accum is not compatible with torch.compile
         if not ctx.fuse_grad_accum or weight_og.grad is None or torch.compiler.is_compiling():
-            dweight =
+            dweight = matmul_fn(dout.T, x, out_dtype=ctx.weight_dtype)
         else:
             # print("Using fuse grad accum in Linear", dout.shape, x.shape, weight_og.grad.shape)
-
+            matmul_inplace_fn(dout.T, x, weight_og.grad)
             dweight = weight_og.grad
             weight_og.grad = None  # So that pytorch doesn't add dweight to weight_og.grad again
     else:
@@ -58,9 +53,15 @@ def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og, sm_carveout=0):


 class LinearFunc(torch.autograd.Function):
-
+    matmul_fwd_fn = gemm
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
+
+    # Use classmethod instead of staticmethod to allow inheritance
+    @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(ctx, x, weight, fuse_grad_accum=False):
+    def forward(cls, ctx, x, weight, fuse_grad_accum=False):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
@@ -73,77 +74,145 @@ class LinearFunc(torch.autograd.Function):
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
         # out = F.linear(x, weight)
-        out =
-        linear_fwd_postprocess(
-            ctx,
-            x,
-            weight,
-            weight_og,
-            needs_input_grad=ctx.needs_input_grad[0],
-            needs_weight_grad=ctx.needs_input_grad[1],
-        )
+        out = cls.matmul_fwd_fn(x, weight.T)
+        linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
         return out.reshape(*batch_shape, out.shape[-1])

-    @
+    @classmethod
     @custom_bwd(device_type="cuda")
-    def backward(ctx, dout):
+    def backward(cls, ctx, dout, *args):
         """
         dout: (..., out_features)
         """
         x, weight, weight_og = ctx.saved_tensors  # weight_og is None if not ctx.fuse_grad_accum
         batch_shape = dout.shape[:-1]
         dout = dout.reshape(-1, dout.shape[-1])
-        dx = linear_bwd_compute_input_grad(ctx, dout, weight,
+        dx = linear_bwd_compute_input_grad(ctx, dout, weight, cls.matmul_bwd_dx)
         dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
-        dweight = linear_bwd_compute_weight_grad(
-
+        dweight = linear_bwd_compute_weight_grad(
+            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
+        )
+        # return extra Nones for other classes that inherit from LinearFunc
+        return dx, dweight, *([None] * 10)
+
+
+class LinearUntunedFunc(LinearFunc):
+    # Passing in tuned=False to disable tuning at runtime
+    matmul_fwd_fn = partial(gemm, tuned=False)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


-def linear_func(x, weight, fuse_grad_accum=False):
-
+def linear_func(x, weight, fuse_grad_accum=False, tuned=True):
+    fn_cls = LinearFunc if tuned else LinearUntunedFunc
+    return fn_cls.apply(x, weight, fuse_grad_accum)


-class
-
+class LinearActFunc(LinearFunc):
+    matmul_fwd_fn = gemm_act
+
+    # Use classmethod instead of staticmethod to allow inheritance
+    @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(ctx, x, weight, fuse_grad_accum=False):
+    def forward(cls, ctx, x, weight, activation, store_preact=True, fuse_grad_accum=False):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
         out: (..., out_features)
+        Return both out and post-activation, but only out is differentiable.
         """
-        needs_weight_grad = weight.requires_grad
-        needs_input_grad = x.requires_grad
         ctx.weight_dtype = weight.dtype
         ctx.fuse_grad_accum = fuse_grad_accum
         weight_og = weight
         x, weight = linear_fwd_convert_type(x, weight)
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
-        out,
-
-
-        ctx.
-
+        out, postact = cls.matmul_fwd_fn(
+            x, weight.T, activation=activation, store_preact=store_preact
+        )
+        linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
+        if out is not None:
+            out = out.reshape(*batch_shape, out.shape[-1])
+        ctx.mark_non_differentiable(postact)
+        ctx.set_materialize_grads(False)  # We don't want to materialize grads for postact
+        return out, postact.reshape(*batch_shape, postact.shape[-1])

-
+
+class LinearActUntunedFunc(LinearActFunc):
+    # Passing in tuned=False to disable tuning at runtime
+    matmul_fwd_fn = partial(gemm_act, tuned=False)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
+
+
+def linear_act_func(x, weight, activation, store_preact=True, fuse_grad_accum=False, tuned=True):
+    fn_cls = LinearActFunc if tuned else LinearActUntunedFunc
+    return fn_cls.apply(x, weight, activation, store_preact, fuse_grad_accum)
+
+
+class DActLinearFunc(LinearFunc):
+    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True)
+
+    # Use classmethod instead of staticmethod to allow inheritance
+    @classmethod
+    @custom_fwd(device_type="cuda")
+    def forward(cls, ctx, preact, weight, x, activation, fuse_grad_accum=False):
+        """
+        x: (..., in_features)
+        weight: (out_features, in_features)
+        out: (..., out_features)
+        Takes in an extra preact argument which is the pre-activation, to be used in the backward pass.
+        """
+        ctx.weight_dtype = weight.dtype
+        ctx.fuse_grad_accum = fuse_grad_accum
+        weight_og = weight
+        x, weight = linear_fwd_convert_type(x, weight)
+        batch_shape = x.shape[:-1]
+        x = x.reshape(-1, x.shape[-1])
+        out = cls.matmul_fwd_fn(x, weight.T)
+        # Store preact instead of x, we will recompute x in the backward pass
+        linear_fwd_postprocess(
+            ctx, preact, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2]
+        )
+        ctx.activation = activation
+        return out.reshape(*batch_shape, out.shape[-1])
+
+    @classmethod
     @custom_bwd(device_type="cuda")
-    def backward(ctx, dout
+    def backward(cls, ctx, dout):
         """
         dout: (..., out_features)
         """
-
+        # weight_og is None if not ctx.fuse_grad_accum
+        preact, weight, weight_og = ctx.saved_tensors
         batch_shape = dout.shape[:-1]
         dout = dout.reshape(-1, dout.shape[-1])
-
-
-
-
-
+        preact = preact.reshape(-1, preact.shape[-1])
+        if ctx.needs_input_grad[0]:
+            assert weight is not None
+            dpreact, x = cls.matmul_bwd_dx(dout, weight, preact, activation=ctx.activation)
+        else:
+            dpreact, x = None, None
+        dpreact = dpreact.reshape(*batch_shape, dpreact.shape[-1]) if dpreact is not None else None
+        dweight = linear_bwd_compute_weight_grad(
+            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
+        )
+        return dpreact, dweight, *([None] * 3)
+

+class DActLinearUntunedFunc(DActLinearFunc):
+    # Passing in tuned=False to disable tuning at runtime
+    matmul_fwd_fn = partial(gemm, tuned=False)
+    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)

-
-
+
+def act_linear_func(preact, weight, x, activation, fuse_grad_accum=False, tuned=True):
+    fn_cls = DActLinearFunc if tuned else DActLinearUntunedFunc
+    return fn_cls.apply(preact, weight, x, activation, fuse_grad_accum)


 class Linear(nn.Linear):
@@ -160,17 +229,12 @@ class Linear(nn.Linear):
         self.fuse_grad_accum = fuse_grad_accum

     def forward(self, input: Tensor) -> Tensor:
-        if
+        if (
+            self.bias is None
+            and input.is_cuda
+            and self.in_features % 8 == 0
+            and self.out_features % 8 == 0
+        ):
             return linear_func(input, self.weight, fuse_grad_accum=self.fuse_grad_accum)
         else:
             return F.linear(input, self.weight, self.bias)
-
-
-class LinearLSE(Linear):
-    def forward(self, input: Tensor) -> Tensor:
-        if self.bias is None and input.is_cuda:
-            return linear_lse_func(input, self.weight, fuse_grad_accum=self.fuse_grad_accum)
-        else:
-            out = F.linear(input, self.weight, self.bias)
-            lse = torch.logsumexp(out, dim=-1)
-            return out, lse