quack-kernels 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/softmax.py CHANGED
@@ -9,7 +9,9 @@ import cutlass.cute as cute
 from cutlass.cute.runtime import from_dlpack

 import quack.utils as utils
-from quack.reduction_base import ReductionBase, torch2cute_dtype_map
+from quack.reduce import row_reduce, online_softmax_reduce
+from quack.reduction_base import ReductionBase
+from quack.cute_dsl_utils import torch2cute_dtype_map


 class Softmax(ReductionBase):
@@ -147,7 +149,7 @@ class Softmax(ReductionBase):
         x = tXrX.load().to(cute.Float32)
         threads_per_row = tv_layout.shape[0][0]
         if cutlass.const_expr(not self.online_softmax):
-            max_x = utils.row_reduce(
+            max_x = row_reduce(
                 x,
                 cute.ReductionOp.MAX,
                 threads_per_row,
@@ -158,7 +160,7 @@ class Softmax(ReductionBase):
             )
             log2_e = math.log2(math.e)
             exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
-            denom = utils.row_reduce(
+            denom = row_reduce(
                 exp_x,
                 cute.ReductionOp.ADD,
                 threads_per_row,
@@ -167,7 +169,7 @@ class Softmax(ReductionBase):
                 init_val=0.0,
             )
         else:
-            max_x, denom, exp_x = utils.online_softmax_reduce(
+            max_x, denom, exp_x = online_softmax_reduce(
                 x,
                 threads_per_row,
                 reduction_buffer[None, None, 0],
@@ -186,7 +188,8 @@ class Softmax(ReductionBase):
         cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)


-def _softmax_fwd(x: torch.Tensor) -> torch.Tensor:
+@torch.library.custom_op("quack::_softmax_fwd", mutates_args={"out"})
+def _softmax_fwd(x: torch.Tensor, out: torch.Tensor) -> None:
     """Softmax forward pass.
     Args:
         x: Input tensor of shape (M, N)
@@ -196,8 +199,7 @@ def _softmax_fwd(x: torch.Tensor) -> torch.Tensor:
     assert x.dim() == 2, "Input must be 2D"
     assert x.is_cuda, "Tensor must be on CUDA device"
     assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
-    M, N = x.shape
-    out = torch.empty_like(x)
+    N = x.size(1)
     dtype = torch2cute_dtype_map[x.dtype]
     convert_from_dlpack = lambda tensor: (
         from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
@@ -213,12 +215,17 @@ def _softmax_fwd(x: torch.Tensor) -> torch.Tensor:
             softmax_op, x_tensor, out_tensor, current_stream
         )
     _softmax_fwd.compile_cache[compile_key](x_tensor, out_tensor, current_stream)
-    return out


 _softmax_fwd.compile_cache = {}


+def softmax_fwd(x: torch.Tensor) -> torch.Tensor:
+    out = torch.empty_like(x)
+    _softmax_fwd(x, out)
+    return out
+
+
 class SoftmaxBackward(ReductionBase):
     def __init__(self, dtype: Type[cutlass.Numeric], N: int):
         # 1 stage for computing dot product
@@ -372,7 +379,7 @@ class SoftmaxBackward(ReductionBase):

         # Compute dot product: dot = Σⱼ dy_j × y_j
         threads_per_row = tv_layout.shape[0][0]
-        dot = utils.row_reduce(
+        dot = row_reduce(
             dy * y,
             cute.ReductionOp.ADD,
             threads_per_row,
@@ -394,7 +401,8 @@ class SoftmaxBackward(ReductionBase):
         cute.copy(copy_atom_store, tdXrdX, tdXgdX, pred=tdXpdX)


-def _softmax_backward(dy: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+@torch.library.custom_op("quack::_softmax_backward", mutates_args={"dx"})
+def _softmax_backward(dy: torch.Tensor, y: torch.Tensor, dx: torch.Tensor) -> None:
     """Softmax backward pass.
     Args:
         dy: Upstream gradients tensor of shape (M, N)
@@ -409,8 +417,7 @@ def _softmax_backward(dy: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     assert dy.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
     assert y.dtype == dy.dtype, "dy and y must have same dtype"

-    M, N = dy.shape
-    dx = torch.empty_like(dy)
+    N = dy.size(1)
     dtype = torch2cute_dtype_map[dy.dtype]
     convert_from_dlpack = lambda tensor: (
         from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
@@ -427,23 +434,28 @@ def _softmax_backward(dy: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
             softmax_backward_op, dy_tensor, y_tensor, dx_tensor, current_stream
         )
     _softmax_backward.compile_cache[compile_key](dy_tensor, y_tensor, dx_tensor, current_stream)
-    return dx


 _softmax_backward.compile_cache = {}


+def softmax_bwd(dy: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    dx = torch.empty_like(dy)
+    _softmax_backward(dy, y, dx)
+    return dx
+
+
 class SoftmaxFunction(torch.autograd.Function):
     @staticmethod
     def forward(ctx, x):
-        y = _softmax_fwd(x)
+        y = softmax_fwd(x)
         ctx.save_for_backward(y)
         return y

     @staticmethod
     def backward(ctx, dy):
         (y,) = ctx.saved_tensors
-        dx = _softmax_backward(dy, y)
+        dx = softmax_bwd(dy, y)
         return dx
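Both softmax entry points now follow the same pattern: the kernel-facing function is registered as a PyTorch custom op that writes into a caller-allocated output (`mutates_args` declares exactly what it mutates), while a thin `softmax_fwd` / `softmax_bwd` wrapper owns the allocation and is what `SoftmaxFunction` calls. A minimal, self-contained sketch of that out-parameter pattern, assuming PyTorch 2.4+ for `torch.library.custom_op`; the `example::` namespace and the plain `torch.softmax` stand-in for the CUTE kernel are hypothetical:

```python
import torch


@torch.library.custom_op("example::softmax_fwd_out", mutates_args={"out"})
def softmax_fwd_out(x: torch.Tensor, out: torch.Tensor) -> None:
    # Stand-in for the compiled CUTE kernel launch: only writes into `out`.
    out.copy_(torch.softmax(x, dim=-1))


def softmax_fwd(x: torch.Tensor) -> torch.Tensor:
    # The wrapper owns the allocation and returns the result, mirroring the
    # split between quack::_softmax_fwd and softmax_fwd in the diff above.
    out = torch.empty_like(x)
    softmax_fwd_out(x, out)
    return out


x = torch.randn(4, 8)
assert torch.allclose(softmax_fwd(x), torch.softmax(x, dim=-1))
```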
 
@@ -51,7 +51,7 @@ from quack.tile_scheduler import (
     RasterOrderOption,
     TriangularTileScheduler,
 )
-from quack.reduction_base import torch2cute_dtype_map
+from quack.cute_dsl_utils import torch2cute_dtype_map

 # return PipelineStateWAdvance instead of PipelineState
 from quack.pipeline import make_pipeline_state
@@ -907,8 +907,11 @@ class HopperSymmetricGemmKernel:

         acc_shape = tiled_mma.partition_shape_C(cute.select(self.tile_shape_mnk, mode=[0, 1]))
         acc = cute.make_fragment(acc_shape, self.acc_dtype)
-        if const_expr(self.fp8_slow_accum):
-            acc_slow = cute.make_fragment(acc_shape, self.acc_dtype)
+        acc_slow = (
+            cute.make_fragment(acc_shape, self.acc_dtype)
+            if const_expr(self.fp8_slow_accum)
+            else None
+        )

         if const_expr(self.pingpong):
             if warp_group_idx == 0:
@@ -99,6 +99,7 @@ class TensorMapManagerSm90(TensorMapManager):
             for gmem_ptr, smem_ptr in zip(tensormap_gmem_ptr, tensormap_smem_ptr):
                 cute.nvgpu.cpasync.cp_fence_tma_desc_release(gmem_ptr, smem_ptr)
         else:
+            assert len(shapes) == len(orders) == len(tensormap_gmem_ptr)
             for gmem_ptr, shape, order in zip(tensormap_gmem_ptr, shapes, orders):
                 gmem_ptr_i64 = gmem_ptr.toint().ir_value()
                 llvm.inline_asm(
quack/tile_scheduler.py CHANGED
@@ -1,7 +1,7 @@
 # Copyright (c) 2025, Tri Dao.

 from typing import Tuple, Optional
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from enum import IntEnum

 import cutlass
@@ -11,30 +11,7 @@ from cutlass import Int32, Boolean, const_expr
 import quack.utils as utils
 from quack.fast_math import FastDivmod
 from quack.pipeline import PipelineStateWAdvance
-
-
-@dataclass
-class ParamsBase:
-    def __extract_mlir_values__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, cutlass.Constexpr)]
-        values, self._values_pos = [], []
-        for obj in non_constexpr_fields:
-            obj_values = cutlass.extract_mlir_values(obj)
-            values += obj_values
-            self._values_pos.append(len(obj_values))
-        return values
-
-    def __new_from_mlir_values__(self, values):
-        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
-        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, cutlass.Constexpr)}
-        non_constexpr_fields = {
-            n: f for n, f in all_fields.items() if not isinstance(f, cutlass.Constexpr)
-        }
-        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
-            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
-            values = values[n_items:]
-        return self.__class__(**non_constexpr_fields, **constexpr_fields)
+from quack.cute_dsl_utils import ArgumentsBase, ParamsBase


 class RasterOrderOption(IntEnum):
@@ -66,13 +43,24 @@ def get_raster_order_from_option(
     return raster_order


+# Grouping arguments together that should be passed to __call__
+@dataclass
+class TileSchedulerOptions(ArgumentsBase):
+    max_active_clusters: Int32
+    raster_order: cutlass.Constexpr[RasterOrderOption] = RasterOrderOption.Heuristic
+    max_swizzle_size: Int32 = Int32(8)
+    tile_count_semaphore: Optional[cute.Pointer] = None
+    batch_idx_permute: Optional[cute.Tensor] = None
+
+
 @dataclass
-class TileSchedulerArguments(ParamsBase):
+class TileSchedulerArguments(ArgumentsBase):
     problem_shape_ntile_mnl: cute.Shape
-    raster_order: RasterOrderOption
+    raster_order: cutlass.Constexpr[RasterOrderOption]
     group_size: Int32
     cluster_shape_mnk: cutlass.Constexpr[cute.Shape]
     tile_count_semaphore: Optional[cute.Pointer] = None
+    batch_idx_permute: Optional[cute.Tensor] = None
     is_persistent: cutlass.Constexpr[bool] = False


@@ -87,6 +75,7 @@ class TileScheduler:
         group_size_tail_divmod: FastDivmod
         num_clusters_in_group_divmod: FastDivmod
         tile_count_semaphore: Optional[cute.Pointer]
+        batch_idx_permute: Optional[cute.Tensor]
         cluster_shape_mn: cutlass.Constexpr[cute.Shape]
         is_persistent: cutlass.Constexpr[bool]

@@ -128,6 +117,7 @@ class TileScheduler:
             FastDivmod.create(group_size_tail if group_size_tail > 0 else 1),
             FastDivmod.create(num_clusters_in_group),
             args.tile_count_semaphore if const_expr(args.is_persistent) else None,
+            args.batch_idx_permute,
             cluster_shape_mn,
             args.is_persistent,
         )
@@ -256,7 +246,10 @@ class TileScheduler:
         bidx_in_cluster = cute.arch.block_in_cluster_idx()
         pid_m = cid_m * params.cluster_shape_mn[0] + bidx_in_cluster[0]
         pid_n = cid_n * params.cluster_shape_mn[1] + bidx_in_cluster[1]
-        tile_coord_mnkl = (pid_m, pid_n, None, bidz)
+        batch_idx = (
+            bidz if const_expr(params.batch_idx_permute is None) else params.batch_idx_permute[bidz]
+        )
+        tile_coord_mnkl = (pid_m, pid_n, None, batch_idx)
         if const_expr(not params.is_persistent):
             is_valid = self._num_tiles_executed == 0
         else:
@@ -267,10 +260,10 @@ class TileScheduler:
         return self.get_current_work(loc=loc, ip=ip)

     @cute.jit
-    def fetch_next_work(self, is_scheduler_warp: bool | Boolean, *, loc=None, ip=None):
+    def fetch_next_work(self, is_scheduler_warp: bool | Boolean = False, *, loc=None, ip=None):
         """is_scheduler_warp should only be true for one warp in the whole cluster"""
-        if const_expr(self.params.tile_count_semaphore is not None):
-            params = self.params
+        params = self.params
+        if const_expr(params.is_persistent and params.tile_count_semaphore is not None):
             current_work_linear_idx = self._current_work_linear_idx
             if is_scheduler_warp:
                 if cute.arch.lane_idx() == 0:
@@ -283,6 +276,38 @@ class TileScheduler:
                 current_work_linear_idx = cute.arch.shuffle_sync(current_work_linear_idx, 0)
             self._current_work_linear_idx = current_work_linear_idx

+    # We have to split broadcast_next_work and advance_to_next_work into two functions
+    # due to a bug in cute-dsl 4.2: https://github.com/NVIDIA/cutlass/issues/2647
+    @cute.jit
+    def broadcast_next_work(self, is_scheduler_warp: bool | Boolean = False, *, loc=None, ip=None):
+        """is_scheduler_warp should only be true for one warp in the whole cluster"""
+        params = self.params
+        if const_expr(params.is_persistent and params.tile_count_semaphore is not None):
+            current_work_linear_idx = self._current_work_linear_idx
+            if is_scheduler_warp:
+                self._scheduler_pipeline.producer_acquire(self._pipeline_state)
+                lane_idx = cute.arch.lane_idx()
+                if lane_idx < cute.size(params.cluster_shape_mn):
+                    # cute.printf("Producer bidx = {}, tidx = {}, after empty wait, idx = {}", bidx, tidx, current_work_linear_idx)
+                    if const_expr(cute.size(params.cluster_shape_mn) == 1):
+                        self._tile_count[self._pipeline_state.index] = current_work_linear_idx
+                        self._scheduler_pipeline.producer_commit(self._pipeline_state)
+                    else:
+                        peer_cta_rank_in_cluster = lane_idx
+                        mbar_ptr = self._scheduler_pipeline.producer_get_barrier(
+                            self._pipeline_state
+                        )
+                        cute.arch.mbarrier_arrive_and_expect_tx(
+                            mbar_ptr, 4, peer_cta_rank_in_cluster
+                        )
+                        utils.store_shared_remote(
+                            val=current_work_linear_idx,
+                            smem_ptr=self._tile_count.iterator + self._pipeline_state.index,
+                            mbar_ptr=mbar_ptr,
+                            peer_cta_rank_in_cluster=peer_cta_rank_in_cluster,
+                        )
+                # cute.printf("Producer bidx = {}, tidx = {}, after full arrive", bidx, tidx)
+

     @cute.jit
     def advance_to_next_work(
@@ -300,32 +325,10 @@ class TileScheduler:
         if const_expr(params.tile_count_semaphore is None):  # Static persistent
             self._current_work_linear_idx += advance_count * Int32(num_persistent_clusters)
         else:  # Dynamic persistent
-            self._pipeline_state.advance_iters(advance_count - 1)
+            if const_expr(advance_count > 1):
+                self._pipeline_state.advance_iters(advance_count - 1)
             current_work_linear_idx = self._current_work_linear_idx
-            if is_scheduler_warp:
-                self._scheduler_pipeline.producer_acquire(self._pipeline_state)
-                lane_idx = cute.arch.lane_idx()
-                if lane_idx < cute.size(params.cluster_shape_mn):
-                    # cute.printf("Producer bidx = {}, tidx = {}, after empty wait, idx = {}", bidx, tidx, current_work_linear_idx)
-                    if const_expr(cute.size(params.cluster_shape_mn) == 1):
-                        self._tile_count[self._pipeline_state.index] = current_work_linear_idx
-                        self._scheduler_pipeline.producer_commit(self._pipeline_state)
-                    else:
-                        peer_cta_rank_in_cluster = lane_idx
-                        mbar_ptr = self._scheduler_pipeline.producer_get_barrier(
-                            self._pipeline_state
-                        )
-                        cute.arch.mbarrier_arrive_and_expect_tx(
-                            mbar_ptr, 4, peer_cta_rank_in_cluster
-                        )
-                        utils.store_shared_remote(
-                            val=current_work_linear_idx,
-                            smem_ptr=self._tile_count.iterator + self._pipeline_state.index,
-                            mbar_ptr=mbar_ptr,
-                            peer_cta_rank_in_cluster=peer_cta_rank_in_cluster,
-                        )
-                # cute.printf("Producer bidx = {}, tidx = {}, after full arrive", bidx, tidx)
-            else:
+            if not is_scheduler_warp:
                 # if tidx % 64 == 0: cute.printf("bidx = {},tidx = {}, before full wait, idx = {}", bidx, tidx, current_work_linear_idx)
                 self._scheduler_pipeline.consumer_wait(self._pipeline_state)
                 # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after full wait, idx = {}", bidx, tidx, current_work_linear_idx)
@@ -580,7 +583,7 @@ class VarlenMTileSchedulerArguments(ParamsBase):
     cu_seqlens_m: cute.Tensor
     raster_order: cutlass.Constexpr[RasterOrderOption]
     group_size: Int32
-    tile_shape_mnk: cutlass.Constexpr[cute.Shape]
+    tile_shape_mn: cutlass.Constexpr[cute.Shape]
     cluster_shape_mnk: cutlass.Constexpr[cute.Shape]
     tile_count_semaphore: Optional[cute.Pointer] = None
     is_persistent: cutlass.Constexpr[bool] = False
@@ -609,7 +612,6 @@ class VarlenMTileScheduler(TileScheduler):
     ) -> "VarlenMTileScheduler.Params":
         assert args.cluster_shape_mnk[2] == 1
         cluster_shape_mn = const_expr(cute.select(args.cluster_shape_mnk, mode=[0, 1]))
-        tile_shape_mn = const_expr(cute.select(args.tile_shape_mnk, mode=[0, 1]))
         # problem_shape_ntile_mnl[0] will be None for VarlenM
         problem_shape_ntile_mn = cute.select(args.problem_shape_ntile_mnl, mode=[0, 1])
         problem_shape_ncluster_mn = (
@@ -657,7 +659,7 @@ class VarlenMTileScheduler(TileScheduler):
             FastDivmod.create(num_clusters_in_group)
             if num_clusters_in_group is not None
             else None,
-            tile_shape_mn,
+            args.tile_shape_mn,
             args.tile_count_semaphore if const_expr(args.is_persistent) else None,
             cluster_shape_mn,
             args.is_persistent,
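Besides the dynamic-persistent rework (the `fetch_next_work` / `broadcast_next_work` split forced by the cute-dsl 4.2 bug linked above), the scheduler gains an optional `batch_idx_permute` tensor that is threaded from the arguments into `Params` and consulted when building `tile_coord_mnkl`. A rough host-side sketch of the lookup it performs; the function name and the use of a CPU `torch.Tensor` in place of a device-side `cute.Tensor` are illustrative only:

```python
from typing import Optional

import torch


def tile_batch_index(bidz: int, batch_idx_permute: Optional[torch.Tensor] = None) -> int:
    # Without a permutation, the L (batch) coordinate is the raw block index;
    # with one, block bidz works on batch batch_idx_permute[bidz].
    if batch_idx_permute is None:
        return bidz
    return int(batch_idx_permute[bidz])


perm = torch.tensor([2, 0, 1], dtype=torch.int32)
print([tile_batch_index(b) for b in range(3)])        # [0, 1, 2]
print([tile_batch_index(b, perm) for b in range(3)])  # [2, 0, 1]
```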
quack/topk.py CHANGED
@@ -12,7 +12,7 @@ from cutlass.cute.runtime import from_dlpack
 from cutlass import const_expr

 import quack.utils as utils
-from quack.reduction_base import torch2cute_dtype_map
+from quack.cute_dsl_utils import torch2cute_dtype_map
 from quack.sort.bitonic_sort import bitonic_topk


@@ -133,6 +133,7 @@ class TopK:

         threads_per_row = tv_layout.shape[0][0]
         topk_vals = bitonic_topk(tXrX_f32, self.k, warp_width=threads_per_row)
+
         # Extract indices and clean values
         topk_vals_u32 = cute.recast_tensor(topk_vals, cutlass.Uint32)
         topk_indices = cute.make_fragment(self.k, cutlass.Int32)
@@ -166,7 +167,8 @@ class TopK:
             cute.autovec_copy(topk_indices_store[None, i], mIndices_store[None, i])


-def _topk_fwd(x: torch.Tensor, k: int):
+@torch.library.custom_op("quack::_topk_fwd", mutates_args={"values", "indices"})
+def _topk_fwd(x: torch.Tensor, k: int, values: torch.Tensor, indices: torch.Tensor) -> None:
     """Top-k forward pass.
     Args:
         x: Input tensor of shape (M, N)
@@ -179,9 +181,7 @@ def _topk_fwd(x: torch.Tensor, k: int):
     assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
     assert k > 0 and k <= x.shape[1], "k must be positive and <= N"

-    M, N = x.shape
-    values = torch.empty((M, k), dtype=x.dtype, device=x.device)
-    indices = torch.empty((M, k), dtype=torch.int32, device=x.device)
+    N = x.size(1)

     dtype = torch2cute_dtype_map[x.dtype]
     convert_from_dlpack = lambda tensor: (
@@ -202,8 +202,6 @@ def _topk_fwd(x: torch.Tensor, k: int):
     )
     _topk_fwd.compile_cache[compile_key](x_tensor, values_tensor, indices_tensor, current_stream)

-    return values, indices
-

 _topk_fwd.compile_cache = {}

@@ -218,4 +216,12 @@ def topk(x: torch.Tensor, k: int):
     Returns:
         Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
     """
-    return _topk_fwd(x, k)
+
+    M = x.size(0)
+
+    values = torch.empty((M, k), dtype=x.dtype, device=x.device)
+    indices = torch.empty((M, k), dtype=torch.int32, device=x.device)
+
+    _topk_fwd(x, k, values, indices)
+
+    return values, indices
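For callers, the only visible change is that the `(M, k)` outputs are now allocated in the public `topk` wrapper and passed into the mutating custom op. A short usage sketch; shapes and dtypes follow the docstring above, and the CUDA/bfloat16 choices are just one combination the asserts in `_topk_fwd` allow:

```python
import torch

from quack.topk import topk

x = torch.randn(32, 1024, device="cuda", dtype=torch.bfloat16)
values, indices = topk(x, k=8)

# Values keep the input dtype; indices are int32, as allocated by the wrapper.
assert values.shape == (32, 8) and values.dtype == torch.bfloat16
assert indices.shape == (32, 8) and indices.dtype == torch.int32
```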