quack-kernels 0.1.10__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +8 -1
- quack/activation.py +288 -0
- quack/autotuner.py +310 -0
- quack/cross_entropy.py +325 -175
- quack/cute_dsl_utils.py +119 -0
- quack/dense_gemm_sm100.py +2562 -0
- quack/dense_gemm_sm90.py +1657 -842
- quack/fast_math.py +80 -0
- quack/gemm_act_sm90.py +368 -0
- quack/gemm_config.py +69 -0
- quack/gemm_dact_sm90.py +150 -0
- quack/gemm_interface.py +569 -0
- quack/gemm_wrapper_utils.py +158 -0
- quack/layernorm.py +5 -3
- quack/linear.py +240 -0
- quack/linear_cross_entropy.py +275 -0
- quack/mlp.py +74 -0
- quack/pipeline.py +151 -0
- quack/reduce.py +241 -0
- quack/reduction_base.py +2 -11
- quack/rmsnorm.py +583 -231
- quack/softmax.py +27 -15
- quack/sort/bitonic_sort.py +126 -0
- quack/sort/generate_sorting_networks.py +326 -0
- quack/sort/sorting_networks.py +120 -0
- quack/sort/utils.py +31 -0
- quack/symmetric_dense_gemm_sm90.py +2091 -0
- quack/tensormap_manager.py +115 -0
- quack/tile_scheduler.py +937 -0
- quack/topk.py +227 -0
- quack/utils.py +203 -230
- quack/varlen_utils.py +22 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/METADATA +2 -2
- quack_kernels-0.2.0.dist-info/RECORD +37 -0
- quack_kernels-0.1.10.dist-info/RECORD +0 -13
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/WHEEL +0 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/top_level.txt +0 -0
quack/mlp.py
ADDED
@@ -0,0 +1,74 @@
# Copyright (c) 2025, Tri Dao
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from quack.linear import linear_act_func, act_linear_func


def mlp_func(x, weight1, weight2, activation: str, fuse_grad_accum=False, tuned=True):
    preact, postact = linear_act_func(
        x,
        weight1,
        activation,
        store_preact=torch.is_grad_enabled(),
        fuse_grad_accum=fuse_grad_accum,
        tuned=tuned,
    )
    out = act_linear_func(
        preact,
        weight2,
        postact,
        activation=activation,
        fuse_grad_accum=fuse_grad_accum,
        tuned=tuned,
    )
    return out


class MLP(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        bias1=False,
        bias2=False,
        activation="gelu",
        device=None,
        dtype=None,
        fuse_grad_accum: bool = False,
        tuned: bool = True,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        out_features = out_features if out_features is not None else in_features
        hidden_features = hidden_features if hidden_features is not None else 4 * in_features
        self.activation = activation
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
        self.fuse_grad_accum = fuse_grad_accum
        self.tuned = tuned

    def forward(self, input: Tensor) -> Tensor:
        if (
            self.fc1.bias is None
            and self.fc2.bias is None
            and input.is_cuda
            and input.stride(-1) == 1
            and self.fc1.in_features % 8 == 0
            and self.fc1.out_features % 8 == 0
            and self.fc2.out_features % 8 == 0
        ):
            return mlp_func(
                input,
                self.fc1.weight,
                self.fc2.weight,
                activation=self.activation,
                fuse_grad_accum=self.fuse_grad_accum,
                tuned=self.tuned,
            )
        else:
            y = self.fc1(input)
            return self.fc2(F.silu(y[..., ::2]) * y[..., 1::2])
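Usage note (not part of the diff): a minimal sketch of how the new MLP module is meant to be called, assuming a CUDA device and the quack CuTe-DSL dependencies are installed; the shapes, dtype, and activation below are illustrative, and which activations the fused path accepts is determined by quack.linear.

import torch
from quack.mlp import MLP

# Defaults (bias1=bias2=False) plus a contiguous CUDA input whose dims are
# multiples of 8 route forward() through the fused mlp_func path rather than
# the plain fc1/fc2 fallback.
mlp = MLP(in_features=1024, hidden_features=4096, activation="gelu",
          device="cuda", dtype=torch.bfloat16)
x = torch.randn(8, 512, 1024, device="cuda", dtype=torch.bfloat16)
y = mlp(x)  # (8, 512, 1024) since out_features defaults to in_features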
quack/pipeline.py
ADDED
@@ -0,0 +1,151 @@
# Copyright (c) 2025, Tri Dao.

from typing import Optional
from dataclasses import dataclass

import cutlass.cute as cute
from cutlass.cutlass_dsl import Boolean, Int32, if_generate
from cutlass.pipeline import CooperativeGroup, PipelineOp, pipeline_init_wait
from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType


class PipelineStateWAdvance(PipelineState):
    def advance_iters(self, num_iterations: Int32):
        self._count += Int32(num_iterations)
        new_index = self._index + Int32(num_iterations)
        # How many times did we cross the stages boundary
        num_crossings = new_index // self.stages
        self._phase ^= num_crossings
        self._index = new_index % self.stages

    # This can be overridden by derived classes
    def __new_from_mlir_values__(self, values):
        return PipelineStateWAdvance(
            self.stages, Int32(values[0]), Int32(values[1]), Int32(values[2])
        )


def make_pipeline_state(type: PipelineUserType, stages: int):
    """
    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
    """
    if type is PipelineUserType.Producer:
        return PipelineStateWAdvance(
            stages,
            Int32(0),
            Int32(0),
            Int32(1),
        )
    elif type is PipelineUserType.Consumer:
        return PipelineStateWAdvance(
            stages,
            Int32(0),
            Int32(0),
            Int32(0),
        )
    else:
        assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."


@dataclass(frozen=True)
class PipelineTmaCpAsync(PipelineTmaAsync):
    """
    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers
    """

    @staticmethod
    def create(
        *,
        num_stages: int,
        producer_group: CooperativeGroup,
        consumer_group: CooperativeGroup,
        tx_count: int,
        barrier_storage: cute.Pointer = None,
        cta_layout_vmnk: Optional[cute.Layout] = None,
        tidx: Optional[Int32] = None,
    ):
        """
        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
        :type barrier_storage: cute.Pointer
        :param num_stages: Number of buffer stages for this pipeline
        :type num_stages: Int32
        :param producer_group: CooperativeGroup for the producer agent
        :type producer_group: CooperativeGroup
        :param consumer_group: CooperativeGroup for the consumer agent
        :type consumer_group: CooperativeGroup
        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
        :type tx_count: int
        :param cta_layout_vmnk: Layout of the cluster shape
        :type cta_layout_vmnk: cute.Layout | None
        :param tidx: thread index to consumer async threads
        :type tidx: Int32 | None
        """
        if not isinstance(barrier_storage, cute.Pointer):
            raise ValueError(
                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
            )

        producer_type = PipelineOp.TmaLoad
        consumer_type = PipelineOp.AsyncThread

        producer = (producer_type, producer_group)
        consumer = (consumer_type, consumer_group)

        sync_object_full = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8), num_stages, producer, tx_count
        )
        sync_object_empty = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
        )
        if tidx is None:
            tidx, _, _ = cute.arch.thread_idx()
        if cta_layout_vmnk is None:
            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
        (
            dst_rank,
            is_signalling_thread,
        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
            dst_rank = None
        else:
            dst_rank = dst_rank

        producer_mask = None

        pipeline_init_wait(cta_layout_vmnk)

        return PipelineTmaCpAsync(
            sync_object_full,
            sync_object_empty,
            num_stages,
            producer_mask,
            dst_rank,
            is_signalling_thread,
        )

    def producer_acquire(
        self,
        state: PipelineState,
        try_acquire_token: Optional[Boolean] = None,
        is_tma_warp: Optional[Boolean] = True,
    ):
        """
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier.
        """
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(state.index, state.phase),
        )
        # This is the difference between this and PipelineTmaAsync: we could have multiple
        # warps calling this, but only 1 warp should do the arrive on the full barrier
        if_generate(
            is_tma_warp,
            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
        )

    def producer_commit(self, state: PipelineState):
        """
        We need the mbarrier to track the completion of cp.async
        """
        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
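Note (not part of the diff): the index/phase bookkeeping added by PipelineStateWAdvance.advance_iters can be illustrated with plain Python integers. This sketch mirrors the arithmetic only: the real class operates on CuTe-DSL Int32 values, and it XORs the raw crossing count where the sketch takes its parity to keep the phase a 0/1 bit.

def advance(index, phase, count, stages, num_iterations):
    # Bump the absolute iteration count, wrap the stage index, and flip the
    # phase once per stage-boundary crossing.
    count += num_iterations
    new_index = index + num_iterations
    num_crossings = new_index // stages
    phase ^= num_crossings & 1
    index = new_index % stages
    return index, phase, count

# Advancing 5 iterations through a 4-stage pipeline crosses the boundary once,
# so the index wraps to 1 and the phase flips from 0 to 1.
print(advance(index=0, phase=0, count=0, stages=4, num_iterations=5))  # (1, 1, 5)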
quack/reduce.py
ADDED
@@ -0,0 +1,241 @@
# Copyright (c) 2025, Tri Dao.

import math
import operator
from typing import Callable, Optional

import cutlass
import cutlass.cute as cute
from cutlass import Float32

import quack.utils as utils


@cute.jit
def warp_reduce(
    val: cute.TensorSSA | cute.Numeric,
    op: Callable,
    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
) -> cute.TensorSSA | cute.Numeric:
    if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
        res = cute.make_fragment(val.shape, val.dtype)
        res.store(val)
        for i in cutlass.range_constexpr(cute.size(val.shape)):
            res[i] = warp_reduce(res[i], op, width)
        return res.load()
    else:
        for i in cutlass.range_constexpr(int(math.log2(width))):
            val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
        return val


@cute.jit
def block_reduce(
    val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, init_val: cute.Numeric = 0.0
) -> cute.Numeric:
    """reduction_buffer has shape (num_warps / warps_per_row, warps_per_row)"""
    lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
    warps_per_row = cute.size(reduction_buffer.shape[1])
    row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
    if lane_idx == 0:
        reduction_buffer[row_idx, col_idx] = val
    cute.arch.barrier()
    block_reduce_val = init_val
    if lane_idx < warps_per_row:
        block_reduce_val = reduction_buffer[row_idx, lane_idx]
    return warp_reduce(block_reduce_val, op)


@cute.jit
def cluster_reduce(
    val: cute.Numeric,
    op: Callable,
    reduction_buffer: cute.Tensor,
    mbar_ptr: cute.Pointer,
    init_val: cute.Numeric = 0.0,
    phase: Optional[cutlass.Int32] = None,
) -> cute.Numeric:
    """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
    cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
    lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
    rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
    row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
    if warp_idx == 0:
        with cute.arch.elect_one():
            num_warps = rows_per_block * warps_per_row
            cute.arch.mbarrier_arrive_and_expect_tx(
                mbar_ptr,
                num_warps * cluster_n * reduction_buffer.element_type.width // 8,
            )
    if lane_idx < cluster_n:
        utils.store_shared_remote(
            val,
            utils.elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
            mbar_ptr,
            peer_cta_rank_in_cluster=lane_idx,
        )
    cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
    block_reduce_val = init_val
    num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
    for i in cutlass.range_constexpr(num_iter):
        idx = lane_idx + i * cute.arch.WARP_SIZE
        if idx < cute.size(reduction_buffer, mode=[1]):
            block_reduce_val = op(block_reduce_val, reduction_buffer[row_idx, idx])
    return warp_reduce(block_reduce_val, op)


@cute.jit
def block_or_cluster_reduce(
    val: cute.Numeric,
    op: Callable,
    reduction_buffer: cute.Tensor,
    mbar_ptr: Optional[cute.Pointer],
    phase: Optional[cutlass.Int32] = None,
    init_val: cute.Numeric = 0.0,
) -> cute.Numeric:
    """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
    if cutlass.const_expr(mbar_ptr is None):
        return block_reduce(val, op, reduction_buffer, init_val=init_val)
    else:
        return cluster_reduce(val, op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val)


@cute.jit
def row_reduce(
    x: cute.TensorSSA | cute.Numeric,
    op: cute.ReductionOp,
    threads_per_row: cutlass.Constexpr[int],
    reduction_buffer: Optional[cute.Tensor] = None,
    mbar_ptr: Optional[cute.Pointer] = None,
    phase: Optional[cutlass.Int32] = None,
    init_val: cute.Numeric = 0.0,
    hook_fn: Optional[Callable] = None,
) -> cute.Numeric:
    """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
    if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
        val = x.reduce(op, init_val=init_val, reduction_profile=0)
    else:
        val = x
    warp_op = {
        cute.ReductionOp.ADD: operator.add,
        cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
        cute.ReductionOp.MIN: min,
        cute.ReductionOp.MUL: operator.mul,
    }[op]
    val = warp_reduce(
        val,
        warp_op,
        width=min(threads_per_row, cute.arch.WARP_SIZE),
    )
    if cutlass.const_expr(hook_fn is not None):
        hook_fn()
    if cutlass.const_expr(reduction_buffer is not None):
        warps_per_row, cluster_n = reduction_buffer.shape[1]
        assert cluster_n == 1 or mbar_ptr is not None, (
            "mbar_ptr must be provided for cluster reduction"
        )
        if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
            val = block_or_cluster_reduce(
                val, warp_op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val
            )
    return val


@cute.jit
def online_softmax_reduce(
    x: cute.TensorSSA,
    threads_per_row: cutlass.Constexpr[int],
    reduction_buffer: Optional[cute.Tensor] = None,
    mbar_ptr: Optional[cute.Pointer] = None,
    hook_fn: Optional[Callable] = None,
    phase: Optional[cutlass.Int32] = None,
    return_exp_x: bool = False,
) -> [Float32, Float32, Optional[cute.TensorSSA]]:
    assert x.dtype == Float32, "x must be of type Float32"
    """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n), 2)"""
    max_x = warp_reduce(
        x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
        cute.arch.fmax,
        width=min(threads_per_row, cute.arch.WARP_SIZE),
    )
    log2_e = math.log2(math.e)
    exp_x = utils.exp2f(x * log2_e - (max_x * log2_e))
    # exp_x = exp2f((x - max_x) * log2_e)
    sum_exp_x = warp_reduce(
        exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
        operator.add,
        width=min(threads_per_row, cute.arch.WARP_SIZE),
    )
    if cutlass.const_expr(hook_fn is not None):
        hook_fn()
    if cutlass.const_expr(reduction_buffer is not None):
        rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
        assert cluster_n == 1 or mbar_ptr is not None, (
            "mbar_ptr must be provided for cluster reduction"
        )
        if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
            assert reduction_buffer.element_type == cutlass.Int64, (
                "reduction_buffer must be of type cute.Int64"
            )
            lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
            row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
            if cutlass.const_expr(mbar_ptr is None):
                if lane_idx == 0:
                    reduction_buffer[row_idx, col_idx] = utils.f32x2_to_i64(max_x, sum_exp_x)
                cute.arch.barrier()
                max_x_single_warp = -Float32.inf
                sum_exp_x = 0.0
                if lane_idx < warps_per_row:
                    max_x_single_warp, sum_exp_x = utils.i64_to_f32x2(
                        reduction_buffer[row_idx, lane_idx]
                    )
                max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
                sum_exp_x *= utils.exp2f((max_x_single_warp - max_x_final) * log2_e)
                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
                if cutlass.const_expr(return_exp_x):
                    exp_x *= utils.exp2f((max_x - max_x_final) * log2_e)
                max_x = max_x_final
            else:
                cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
                if warp_idx == 0:
                    with cute.arch.elect_one():
                        num_warps = rows_per_block * warps_per_row
                        cute.arch.mbarrier_arrive_and_expect_tx(
                            mbar_ptr,
                            num_warps * cluster_n * reduction_buffer.element_type.width // 8,
                        )
                if lane_idx < cluster_n:
                    utils.store_shared_remote(
                        utils.f32x2_to_i64(max_x, sum_exp_x),
                        utils.elem_pointer(
                            reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))
                        ),
                        mbar_ptr,
                        peer_cta_rank_in_cluster=lane_idx,
                    )
                cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
                num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
                max_x_single_warp = cute.make_fragment(num_iter, Float32)
                max_x_single_warp.fill(-Float32.inf)
                sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
                sum_exp_x_single_warp.fill(0.0)
                for i in cutlass.range_constexpr(num_iter):
                    idx = lane_idx + i * cute.arch.WARP_SIZE
                    if idx < cute.size(reduction_buffer, mode=[1]):
                        max_x_single_warp[i], sum_exp_x_single_warp[i] = utils.i64_to_f32x2(
                            reduction_buffer[row_idx, idx]
                        )
                max_x_final = max_x_single_warp.load().reduce(
                    cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0
                )
                max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
                sum_exp_x = 0.0
                for i in cutlass.range_constexpr(num_iter):
                    sum_exp_x += sum_exp_x_single_warp[i] * utils.exp2f(
                        (max_x_single_warp[i] - max_x_final) * log2_e
                    )
                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
                if cutlass.const_expr(return_exp_x):
                    exp_x *= utils.exp2f((max_x - max_x_final) * log2_e)
                max_x = max_x_final
    return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
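Note (not part of the diff): the cross-warp merge in online_softmax_reduce is the standard online-softmax rescaling. The sketch below shows only the math in plain Python; the kernel itself uses exp2f with a log2(e) factor instead of exp and packs each (max, sum) pair into an Int64 for the reduction buffer.

import math

def merge_online_softmax(partials):
    # Each partial is (m_i, s_i) with s_i = sum_j exp(x_j - m_i) over that warp's slice.
    # Rescale every s_i to the global maximum before summing.
    m_final = max(m for m, _ in partials)
    s_final = sum(s * math.exp(m - m_final) for m, s in partials)
    return m_final, s_final

xs = [[1.0, 2.0, 3.0], [0.5, 4.0]]
partials = [(max(row), sum(math.exp(v - max(row)) for v in row)) for row in xs]
m, s = merge_online_softmax(partials)
# Matches the direct computation over the concatenated row.
flat = [v for row in xs for v in row]
assert abs(s - sum(math.exp(v - max(flat)) for v in flat)) < 1e-12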
quack/reduction_base.py
CHANGED
@@ -1,19 +1,11 @@
 # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
 
-import torch
 from typing import Type, Tuple, Optional
 
 import cutlass
 import cutlass.cute as cute
 
 
-torch2cute_dtype_map = {
-    torch.float16: cutlass.Float16,
-    torch.bfloat16: cutlass.BFloat16,
-    torch.float32: cutlass.Float32,
-}
-
-
 class ReductionBase:
     def __init__(
         self, dtype: Type[cutlass.Numeric], N: int, stage: int, reduction_dtype=cutlass.Float32
@@ -32,9 +24,8 @@ class ReductionBase:
     def _get_num_threads(self):
         return 128 if self.N <= 16384 else 256
 
-    def _get_tv_layout(self):
-        copy_bits = 128
-        vecsize = copy_bits // self.dtype.width
+    def _get_tv_layout(self, num_copy_bits=128):
+        vecsize = num_copy_bits // self.dtype.width
         assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
         num_threads = self._get_num_threads()
         assert num_threads % cute.arch.WARP_SIZE == 0