quack-kernels 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/lse.py ADDED
@@ -0,0 +1,62 @@
+# Copyright (c) 2025, Tri Dao.
+# TODO: we probably don't need this kernel, just use torch.logsumexp
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _lse_kernel(
+    lse_ptr,
+    logits_ptr,
+    n_rows,
+    n_cols,
+    logits_row_stride,
+    logits_col_stride,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    row_start = tl.program_id(0) * BLOCK_SIZE_M
+    rows = row_start + tl.arange(0, BLOCK_SIZE_M)
+    cols = tl.arange(0, BLOCK_SIZE_N)
+    logits = tl.load(
+        logits_ptr + rows[:, None] * logits_row_stride + cols[None, :] * logits_col_stride,
+        mask=(rows[:, None] < n_rows) & (cols[None, :] < n_cols),
+        other=-float("inf"),
+    ).to(tl.float32)
+    m = tl.max(logits, 1)
+    lse = tl.log(tl.sum(tl.exp(logits - m[:, None]), 1)) + m
+    tl.store(lse_ptr + rows, lse, mask=rows < n_rows)
+
+
+def logsumexp(logits):
+    n_rows, n_cols = logits.shape
+    BLOCK_SIZE_M = 32 if logits.stride(1) != 1 else 1
+    MAX_BLOCK_SIZE = 64 * 1024
+    # BLOCK_SIZE_N = min(triton.next_power_of_2(n_cols), MAX_BLOCK_SIZE // BLOCK_SIZE_M)
+    BLOCK_SIZE_N = triton.next_power_of_2(n_cols)
+    assert (
+        BLOCK_SIZE_M * BLOCK_SIZE_N <= MAX_BLOCK_SIZE
+    ), f"Only support max dimension {MAX_BLOCK_SIZE // BLOCK_SIZE_M}"
+    num_warps = (
+        4
+        if BLOCK_SIZE_N < 2048
+        else (8 if BLOCK_SIZE_N < 8192 else (16 if BLOCK_SIZE_N < 128 * 1024 else 32))
+    )
+    lse = torch.empty(n_rows, dtype=torch.float, device=logits.device)
+    # Need this, otherwise Triton tries to launch from cuda:0 and we get
+    # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
+    with torch.cuda.device(logits.device.index):
+        _lse_kernel[(triton.cdiv(n_rows, BLOCK_SIZE_M),)](
+            lse,
+            logits,
+            n_rows,
+            n_cols,  # shapes
+            logits.stride(0),  # strides
+            logits.stride(1),
+            BLOCK_SIZE_M=BLOCK_SIZE_M,  # constants
+            BLOCK_SIZE_N=BLOCK_SIZE_N,  # constants
+            num_warps=num_warps,
+        )
+    return lse
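
Note: as the TODO at the top of the file says, the kernel computes a row-wise logsumexp in float32, so it can be cross-checked against torch.logsumexp. A minimal sanity-check sketch (an illustration, not part of the package; it assumes a CUDA device and imports logsumexp from quack.lse as defined above):

import torch
from quack.lse import logsumexp

logits = torch.randn(128, 1000, device="cuda", dtype=torch.bfloat16)
lse = logsumexp(logits)                        # (128,) float32, one value per row
ref = torch.logsumexp(logits.float(), dim=-1)  # reference computed in float32
torch.testing.assert_close(lse, ref, rtol=1e-3, atol=1e-3)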
quack/mlp.py ADDED
@@ -0,0 +1,204 @@
+# Copyright (c) 2025, Tri Dao
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.amp import custom_fwd, custom_bwd
+
+from einops import rearrange
+
+from gemm_cublas import gemm as gemm_cb, gemm_add_ as gemm_add_cb_
+# from gemm_cublas.interface import gemm_tuned as gemm_cb, gemm_add_tuned_ as gemm_add_cb_
+
+from quack import gemm, gemm_swiglu, gemm_dswiglu  # TODO: implement these
+
+
+class MLPSwiGLUFunc(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(device_type="cuda")
+    def forward(ctx, x, weight1, weight2, fuse_grad_accum=False):
+        """
+        x: (..., in_features)
+        weight1: (2 * intermediate_features, in_features)
+        weight2: (out_features, intermediate_features)
+        out: (..., out_features)
+        Note that we do swiglu on the even and odd indices of the intermediate output,
+        i.e. silu(y[..., ::2]) * y[..., 1::2].
+        This is different from the usual swiglu implementation that does: y1, y2 = y.chunk(2, dim=-1); silu(y1) * y2
+        """
+        needs_weight1_grad = weight1.requires_grad
+        needs_weight2_grad = weight2.requires_grad
+        needs_input_grad = x.requires_grad
+        ctx.weight1_dtype = weight1.dtype
+        ctx.weight2_dtype = weight2.dtype
+        autocast_dtype = torch.get_autocast_dtype("cuda")
+        if torch.is_autocast_enabled():
+            x = x.to(dtype=autocast_dtype)
+        weight1_og = weight1
+        weight2_og = weight2
+        if torch.is_autocast_enabled():
+            weight1 = weight1.to(dtype=autocast_dtype)
+            weight2 = weight2.to(dtype=autocast_dtype)
+        batch_shape = x.shape[:-1]
+        x = x.reshape(-1, x.shape[-1])
+        # don't need preact if not computing gradient
+        store_preact = needs_input_grad or needs_weight1_grad or needs_weight2_grad
+        # (batch, inter_dim) & (batch, 2 * inter_dim)
+        y, preact = gemm_swiglu(x, weight1.T, store_preact=store_preact)
+        # out = F.linear(y, weight2)
+        out = gemm(y, weight2.T)
+        if not needs_input_grad:
+            weight1, weight1_og = None, None
+        if not needs_weight1_grad:
+            x = None
+        if not needs_input_grad and not needs_weight1_grad and not needs_weight2_grad:
+            weight2, weight2_og = None, None
+            preact = None
+        ctx.save_for_backward(
+            x,
+            preact,
+            weight1,
+            weight2,
+            *((weight1_og, weight2_og) if fuse_grad_accum else (None, None)),
+        )
+        ctx.fuse_grad_accum = fuse_grad_accum
+        return out.reshape(*batch_shape, out.shape[-1])
+
+    @staticmethod
+    @custom_bwd(device_type="cuda")
+    def backward(ctx, dout):
+        """
+        dout: (..., out_features)
+        """
+        if not torch.compiler.is_dynamo_compiling():
+            assert dout.stride(-1) == 1
+        # weight1_og and weight2_og are None if not ctx.fuse_grad_accum
+        x, preact, weight1, weight2, weight1_og, weight2_og = ctx.saved_tensors
+        batch_shape = dout.shape[:-1]
+        dout = dout.reshape(-1, dout.shape[-1])
+        if (
+            not ctx.needs_input_grad[0]
+            and not ctx.needs_input_grad[1]
+            and not ctx.needs_input_grad[2]
+        ):
+            return (None,) * 4
+        assert preact is not None
+        # (batch, 2 * inter_dim) and (batch, inter_dim)
+        # dpreact, y = gemm_dswiglu(dout, weight2, preact)
+        dpreact, y = gemm_dswiglu(dout, weight2, preact, sm_carveout=16)
+        if ctx.needs_input_grad[2]:
+            # fuse_grad_accum is not compatible with torch.compile
+            if not ctx.fuse_grad_accum or weight2_og.grad is None or torch.compiler.is_compiling():
+                dweight2 = gemm_cb(dout.T, y, out_dtype=ctx.weight2_dtype)
+                # dweight2 = gemm_cb(dout.T, y, out_dtype=ctx.weight2_dtype, sm_carveout=16)
+            else:
+                # print("Using fuse grad accum in MLP 2", dout.shape, y.shape, weight2_og.grad.shape)
+                gemm_add_cb_(dout.T, y, weight2_og.grad)
+                # gemm_add_cb_(dout.T, y, weight2_og.grad, sm_carveout=16)
+                dweight2 = weight2_og.grad
+                weight2_og.grad = (
+                    None  # So that pytorch doesn't add dweight to weight2_og.grad again
+                )
+        else:
+            dweight2 = None
+        if ctx.needs_input_grad[0]:
+            dx = dpreact @ weight1  # (batch, in_features)
+            # dx = gemm(dpreact, weight1)  # (batch, in_features)
+            dx = dx.reshape(*batch_shape, dx.shape[-1])
+        else:
+            dx = None
+        if ctx.needs_input_grad[1]:
+            # fuse_grad_accum is not compatible with torch.compile
+            if not ctx.fuse_grad_accum or weight1_og.grad is None or torch.compiler.is_compiling():
+                dweight1 = gemm_cb(dpreact.T, x, out_dtype=ctx.weight1_dtype)
+            else:
+                # print("Using fuse grad accum in MLP 1", dpreact.shape, x.shape, weight1_og.grad.shape)
+                gemm_add_cb_(dpreact.T, x, weight1_og.grad)
+                dweight1 = weight1_og.grad
+                weight1_og.grad = (
+                    None  # So that pytorch doesn't add dweight to weight1_og.grad again
+                )
+        else:
+            dweight1 = None
+        return dx, dweight1, dweight2, None
+
+
+def mlp_swiglu_func(x, weight1, weight2, fuse_grad_accum=False):
+    return MLPSwiGLUFunc.apply(x, weight1, weight2, fuse_grad_accum)
+
+
+class MLPSwiGLU(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        bias1=False,
+        bias2=False,
+        multiple_of=128,
+        device=None,
+        dtype=None,
+        fuse_grad_accum: bool = False,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = (
+            hidden_features if hidden_features is not None else int(8 * in_features / 3)
+        )
+        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
+        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
+        self.fc1.weight._muon_reshape_functions = (
+            lambda w: rearrange(w, "(d two) e -> two d e", two=2),
+            lambda w: rearrange(w, "two d e -> (d two) e"),
+        )
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
+        self.fuse_grad_accum = fuse_grad_accum
+
+    def forward(self, input: Tensor) -> Tensor:
+        if (
+            self.fc1.bias is None
+            and self.fc2.bias is None
+            and input.is_cuda
+            and input.stride(-1) == 1
+            and self.fc1.in_features % 8 == 0
+            and self.fc1.out_features % 16 == 0
+            and self.fc2.out_features % 8 == 0
+        ):
+            return mlp_swiglu_func(
+                input,
+                self.fc1.weight,
+                self.fc2.weight,
+                fuse_grad_accum=self.fuse_grad_accum,
+            )
+        else:
+            y = self.fc1(input)
+            return self.fc2(F.silu(y[..., ::2]) * y[..., 1::2])
+
+
+class MLPSwiGLURef(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        bias1=False,
+        bias2=False,
+        multiple_of=128,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = (
+            hidden_features if hidden_features is not None else int(8 * in_features / 3)
+        )
+        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
+        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
+
+    def forward(self, input: Tensor) -> Tensor:
+        y = self.fc1(input)
+        y1, y2 = y.chunk(2, dim=-1)
+        return self.fc2(F.silu(y1) * y2)
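
Note: the docstring in MLPSwiGLUFunc.forward spells out the interleaved gating convention, silu(y[..., ::2]) * y[..., 1::2], as opposed to the chunked y1/y2 split used by MLPSwiGLURef. A small pure-PyTorch reference of that convention can be handy when testing the fused path; mlp_swiglu_interleaved_ref below is a hypothetical helper for illustration, not part of the package:

import torch
import torch.nn.functional as F

def mlp_swiglu_interleaved_ref(x, weight1, weight2):
    # weight1: (2 * hidden, in), weight2: (out, hidden); biases are assumed absent,
    # matching the conditions under which MLPSwiGLU.forward takes the fused path.
    y = F.linear(x, weight1)  # (..., 2 * hidden)
    return F.linear(F.silu(y[..., ::2]) * y[..., 1::2], weight2)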
quack/pipeline.py ADDED
@@ -0,0 +1,166 @@
+# Copyright (c) 2025, Tri Dao.
+
+from typing import Optional
+from dataclasses import dataclass
+
+import cutlass.cute as cute
+from cutlass.cutlass_dsl import Boolean, Int32, if_generate
+from cutlass.pipeline import CooperativeGroup, PipelineOp, pipeline_init_wait
+from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
+
+from cutlass.cutlass_dsl import dsl_user_op
+from cutlass._mlir.dialects import nvvm
+
+
+@dsl_user_op
+def cp_async_mbarrier_arrive_shared(
+    mbar_ptr: cute.Pointer, noinc: bool = False, *, loc=None, ip=None
+) -> None:
+    nvvm.cp_async_mbarrier_arrive_shared(
+        mbar_ptr.llvm_ptr,
+        noinc=noinc,
+        loc=loc,
+        ip=ip,
+    )
+
+
+class PipelineStateWAdvance(PipelineState):
+    def advance_iters(self, num_iterations: Int32):
+        self._count += Int32(num_iterations)
+        new_index = self._index + Int32(num_iterations)
+        # How many times did we cross the stages boundary
+        num_crossings = new_index // self.stages
+        self._phase ^= num_crossings
+        self._index = new_index % self.stages
+
+    # This can be overridden by derived classes
+    def __new_from_mlir_values__(self, values):
+        return PipelineStateWAdvance(
+            self.stages, Int32(values[0]), Int32(values[1]), Int32(values[2])
+        )
+
+
+def make_pipeline_state(type: PipelineUserType, stages: int):
+    """
+    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
+    """
+    if type is PipelineUserType.Producer:
+        return PipelineStateWAdvance(
+            stages,
+            Int32(0),
+            Int32(0),
+            Int32(1),
+        )
+    elif type is PipelineUserType.Consumer:
+        return PipelineStateWAdvance(
+            stages,
+            Int32(0),
+            Int32(0),
+            Int32(0),
+        )
+    else:
+        assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
+
+
+@dataclass(frozen=True)
+class PipelineTmaCpAsync(PipelineTmaAsync):
+    """
+    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers.
+    """
+
+    @staticmethod
+    def create(
+        *,
+        num_stages: int,
+        producer_group: CooperativeGroup,
+        consumer_group: CooperativeGroup,
+        tx_count: int,
+        barrier_storage: cute.Pointer = None,
+        cta_layout_vmnk: Optional[cute.Layout] = None,
+        tidx: Optional[Int32] = None,
+    ):
+        """
+        This helper function computes any necessary attributes and returns an instance of PipelineTmaCpAsync.
+        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer
+        :param num_stages: Number of buffer stages for this pipeline
+        :type num_stages: Int32
+        :param producer_group: CooperativeGroup for the producer agent
+        :type producer_group: CooperativeGroup
+        :param consumer_group: CooperativeGroup for the consumer agent
+        :type consumer_group: CooperativeGroup
+        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
+        :type tx_count: int
+        :param cta_layout_vmnk: Layout of the cluster shape
+        :type cta_layout_vmnk: cute.Layout | None
+        :param tidx: thread index of the consumer async threads
+        :type tidx: Int32 | None
+        """
+        if not isinstance(barrier_storage, cute.Pointer):
+            raise ValueError(
+                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
+            )
+
+        producer_type = PipelineOp.TmaLoad
+        consumer_type = PipelineOp.AsyncThread
+
+        producer = (producer_type, producer_group)
+        consumer = (consumer_type, consumer_group)
+
+        sync_object_full = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8), num_stages, producer, tx_count
+        )
+        sync_object_empty = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
+        )
+        if tidx is None:
+            tidx, _, _ = cute.arch.thread_idx()
+        if cta_layout_vmnk is None:
+            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
+        (
+            dst_rank,
+            is_signalling_thread,
+        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+            dst_rank = None
+        else:
+            dst_rank = dst_rank
+
+        producer_mask = None
+
+        pipeline_init_wait(cta_layout_vmnk)
+
+        return PipelineTmaCpAsync(
+            sync_object_full,
+            sync_object_empty,
+            num_stages,
+            producer_mask,
+            dst_rank,
+            is_signalling_thread,
+        )
+
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        is_tma_warp: Optional[Boolean] = True,
+    ):
+        """
+        TMA producer acquire conditionally waits on buffer empty and sets the transaction barrier.
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase),
+        )
+        # This is the difference between this and PipelineTmaAsync: we could have multiple
+        # warps calling this, but only 1 warp should do the arrive on the full barrier
+        if_generate(
+            is_tma_warp,
+            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+        )
+
+    def producer_commit(self, state: PipelineState):
+        """
+        We need the mbarrier to track the completion of cp.async
+        """
+        cp_async_mbarrier_arrive_shared(self.producer_get_barrier(state), noinc=True)
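
Note: for readers unfamiliar with the stage/phase bookkeeping that PipelineStateWAdvance.advance_iters performs, here is a plain-Python model of the same arithmetic (a sketch outside the CuTe DSL, not the package code; it assumes the phase is a single bit, so only the parity of the boundary-crossing count matters):

def advance_iters(index: int, phase: int, stages: int, n: int) -> tuple[int, int]:
    new_index = index + n
    crossings = new_index // stages   # how many times the stage boundary was crossed
    phase ^= crossings & 1            # flip the phase bit once per crossing (parity)
    return new_index % stages, phase

# e.g. with 4 stages, advancing 2 iterations from index 3 wraps to index 1 and flips the phase:
assert advance_iters(3, 0, stages=4, n=2) == (1, 1)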
@@ -0,0 +1,126 @@
+# Copyright (c) 2025, Wentao Guo, Mayank Mishra, Tri Dao.
+
+import math
+from typing import Optional
+
+import cutlass
+import cutlass.cute as cute
+
+import quack.utils as utils
+from quack.sort.utils import compare_and_swap
+from quack.sort.sorting_networks import optimal_sort
+
+
+@cute.jit
+def bitonic_merge(
+    arr: cute.Tensor,
+    n: cutlass.Constexpr[int],
+    start: cutlass.Constexpr[int],
+    ascending: cutlass.Constexpr[bool] = True,
+) -> None:
+    """Merge a bitonic sequence into a sorted sequence using an iterative approach."""
+    if cutlass.const_expr(n > 1):
+        num_levels = int(math.log2(n))
+        assert n == 2**num_levels, "n must be a power of 2"
+        # This one must be range_constexpr otherwise it's very slow for n = 128
+        for level in cutlass.range_constexpr(num_levels):
+            length = n >> level  # n // (2^level)
+            step = length // 2
+            for i in cutlass.range(n // length, unroll_full=True):
+                start_i = start + i * length
+                for j in cutlass.range(step, unroll_full=True):
+                    compare_and_swap(arr, start_i + j, start_i + j + step, ascending)
+
+
+@cute.jit
+def bitonic_sort(
+    arr: cute.Tensor,
+    n: Optional[cutlass.Constexpr[int]] = None,
+    start: cutlass.Constexpr[int] = 0,
+    ascending: cutlass.Constexpr[bool] = True,
+) -> None:
+    """
+    Bitonic sort for small arrays of size N (power of 2, N <= 128).
+
+    Args:
+        arr: Array to sort
+        n: Size of array (must be power of 2 and <= 128)
+        start: Starting index (default 0)
+        ascending: Sort in ascending order (default True)
+    """
+    if cutlass.const_expr(n is None):
+        n = cute.size(arr.shape)
+    assert n <= 128
+    if cutlass.const_expr(n > 1):
+        if cutlass.const_expr(n in [2, 4, 8, 16, 32, 64]):
+            optimal_sort(arr, n, start, ascending)
+        else:  # Fall back to bitonic sort
+            assert n % 2 == 0
+            # Sort first half in ascending order
+            bitonic_sort(arr, n // 2, start, True)
+            # Sort second half in descending order
+            bitonic_sort(arr, n // 2, start + n // 2, False)
+            # Merge the whole sequence
+            bitonic_merge(arr, n, start, ascending)
+
+
+@cute.jit
+def bitonic_topk_merge(
+    arr0: cute.Tensor,
+    arr1: cute.Tensor,
+    k: Optional[cutlass.Constexpr[int]] = None,
+    start0: cutlass.Constexpr[int] = 0,
+    start1: cutlass.Constexpr[int] = 0,
+    ascending: cutlass.Constexpr[bool] = False,
+) -> None:
+    if cutlass.const_expr(k is None):
+        k = cute.size(arr0.shape)
+    if cutlass.const_expr(arr0.element_type == cutlass.Float32):
+        minmax_fn = utils.fmin if ascending else cute.arch.fmax
+    else:
+        minmax_fn = min if ascending else max
+    # Write the top k elements to the first half of the array
+    for i in cutlass.range(k, unroll_full=True):
+        arr0[start0 + i] = minmax_fn(arr0[start0 + i], arr1[start1 + k - 1 - i])
+    # Now the 1st half is bitonic, we just need to merge it
+    bitonic_merge(arr0, k, start0, ascending)
+
+
+@cute.jit
+def bitonic_topk(
+    arr: cute.Tensor,
+    k: cutlass.Constexpr[int],
+    ascending: cutlass.Constexpr[bool] = False,
+    warp_width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
+) -> cute.Tensor:
+    """
+    Bitonic top-k for small arrays of size N (power of 2, N <= 128).
+
+    Args:
+        arr: Array to sort
+        k: must be power of 2 and <= 128
+        ascending: Sort in ascending order (default False)
+    """
+    assert arr.element_type in [cutlass.Float32, cutlass.Int32]
+    n = cute.size(arr.shape)
+    assert k == 1 << int(math.log2(k)), "k must be a power of 2"
+    assert n % k == 0, "n must be divisible by k"
+    topk_vals = cute.make_fragment(k, arr.element_type)
+    for v in cutlass.range(k, unroll_full=True):
+        topk_vals[v] = arr[v]
+    bitonic_sort(topk_vals, ascending=ascending)
+    other_vals = cute.make_fragment(k, arr.element_type)
+    for i in cutlass.range(1, n // k, unroll_full=True):
+        for v in cutlass.range(k, unroll_full=True):
+            other_vals[v] = arr[i * k + v]
+        bitonic_sort(other_vals, ascending=ascending)
+        # Merge 2 sorted top-k sequences to get a new top-k sequence
+        bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)
+    # TODO: this is not efficient for large k (e.g. >= 16) since threads in the same warp
+    # do duplicate work.
+    for i in cutlass.range(int(math.log2(warp_width)), unroll_full=True):
+        other_vals = cute.make_fragment(k, arr.element_type)
+        for v in cutlass.range(k, unroll_full=True):
+            other_vals[v] = cute.arch.shuffle_sync_bfly(topk_vals[v], offset=1 << i)
+        bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)
+    return topk_vals
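
Note: the bitonic_sort / bitonic_merge pair above follows the textbook recursion: sort the two halves in opposite directions, then merge the resulting bitonic sequence with compare-and-swaps at stride length // 2. A plain-Python sketch of the same structure (hypothetical helpers for illustration, not part of the package):

def bitonic_merge_ref(a, n, start=0, ascending=True):
    length = n
    while length > 1:
        step = length // 2
        for block in range(start, start + n, length):
            for j in range(step):
                lo, hi = block + j, block + j + step
                if (a[lo] > a[hi]) == ascending:  # out of order for this direction
                    a[lo], a[hi] = a[hi], a[lo]
        length //= 2

def bitonic_sort_ref(a, n, start=0, ascending=True):
    if n > 1:
        bitonic_sort_ref(a, n // 2, start, True)            # first half ascending
        bitonic_sort_ref(a, n // 2, start + n // 2, False)  # second half descending
        bitonic_merge_ref(a, n, start, ascending)           # merge the bitonic whole

xs = [5, 1, 7, 3, 6, 2, 8, 4]
bitonic_sort_ref(xs, len(xs))
assert xs == sorted(xs)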