quack-kernels 0.1.3-py3-none-any.whl → 0.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.1.3"
+ __version__ = "0.1.5"

  from quack.rmsnorm import rmsnorm
  from quack.softmax import softmax
quack/cross_entropy.py CHANGED
@@ -1,3 +1,5 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
  import math
  import torch
  from typing import Optional, Type
@@ -77,7 +79,7 @@ class CrossEntropy(ReductionBase):
  self.kernel(mX, mTarget, mLoss, mLSE, tv_layout, tiler_mn).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -93,15 +95,16 @@ class CrossEntropy(ReductionBase):
  tiler_mn: cute.Shape,
  ):
  tidx, _, _ = cute.arch.thread_idx()
- bidx, cluster_y, _ = cute.arch.block_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)

  shape: cute.Shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, cX = [
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
- for mT in (mX, idX)
- ]
+ gX, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, idX)]

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -131,7 +134,9 @@ class CrossEntropy(ReductionBase):

  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
  tXpX = (
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
+ utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ if cutlass.const_expr(not is_even_N)
+ else None
  )
  if row < shape[0]:
  cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
@@ -154,7 +159,7 @@ class CrossEntropy(ReductionBase):
  cute.ReductionOp.MAX,
  threads_per_row,
  reduction_buffer[None, None, 0],
- mbar_ptr + 0 if self.cluster_n > 1 else None,
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=-cutlass.Float32.inf,
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
  )
@@ -172,7 +177,7 @@ class CrossEntropy(ReductionBase):
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, 1],
- mbar_ptr + 1 if self.cluster_n > 1 else None,
+ mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=0.0,
  )
  else:
@@ -197,7 +202,7 @@ class CrossEntropy(ReductionBase):
  mLSE[row] = lse


- def cross_entropy(
+ def _cross_entropy(
  x: torch.Tensor,
  target: torch.Tensor,
  return_lse: bool = False,
@@ -238,15 +243,300 @@ def cross_entropy(
  stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

  compile_key = (dtype, N, lse is not None)
- if compile_key not in cross_entropy.compile_cache:
+ if compile_key not in _cross_entropy.compile_cache:
  cross_entropy_op = CrossEntropy(dtype, N)
- cross_entropy.compile_cache[compile_key] = cute.compile(
+ _cross_entropy.compile_cache[compile_key] = cute.compile(
  cross_entropy_op, x_tensor, target_tensor, loss_tensor, lse_tensor, stream
  )
- cross_entropy.compile_cache[compile_key](
+ _cross_entropy.compile_cache[compile_key](
  x_tensor, target_tensor, loss_tensor, lse_tensor, stream
  )
  return loss if not return_lse else (loss, lse)


- cross_entropy.compile_cache = {}
+ _cross_entropy.compile_cache = {}
+
+
+ class CrossEntropyBackward:
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+ self.dtype = dtype
+ self.N = N
+ self.vecsize = 128 // dtype.width
+
+ def _calculate_threads_per_row(self):
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _get_tv_layout(self):
+ N = self.N
+ vecsize = self.vecsize
+ num_threads = 128 if N <= 16384 else 256
+ threads_per_row = self._calculate_threads_per_row()
+ cols_per_block = num_threads // threads_per_row
+ num_blocks_N = cute.ceil_div(min(N, 16384) // vecsize, threads_per_row)
+ tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
+ tv_layout = cute.make_layout(
+ ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
+ stride=(
+ (vecsize * cols_per_block, 1),
+ (cols_per_block, cols_per_block * vecsize * threads_per_row),
+ ),
+ )
+ return tiler_mn, tv_layout
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mTarget: cute.Tensor,
+ mDLoss: cute.Tensor,
+ mdX: cute.Tensor,
+ mLSE: cute.Tensor,
+ stream: cuda.CUstream,
+ ):
+ assert mX.element_type == self.dtype
+ assert mdX.element_type == self.dtype
+
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = cute.size(tv_layout, mode=[0])
+
+ mDLoss = cute.make_tensor(
+ mDLoss.iterator, cute.append(mDLoss.layout, cute.make_layout((self.N,), stride=(0,)))
+ )
+ mTarget = cute.make_tensor(
+ mTarget.iterator, cute.append(mTarget.layout, cute.make_layout((self.N,), stride=(0,)))
+ )
+ mLSE = cute.make_tensor(
+ mLSE.iterator, cute.append(mLSE.layout, cute.make_layout((self.N,), stride=(0,)))
+ )
+
+ smem_size = cute.size_in_bytes(
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0))
+ )
+
+ self.kernel(
+ mX,
+ mTarget,
+ mDLoss,
+ mdX,
+ mLSE,
+ mX.shape,
+ tv_layout,
+ tiler_mn,
+ ).launch(
+ grid=[
+ cute.ceil_div(mX.shape[0], tiler_mn[0]),
+ cute.ceil_div(mX.shape[1], tiler_mn[1]),
+ 1,
+ ],
+ block=[num_threads, 1, 1],
+ smem=smem_size,
+ stream=stream,
+ )
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor, # (M, N)
+ mTarget: cute.Tensor, # (M,)
+ mDLoss: cute.Tensor, # (M,)
+ mdX: cute.Tensor, # (M, N)
+ mLSE: cute.Tensor, # (M,)
+ shape: cute.Shape,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, bidy, _ = cute.arch.block_idx()
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+ )
+
+ idX = cute.make_identity_tensor(shape)
+
+ gX, gdX, cX, gTarget, gDLoss, gLse = [
+ cute.local_tile(mT, tiler_mn, (bidx, bidy))
+ for mT in (mX, mdX, idX, mTarget, mDLoss, mLSE)
+ ]
+
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128
+ )
+ copy_atom_load_X_async = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(), gX.element_type, num_bits_per_copy=128
+ )
+ copy_atom_store_O = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), gdX.element_type, num_bits_per_copy=128
+ )
+
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_X_async = cute.make_tiled_copy(
+ copy_atom_load_X_async, tv_layout, tiler_mn
+ ).get_slice(tidx)
+ thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
+
+ #### Thread View
+ tXgX = thr_copy_X_async.partition_S(gX)
+ tXsX = thr_copy_X_async.partition_S(sX)
+
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+ tXcFull = thr_copy_X.partition_S(cX) # improve
+
+ tXgO = thr_copy_O.partition_D(gdX)
+
+ # allocate fragments for gmem->rmem
+ tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+
+ is_even_N = cutlass.const_expr(shape[1] % tiler_mn[1] == 0)
+ row = tXcX[0][0]
+
+ tXpX = (
+ utils.predicate_k(thr_copy_X_async.partition_S(cX), limit=shape[1])
+ if not is_even_N
+ else None
+ )
+
+ if row < shape[0]:
+ cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+ cute.arch.cp_async_wait_group(0)
+ if cutlass.const_expr(not is_even_N):
+ utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(cute.Float32)
+
+ label = cute.Int32.zero
+ dloss = cute.Float32.zero
+ lse = cute.Float32.zero
+ if row < shape[0]:
+ label = cute.Int32(mTarget[row])
+ dloss = cute.Float32(mDLoss[row])
+ lse = cute.Float32(mLSE[row])
+
+ log2_e = math.log2(math.e)
+ probs = utils.exp2f((x - lse) * log2_e)
+ prob_shifted = probs - 1.0
+
+ mask = cute.make_fragment_like(tXrX, cutlass.Boolean)
+ for i in cutlass.range_constexpr(cute.size(tXcFull)):
+ mask[i] = tXcFull[i][1] == label
+
+ mask = mask.load()
+ grad = cute.where(mask, prob_shifted, probs)
+ grad = grad * dloss
+
+ tXrO.store(grad.to(tXrO.element_type))
+ tOpO = (
+ utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1]) if not is_even_N else None
+ )
+ if row < shape[0]:
+ cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+
+
+ def _cross_entropy_backward(
+ x: torch.Tensor,
+ target: torch.Tensor,
+ dloss: torch.Tensor,
+ lse: torch.Tensor,
+ inplace_backward: bool = False,
+ ) -> torch.Tensor:
+ """Cross entropy backward pass.
+ Args:
+ x: Input logits tensor of shape (M, N)
+ target: Target class indices tensor of shape (M,)
+ dloss: Upstream gradients tensor of shape (M,)
+ lse: Log-sum-exp values tensor of shape (M,)
+ Returns:
+ Input gradients tensor of shape (M, N)
+ """
+ assert x.dim() == 2, "Input must be 2D"
+ assert target.dim() == 1, "Target must be 1D"
+ assert dloss.dim() == 1, "dloss must be 1D"
+ assert lse.dim() == 1, "lse must be 1D"
+ assert x.shape[0] == target.shape[0], "Batch dimensions must match"
+ assert x.shape[0] == dloss.shape[0], "Batch dimensions must match"
+ assert x.shape[0] == lse.shape[0], "Batch dimensions must match"
+ assert (
+ x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda
+ ), "Tensors must be on CUDA device"
+ assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
+ assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
+
+ M, N = x.shape
+ dx = torch.empty_like(x) if not inplace_backward else x
+ dtype = torch2cute_dtype_map[x.dtype]
+
+ convert_from_dlpack = lambda tensor: (
+ from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0, stride_order=(0, 1)
+ )
+ )
+ x_tensor = convert_from_dlpack(x)
+ dx_tensor = convert_from_dlpack(dx)
+ dloss_tensor = from_dlpack(dloss.detach(), assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ lse_tensor = from_dlpack(lse.detach(), assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ target_tensor = from_dlpack(target.detach(), assumed_align=32).mark_compact_shape_dynamic(
+ mode=0
+ )
+ stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype, N)
+ if compile_key not in _cross_entropy_backward.compile_cache:
+ cross_entropy_backward_op = CrossEntropyBackward(dtype, N)
+ _cross_entropy_backward.compile_cache[compile_key] = cute.compile(
+ cross_entropy_backward_op,
+ x_tensor,
+ target_tensor,
+ dloss_tensor,
+ dx_tensor,
+ lse_tensor,
+ stream,
+ )
+ _cross_entropy_backward.compile_cache[compile_key](
+ x_tensor, target_tensor, dloss_tensor, dx_tensor, lse_tensor, stream
+ )
+ return dx
+
+
+ _cross_entropy_backward.compile_cache = {}
+
+
+ class CrossEntropyFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x, target, inplace_backward=False):
+ loss, lse = _cross_entropy(x, target, return_lse=True)
+ ctx.save_for_backward(x, target, lse)
+ ctx.inplace_backward = inplace_backward
+ return loss
+
+ @staticmethod
+ def backward(ctx, dloss):
+ x, target, lse = ctx.saved_tensors
+ dx = _cross_entropy_backward(x, target, dloss, lse, inplace_backward=ctx.inplace_backward)
+ return dx, None, None
+
+
+ def cross_entropy(
+ x: torch.Tensor, target: torch.Tensor, inplace_backward: bool = False
+ ) -> torch.Tensor:
+ """Cross entropy loss with automatic differentiation support.
+
+ Args:
+ x: Input logits tensor of shape (M, N)
+ target: Target class indices tensor of shape (M,)
+
+ Returns:
+ Cross entropy loss tensor of shape (M,)
+ """
+ return CrossEntropyFunction.apply(x, target, inplace_backward)
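
Usage sketch (not part of the published diff): the new top-level cross_entropy wraps CrossEntropyFunction, so the custom backward kernel runs under torch.autograd. A minimal example, assuming a CUDA device with quack-kernels 0.1.5 installed; the torch.nn.functional comparison is only an illustrative sanity check.

import torch
from quack.cross_entropy import cross_entropy

M, N = 4, 4096
x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16, requires_grad=True)
target = torch.randint(0, N, (M,), device="cuda", dtype=torch.int64)

loss = cross_entropy(x, target)  # per-row loss, shape (M,), per the docstring above
loss.sum().backward()            # runs the CrossEntropyBackward kernel
ref = torch.nn.functional.cross_entropy(x.detach().float(), target, reduction="none")
print(torch.allclose(loss.float(), ref, atol=1e-2), x.grad.shape)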
quack/reduction_base.py CHANGED
@@ -6,8 +6,6 @@ from typing import Type, Tuple, Optional
  import cutlass
  import cutlass.cute as cute

- import quack.utils as utils
-

  torch2cute_dtype_map = {
  torch.float16: cutlass.Float16,
@@ -39,7 +37,6 @@ class ReductionBase:
  vecsize = copy_bits // self.dtype.width
  assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
  num_threads = self._get_num_threads()
- num_warps = num_threads // cute.arch.WARP_SIZE
  assert num_threads % cute.arch.WARP_SIZE == 0

  threads_per_row = self._calculate_threads_per_row()
@@ -64,7 +61,7 @@ class ReductionBase:

  def _get_reduction_buffer_layout(self, tv_layout: cute.Layout, cluster_n: int):
  num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
- warps_per_row = utils.max_constexpr(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
+ warps_per_row = max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
  return cute.make_ordered_layout(
  (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
  order=(1, 0, 2),
@@ -88,10 +85,10 @@ class ReductionBase:
  def _initialize_cluster(self, tidx: cutlass.Int32, mbar_ptr: cute.Pointer, num_warps: int):
  if cutlass.const_expr(self.cluster_n > 1):
  if tidx < self.stage:
- cute.arch.mbarrier_init_arrive_cnt(mbar_ptr + tidx, 1)
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
  cute.arch.mbarrier_init_fence()
  if tidx < self.stage:
- cute.arch.mbarrier_init_tx_bytes(
+ cute.arch.mbarrier_arrive_and_expect_tx(
  mbar_ptr + tidx, num_warps * self.cluster_n * self.reduction_dtype.width // 8
  )
  # Cluster arrive after barrier init
quack/rmsnorm.py CHANGED
@@ -9,7 +9,6 @@ import cuda.bindings.driver as cuda
  import cutlass
  import cutlass.cute as cute
  from cutlass.cute.runtime import from_dlpack
-
  import quack.utils as utils
  from quack.reduction_base import ReductionBase, torch2cute_dtype_map

@@ -84,7 +83,7 @@ class RMSNorm(ReductionBase):
  self.kernel(mX, mW, mO, mRstd, eps, tv_layout, tiler_mn, self.reload_from).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -103,7 +102,11 @@ class RMSNorm(ReductionBase):
  delay_w_load: cutlass.Constexpr = False,
  ):
  tidx, _, _ = cute.arch.thread_idx()
- bidx, cluster_y, _ = cute.arch.block_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -114,13 +117,10 @@ class RMSNorm(ReductionBase):
  shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, gO, cX = [
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
- for mT in (mX, mO, idX)
- ]
- gW = cute.local_tile(mW, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
+ gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
  gRstd = (
- cute.local_tile(mRstd, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
  if cutlass.const_expr(mRstd is not None)
  else None
  )
@@ -167,7 +167,7 @@ class RMSNorm(ReductionBase):
  cute.arch.cp_async_commit_group()

  tWpW = utils.predicate_k(thr_copy_W.partition_S(cX), limit=shape[1])
- if not delay_w_load:
+ if cutlass.const_expr(not delay_w_load):
  cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)

  cute.arch.cp_async_wait_group(0)
@@ -192,12 +192,12 @@ class RMSNorm(ReductionBase):
  and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
  ):
  tXrRstd[0] = rstd
- if delay_w_load:
+ if cutlass.const_expr(delay_w_load):
  cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
- if reload_from == "smem":
+ if cutlass.const_expr(reload_from == "smem"):
  cute.autovec_copy(tXsX, tXrX)
  x = tXrX.load().to(cute.Float32)
- elif reload_from == "gmem":
+ elif cutlass.const_expr(reload_from == "gmem"):
  cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
  x = tXrX.load().to(cute.Float32)
  x_hat = x * rstd
@@ -209,20 +209,18 @@ class RMSNorm(ReductionBase):
  cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)


- def rmsnorm(
+ def _rmsnorm_fwd(
  x: torch.Tensor,
  weight: torch.Tensor,
  eps: float = 1e-6,
  return_rstd: bool = False,
  ) -> torch.Tensor:
  """RMSNorm forward pass.
-
  Args:
  x: Input tensor of shape (M, N)
  weight: Weight tensor of shape (N,)
  eps: Small value for numerical stability
  return_rstd: Whether to return the reciprocal standard deviation
-
  Returns:
  Normalized output tensor of same shape as x
  If return_rstd is True, also returns rstd tensor of shape (M,)
@@ -258,18 +256,18 @@ def rmsnorm(
  )
  current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
  compile_key = (dtype, N, rstd is not None)
- if compile_key not in rmsnorm.compile_cache:
+ if compile_key not in _rmsnorm_fwd.compile_cache:
  rmsnorm_op = RMSNorm(dtype, N)
- rmsnorm.compile_cache[compile_key] = cute.compile(
+ _rmsnorm_fwd.compile_cache[compile_key] = cute.compile(
  rmsnorm_op, x_tensor, weight_tensor, out_tensor, rstd_tensor, current_stream
  )
- rmsnorm.compile_cache[compile_key](
+ _rmsnorm_fwd.compile_cache[compile_key](
  x_tensor, weight_tensor, out_tensor, rstd_tensor, current_stream, eps
  )
  return (out, rstd) if return_rstd else out


- rmsnorm.compile_cache = {}
+ _rmsnorm_fwd.compile_cache = {}


  def rmsnorm_ref(x, w, eps=1e-6):
@@ -282,3 +280,383 @@ def rmsnorm_ref(x, w, eps=1e-6):
  def rstd_ref(x, eps=1e-6):
  x_f32 = x.float()
  return 1.0 / torch.sqrt(torch.mean(x_f32 * x_f32, dim=-1) + eps)
+
+
+ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
+ """Reference implementation for RMSNorm backward pass."""
+ x_f32 = x.float()
+ x_hat = x_f32 * rstd.unsqueeze(1)
+ wdy = dout * w
+ c1 = (x_hat * wdy).mean(dim=-1, keepdim=True)
+ dx = (wdy - x_hat * c1) * rstd.unsqueeze(1)
+
+ # dL/dW
+ dw = (dout * x_hat).sum(dim=0)
+ return dx.to(x.dtype), dw.to(w.dtype)
+
+
+ class RMSNormBackward(ReductionBase):
+ def __init__(self, dtype: cutlass.Numeric, N: int):
+ # 1 stage for computing mean of x_hat * wdy
+ super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Float32)
+
+ def _calculate_threads_per_row(self):
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _set_cluster_n(self):
+ N = self.N
+ if cutlass.const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else: # fp32
+ cluster_n = (
+ 1
+ if N <= 32 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mDout: cute.Tensor,
+ mRstd: cute.Tensor,
+ mDx: cute.Tensor,
+ mDw: cute.Tensor,
+ sm_count: cutlass.Constexpr,
+ stream: cuda.CUstream,
+ ):
+ self._set_cluster_n()
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = cute.size(tv_layout, mode=[0])
+ num_warps = num_threads // cute.arch.WARP_SIZE
+
+ mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+
+ mRstd_expanded_layout = cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
+ mRstd = cute.make_tensor(mRstd.iterator, mRstd_expanded_layout)
+
+ num_blocks = (
+ sm_count if tiler_mn[0] == 1 else min(sm_count, cute.ceil_div(1024, tiler_mn[0]))
+ )
+
+ self.kernel(mX, mW, mDout, mRstd, mDx, mDw, sm_count, tv_layout, tiler_mn).launch(
+ grid=[num_blocks, self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mDout: cute.Tensor,
+ mRstd: cute.Tensor,
+ mDx: cute.Tensor,
+ mDw: cute.Tensor,
+ sm_count: cutlass.Constexpr,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, cluster_y, _ = cute.arch.block_idx()
+ gdim, _, _ = cute.arch.grid_dim()
+
+ shape = mX.shape
+ M, N = shape[0], shape[1]
+
+ idX = cute.make_identity_tensor(shape)
+
+ smem = cutlass.utils.SmemAllocator()
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=128
+ )
+
+ copy_atom_load_W = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=128
+ )
+
+ copy_atom_store_dX = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mDx.element_type, num_bits_per_copy=128
+ )
+
+ copy_atom_dw = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mDw.element_type, num_bits_per_copy=128
+ )
+
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_W = cute.make_tiled_copy(copy_atom_load_W, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_dw = cute.make_tiled_copy(copy_atom_dw, tv_layout, tiler_mn).get_slice(tidx)
+ thr_store_dx = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
+
+ gW = cute.local_tile(mW, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
+ tWgW = thr_copy_W.partition_S(gW)
+ tWrW = cute.make_fragment_like(tWgW)
+ tXrW = thr_copy_X.retile(tWrW)
+
+ gW_coord = cute.local_tile(idX, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
+
+ tWpW = utils.predicate_k(thr_copy_W.partition_S(gW_coord), limit=shape[1])
+ cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
+ weight = tXrW.load().to(cute.Float32)
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+ dw_coord = cute.local_tile(idX, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
+ tDwpDw = utils.predicate_k(thr_copy_dw.partition_S(dw_coord), limit=shape[1])
+
+ gDw = cute.local_tile(mDw, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
+ tDwgDw = thr_copy_dw.partition_D(gDw)
+ tDwrDw = cute.make_fragment_like(tDwgDw)
+ dw_accumulator = thr_copy_X.retile(tDwrDw)
+ dw_accumulator.fill(0.0)
+
+ M_pad = ((M + sm_count - 1) // sm_count) * sm_count
+
+ jump = sm_count if tiler_mn[0] == 1 else min(sm_count, cute.ceil_div(1024, tiler_mn[0]))
+
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_arrive()
+ cute.arch.cluster_wait()
+
+ ## need to update range_dynamic since it will be deprecated soon
+ for row_offset in cutlass.range_dynamic(bidx, M_pad, jump):
+ gX = cute.local_tile(
+ mX, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ )
+ gDout = cute.local_tile(
+ mDout, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ )
+ gRstd = cute.local_tile(
+ mRstd, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ )
+ gDx = cute.local_tile(
+ mDx, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ )
+ cX = cute.local_tile(
+ idX, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ )
+
+ tXgX = thr_copy_X.partition_S(gX)
+ thrDout = thr_copy_X.partition_S(gDout)
+ tXrRstd = thr_copy_W.partition_S(gRstd)
+ thrDx = thr_store_dx.partition_D(gDx)
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+
+ tXrX, frgDout, frgDx = [cute.make_fragment_like(thr) for thr in (tXgX, thrDout, thrDx)]
+
+ tXpX = utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+
+ if tXcX[0][0] < shape[0]:
+ cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ cute.copy(copy_atom_load_X, thrDout, frgDout, pred=tXpX)
+
+ x = tXrX.load().to(cute.Float32)
+ dout = frgDout.load().to(cute.Float32)
+
+ rstd = tXrRstd[0]
+ x_hat = x * rstd
+ wdy = dout * weight
+
+ threads_per_row = tv_layout.shape[0][0]
+
+ row = tXcX[0][0]
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_arrive()
+ cute.arch.cluster_wait()
+ else:
+ cute.arch.barrier()
+
+ mean_xhat_wdy = (
+ utils.row_reduce(
+ x_hat * wdy,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ hook_fn=cute.arch.cluster_wait
+ if cutlass.const_expr(self.cluster_n > 1)
+ else None,
+ )
+ / shape[1]
+ )
+
+ dx = (wdy - x_hat * mean_xhat_wdy) * rstd
+ frgDx.store(dx.to(frgDout.element_type))
+
+ if row < M:
+ cute.copy(copy_atom_store_dX, frgDx, thrDx, pred=tXpX)
+
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_arrive()
+ cute.arch.cluster_wait()
+ else:
+ cute.arch.barrier()
+
+ if row < M:
+ dw_row = dout * x_hat
+ current_dw = dw_accumulator.load().to(cute.Float32)
+ updated_dw = current_dw + dw_row
+ dw_accumulator.store(updated_dw.to(dw_accumulator.element_type))
+
+ """
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_arrive()
+ cute.arch.cluster_wait()
+ else:
+ cute.arch.barrier()
+ """
+ """
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_arrive()
+ cute.arch.cluster_wait()
+ else:
+ cute.arch.barrier()
+ """
+
+ cute.autovec_copy(dw_accumulator, tDwrDw)
+ cute.copy(copy_atom_dw, tDwrDw, tDwgDw, pred=tDwpDw)
+
+
+ def _rmsnorm_backward(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ dout: torch.Tensor,
+ rstd: torch.Tensor,
+ ) -> (torch.Tensor, torch.Tensor):
+ """RMSNorm backward pass.
+ Args:
+ x: Input tensor of shape (M, N)
+ weight: Weight tensor of shape (N,)
+ dout: Upstream gradients tensor of shape (M, N)
+ rstd: Reciprocal standard deviation tensor of shape (M,)
+ Returns:
+ Tuple of (dx, dw) where:
+ - dx: Input gradients tensor of same shape as x
+ - dw: Weight gradients tensor of same shape as weight
+ """
+ assert x.dim() == 2, "Input must be 2D"
+ assert weight.dim() == 1, "Weight must be 1D"
+ assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
+ assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
+ assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
+ assert weight.dtype == torch.float32, "Weight must be float32"
+
+ M, N = x.shape
+ dx = torch.empty_like(x)
+
+ device = x.device
+
+ sm_count = torch.cuda.get_device_properties(device).multi_processor_count * 8
+ dw_partial = torch.zeros((sm_count, N), device=device, dtype=weight.dtype)
+
+ dtype = torch2cute_dtype_map[x.dtype]
+
+ convert_from_dlpack = lambda tensor: (
+ from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0, stride_order=(0, 1)
+ )
+ )
+
+ x_tensor, dout_tensor, dx_tensor = [convert_from_dlpack(tensor) for tensor in (x, dout, dx)]
+
+ weight_tensor = utils.convert_from_dlpack(
+ weight.detach(), leading_dim=0, divisibility=128 // cutlass.Float32.width
+ )
+
+ dw_partial_tensor = convert_from_dlpack(dw_partial)
+ rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+ compile_key = (dtype, N)
+ if compile_key not in _rmsnorm_backward.compile_cache:
+ rmsnorm_backward_op = RMSNormBackward(dtype, N)
+ _rmsnorm_backward.compile_cache[compile_key] = cute.compile(
+ rmsnorm_backward_op,
+ x_tensor,
+ weight_tensor,
+ dout_tensor,
+ rstd_tensor,
+ dx_tensor,
+ dw_partial_tensor,
+ sm_count,
+ current_stream,
+ )
+
+ _rmsnorm_backward.compile_cache[compile_key](
+ x_tensor,
+ weight_tensor,
+ dout_tensor,
+ rstd_tensor,
+ dx_tensor,
+ dw_partial_tensor,
+ current_stream,
+ )
+
+ dw = dw_partial.sum(dim=0).to(weight.dtype)
+ return dx, dw
+
+
+ _rmsnorm_backward.compile_cache = {}
+
+
+ class RMSNormFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x, weight, eps):
+ out, rstd = _rmsnorm_fwd(x, weight, eps, return_rstd=True)
+ ctx.save_for_backward(x, weight, rstd)
+ ctx.eps = eps
+ return out
+
+ @staticmethod
+ def backward(ctx, dout):
+ x, weight, rstd = ctx.saved_tensors
+ dx, dw = _rmsnorm_backward(x, weight, dout, rstd)
+ # dw is returned for weight gradient, None for eps gradient
+ return dx, dw, None
+
+
+ def rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+ """RMSNorm forward pass with automatic differentiation support.
+
+ Args:
+ x: Input tensor of shape (M, N)
+ weight: Weight tensor of shape (N,)
+ eps: Small value for numerical stability
+
+ Returns:
+ Normalized output tensor of same shape as x
+ """
+ return RMSNormFunction.apply(x, weight, eps)
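
Usage sketch (not part of the published diff): rmsnorm is now differentiable through RMSNormFunction; weight gradients are accumulated into a (sm_count, N) partial buffer and summed on the host. A minimal example, assuming a CUDA device; the float32 weight follows the assertion in _rmsnorm_backward, and the rmsnorm_ref comparison is only an illustrative sanity check.

import torch
from quack.rmsnorm import rmsnorm, rmsnorm_ref

M, N = 8, 8192
x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(N, device="cuda", dtype=torch.float32, requires_grad=True)

out = rmsnorm(x, w, eps=1e-6)      # forward via _rmsnorm_fwd; rstd is saved for backward
out.sum().backward()               # backward via _rmsnorm_backward -> (dx, dw)
print(out.shape, x.grad.shape, w.grad.shape)
print(torch.allclose(out.float(), rmsnorm_ref(x.detach(), w.detach()).float(), atol=1e-2))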
quack/softmax.py CHANGED
@@ -75,7 +75,7 @@ class Softmax(ReductionBase):
  self.kernel(mX, mO, tv_layout, tiler_mn).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -89,15 +89,16 @@ class Softmax(ReductionBase):
  tiler_mn: cute.Shape,
  ):
  tidx, _, _ = cute.arch.thread_idx()
- bidx, cluster_y, _ = cute.arch.block_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)

  shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, gO, cX = [
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
- for mT in (mX, mO, idX)
- ]
+ gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -129,7 +130,9 @@ class Softmax(ReductionBase):

  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
  tXpX = (
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
+ utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ if cutlass.const_expr(not is_even_N)
+ else None
  )
  if tXcX[0][0] < shape[0]:
  cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
@@ -148,7 +151,7 @@ class Softmax(ReductionBase):
  cute.ReductionOp.MAX,
  threads_per_row,
  reduction_buffer[None, None, 0],
- mbar_ptr + 0 if self.cluster_n > 1 else None,
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=-cutlass.Float32.inf,
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
  )
@@ -159,7 +162,7 @@ class Softmax(ReductionBase):
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, 1],
- mbar_ptr + 1 if self.cluster_n > 1 else None,
+ mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=0.0,
  )
  else:
@@ -174,7 +177,9 @@ class Softmax(ReductionBase):
  y = exp_x * (1.0 / denom)
  tXrO.store(y.to(tXrO.element_type))
  tOpO = (
- utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1]) if not is_even_N else None
+ utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
+ if cutlass.const_expr(not is_even_N)
+ else None
  )
  if tXcX[0][0] < shape[0]:
  cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
@@ -283,7 +288,7 @@ class SoftmaxBackward(ReductionBase):
  self.kernel(mdY, mY, mdX, tv_layout, tiler_mn).launch(
  grid=[cute.ceil_div(mdY.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -298,14 +303,17 @@ class SoftmaxBackward(ReductionBase):
  tiler_mn: cute.Shape,
  ):
  tidx, _, _ = cute.arch.thread_idx()
- bidx, cluster_y, _ = cute.arch.block_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)

  shape = mdY.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
  gdY, gY, gdX, cX = [
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
- for mT in (mdY, mY, mdX, idX)
+ cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mdY, mY, mdX, idX)
  ]

  smem = cutlass.utils.SmemAllocator()
@@ -344,7 +352,7 @@ class SoftmaxBackward(ReductionBase):
  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
  tdYpdY = (
  utils.predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
- if not is_even_N
+ if cutlass.const_expr(not is_even_N)
  else None
  )

@@ -366,7 +374,7 @@ class SoftmaxBackward(ReductionBase):
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, 0],
- mbar_ptr if self.cluster_n > 1 else None,
+ mbar_ptr if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=0.0,
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
  )
@@ -376,7 +384,7 @@ class SoftmaxBackward(ReductionBase):
  tdXrdX.store(dx.to(tdXrdX.element_type))
  tdXpdX = (
  utils.predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
- if not is_even_N
+ if cutlass.const_expr(not is_even_N)
  else None
  )
  if tXcX[0][0] < shape[0]:
quack/utils.py CHANGED
@@ -24,32 +24,19 @@ def convert_from_dlpack(x, leading_dim, alignment=16, divisibility=1) -> cute.Te


  @cute.jit
- def max_constexpr(
- a: cutlass.Constexpr[cute.Numeric], b: cutlass.Constexpr[cute.Numeric]
- ) -> cutlass.Constexpr[cute.Numeric]:
- return a if a > b else b
-
-
- @cute.jit
- def min_constexpr(
- a: cutlass.Constexpr[cute.Numeric], b: cutlass.Constexpr[cute.Numeric]
- ) -> cutlass.Constexpr[cute.Numeric]:
- return a if a < b else b
-
-
  def warp_reduce(
  val: cute.TensorSSA | cute.Numeric,
  op: Callable,
  width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
  ) -> cute.TensorSSA | cute.Numeric:
- if isinstance(val, cute.TensorSSA):
+ if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
  res = cute.make_fragment(val.shape, val.dtype)
  res.store(val)
- for i in range(cute.size(val.shape)):
+ for i in cutlass.range_constexpr(cute.size(val.shape)):
  res[i] = warp_reduce(res[i], op, width)
  return res.load()
  else:
- for i in range(int(math.log2(width))):
+ for i in cutlass.range_constexpr(int(math.log2(width))):
  val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
  return val

@@ -111,15 +98,15 @@ def store_shared_remote(
  remote_mbar_ptr_i32 = set_block_rank(
  mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
  ).ir_value()
- if isinstance(val, float):
+ if cutlass.const_expr(isinstance(val, float)):
  val = Float32(val)
  assert isinstance(val, (Float32, cutlass.Int64)), "val must be Float32 or Int64"
- suffix = "f32" if isinstance(val, Float32) else "s64"
+ suffix = "f32" if cutlass.const_expr(isinstance(val, Float32)) else "s64"
  llvm.inline_asm(
  None,
  [remote_smem_ptr_i32, val.ir_value(loc=loc, ip=ip), remote_mbar_ptr_i32],
  f"st.async.shared::cluster.mbarrier::complete_tx::bytes.{suffix} [$0], $1, [$2];",
- f"r,{'f' if isinstance(val, Float32) else 'l'},r",
+ f"r,{'f' if cutlass.const_expr(isinstance(val, Float32)) else 'l'},r",
  has_side_effects=True,
  is_align_stack=False,
  asm_dialect=llvm.AsmDialect.AD_ATT,
@@ -195,7 +182,7 @@ def row_reduce(
  val = warp_reduce(
  val,
  warp_op,
- width=min_constexpr(threads_per_row, cute.arch.WARP_SIZE),
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
  )
  if cutlass.const_expr(hook_fn is not None):
  hook_fn()
@@ -225,7 +212,7 @@ def online_softmax_reduce(
  max_x = warp_reduce(
  x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
  cute.arch.fmax,
- width=min_constexpr(threads_per_row, cute.arch.WARP_SIZE),
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
  )
  log2_e = math.log2(math.e)
  exp_x = exp2f(x * log2_e - (max_x * log2_e))
@@ -233,7 +220,7 @@ def online_softmax_reduce(
  sum_exp_x = warp_reduce(
  exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
  operator.add,
- width=min_constexpr(threads_per_row, cute.arch.WARP_SIZE),
+ width=min(threads_per_row, cute.arch.WARP_SIZE),
  )
  if cutlass.const_expr(hook_fn is not None):
  hook_fn()
@@ -299,18 +286,18 @@ def online_softmax_reduce(
  return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)


+ @cute.jit
  def exp2f(x: cute.TensorSSA | Float32) -> cute.TensorSSA | Float32:
  """exp2f calculation for both vector and scalar.
-
  :param x: input value
  :type x: cute.TensorSSA or Float32
  :return: exp2 value
  :rtype: cute.TensorSSA or Float32
  """
- if isinstance(x, cute.TensorSSA):
+ if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
  res = cute.make_fragment(x.shape, Float32)
  res.store(x)
- for i in range(cute.size(x.shape)):
+ for i in cutlass.range_constexpr(cute.size(x.shape)):
  res[i] = cute.arch.exp2(res[i])
  return res.load()
  else:
@@ -347,6 +334,7 @@ def rsqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
  )


+ @cute.jit
  def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
  # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
  tApA = cute.make_fragment(
@@ -356,8 +344,8 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
  ),
  cutlass.Boolean,
  )
- for rest_v in range(tApA.shape[0]):
- for rest_k in range(tApA.shape[2]):
+ for rest_v in cutlass.range_constexpr(tApA.shape[0]):
+ for rest_k in cutlass.range_constexpr(tApA.shape[2]):
  tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
  return tApA

@@ -373,8 +361,8 @@ def fill_oob(tXsX: cute.Tensor, tXpX: cute.Tensor, fill_value: cute.Numeric) ->
  """
  tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), 0, 0])
  tXrX_fill.fill(fill_value)
- for rest_v in range(tXpX.shape[0]):
- for rest_k in range(tXpX.shape[2]):
+ for rest_v in cutlass.range_constexpr(tXpX.shape[0]):
+ for rest_k in cutlass.range_constexpr(tXpX.shape[2]):
  if not tXpX[rest_v, 0, rest_k]:
  cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])

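Side note (a sketch, not part of the package): exp2f above is always fed values pre-scaled by log2(e), relying on the identity exp(v) = 2^(v * log2 e). A quick plain-PyTorch check of that identity:

import math
import torch

v = torch.randn(8)
m = v.max()
log2_e = math.log2(math.e)
# exp(v - m) computed via exp2, exactly as the softmax/cross-entropy kernels do
print(torch.allclose(torch.exp(v - m), torch.exp2((v - m) * log2_e)))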
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.1.3
+ Version: 0.1.5
  Requires-Python: >=3.9
  License-File: LICENSE
- Requires-Dist: nvidia-cutlass-dsl==4.0.0
+ Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0
  Requires-Dist: torch
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == "dev"
@@ -0,0 +1,11 @@
+ quack/__init__.py,sha256=GPoImcynY5-OkMep5RhQhXrnZyxgqZG3RoHhsYQFSL4,203
+ quack/cross_entropy.py,sha256=WkngPY8uk4RCjCFtHtB7h9GF_8xt4NnyvDzvw73gIL4,19320
+ quack/reduction_base.py,sha256=fFuGXPR3lDq2yw_m86ujmkni6R51jzNAzy_r9R6C8tA,3563
+ quack/rmsnorm.py,sha256=N9NavrR85ws4cZgkfpeRLjYkVSq2yfyzJQWvfKf98pY,23935
+ quack/softmax.py,sha256=VfhlC2huRuv7olFSVFgS8LF1yF8TFV64yjjjQxYX9yk,16364
+ quack/utils.py,sha256=6EyWgf0z3wcbhGUivHmWB8hVBnEzMyOhmAuZ2Te82k0,15226
+ quack_kernels-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ quack_kernels-0.1.5.dist-info/METADATA,sha256=WI-2CP1mRH05V9Fjdx7HsErNOkrc6fUhheoH4ynlo-U,289
+ quack_kernels-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ quack_kernels-0.1.5.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+ quack_kernels-0.1.5.dist-info/RECORD,,
@@ -1,11 +0,0 @@
- quack/__init__.py,sha256=aUR7drzgaqmbzw9H_eoFselMUVQVF3BHc9VOzZg5d-Q,203
- quack/cross_entropy.py,sha256=_Xlyifd_YS8LaYxYlZEsuBfsi8zTH4At3i9DDggGCf8,9319
- quack/reduction_base.py,sha256=nrRsXwTpLVQkPp2Gr_FgHRPnifqkMHRodve5ciHzx58,3667
- quack/rmsnorm.py,sha256=YqGTTKHHXYzw3xnnjBRfaN9TDlhG8D_fSI9CHKAU40A,10548
- quack/softmax.py,sha256=mWaUfaY6PBtO1ioYxXxS-yodQmcBNGasWVMUg9G066Y,15938
- quack/utils.py,sha256=1-HMcFTEvGdAtqC3ucQGZ3DLa_PoJQsqwYlKd9bcXO8,15347
- quack_kernels-0.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- quack_kernels-0.1.3.dist-info/METADATA,sha256=DDuEKHLjFx9dFTQV5YtXsnKVFZVoueO7NwhcwOtpw6g,284
- quack_kernels-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- quack_kernels-0.1.3.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
- quack_kernels-0.1.3.dist-info/RECORD,,