quack-kernels 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/softmax.py CHANGED
@@ -1,169 +1,191 @@
  import math
  import torch
- import operator
- from typing import Callable
+ from typing import Type

  import cuda.bindings.driver as cuda

  import cutlass
  import cutlass.cute as cute
  from cutlass.cute.runtime import from_dlpack
- import cutlass.torch as cutlass_torch

  import quack.utils as utils
+ from quack.reduction_base import ReductionBase, torch2cute_dtype_map


- @cute.kernel
- def softmax_kernel(
-     mX: cute.Tensor,
-     mO: cute.Tensor,
-     tv_layout: cute.Layout,
-     tiler_mn: cute.Shape,
-     cluster_n: cutlass.Constexpr = 1,
- ):
-     tidx, _, _ = cute.arch.thread_idx()
-     bidx, cluster_y, _ = cute.arch.block_idx()
-
-     shape = mX.shape
-     idX = cute.make_identity_tensor(shape)
-     # slice for CTAs
-     gX, gO, cX = [
-         cute.local_tile(mT, tiler_mn, (bidx, 0 if cluster_n == 1 else cluster_y))
-         for mT in (mX, mO, idX)
-     ]
-
-     smem = cutlass.utils.SmemAllocator()
-     sX = smem.allocate_tensor(mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16)
-     num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
-     warps_per_row = utils.max_constexpr(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
-     reduction_buffer_layout = cute.make_ordered_layout(
+ class Softmax(ReductionBase):
+     def __init__(self, dtype: Type[cutlass.Numeric], N: int, online_softmax: bool = True):
          # 2 stages: 1 for max, 1 for sum
-         (num_warps // warps_per_row, (warps_per_row, cluster_n), 2),
-         order=(1, 0, 2)
-     )
-     reduction_buffer = smem.allocate_tensor(cutlass.Float32, reduction_buffer_layout, byte_alignment=4)
-     if cutlass.const_expr(cluster_n > 1):
-         # 1 mbar for max reduction, 1 mbar for sum reduction
-         mbar_ptr = smem.allocate_array(cutlass.Int64, num_elems=2)
-     else:
-         mbar_ptr = None
-
-     # declare the atoms which will be used later for memory copy
-     copy_atom_load_X = cute.make_copy_atom(cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128)
-     copy_atom_store_O = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gO.element_type, num_bits_per_copy=128)
-
-     thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
-     thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
-
-     tXgX = thr_copy_X.partition_S(gX)
-     tXsX = thr_copy_X.partition_D(sX)
-     tXgO = thr_copy_O.partition_D(gO)
-     tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
-
-     # allocate fragments for gmem->rmem
-     tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
-
-     if cluster_n > 1:
-         if tidx < 2:
-             cute.arch.mbarrier_init_arrive_cnt(mbar_ptr + tidx, 1)
-         cute.arch.mbarrier_init_fence()
-         if tidx < 2:
-             cute.arch.mbarrier_init_tx_bytes(mbar_ptr + tidx, num_warps * cluster_n * cutlass.Float32.width // 8)
-         # Cluster arrive after barrier init
-         cute.arch.cluster_arrive_relaxed()
-
-     is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * cluster_n)
-     tXpX = utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
-     if tXcX[0][0] < shape[0]:
-         cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
-     cute.arch.cp_async_commit_group()
-     cute.arch.cp_async_wait_group(0)
-
-     cute.autovec_copy(tXsX, tXrX)
-     x = tXrX.load().to(cute.Float32)
-     # Fill OOB values with -inf
-     if cutlass.const_expr(not is_even_N):
-         tXrX_fp32 = cute.make_fragment_like(tXrX, cutlass.Float32)
-         tXrX_fp32.store(x)
-         for rest_v in range(tXpX.shape[0]):
-             for rest_k in range(tXpX.shape[2]):
-                 if not tXpX[rest_v, 0, rest_k]:
-                     tXrX_fp32[(None, rest_v), None, rest_k].fill(-cutlass.Float32.inf)
-         x = tXrX_fp32.load()
-     threads_per_row = tv_layout.shape[0][0]
-     max_x = utils.row_reduce(
-         x,
-         cute.ReductionOp.MAX,
-         threads_per_row,
-         reduction_buffer[None, None, 0],
-         mbar_ptr + 0 if cluster_n > 1 else None,
-         init_val=-cutlass.Float32.inf,
-         hook_fn=cute.arch.cluster_wait if cutlass.const_expr(cluster_n > 1) else None
-     )
-     log2_e = math.log2(math.e)
-     exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
-     denom = utils.row_reduce(
-         exp_x,
-         cute.ReductionOp.ADD,
-         threads_per_row,
-         reduction_buffer[None, None, 1],
-         mbar_ptr + 1 if cluster_n > 1 else None,
-         init_val=0.0,
-     )
-     inv = 1.0 / denom
-     y = exp_x * inv
-     tXrO.store(y.to(tXrO.element_type))
-     tOpO = utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1]) if not is_even_N else None
-     if tXcX[0][0] < shape[0]:
-         cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
-
-
- @cute.jit
- def softmax_interface(
-     mX: cute.Tensor,
-     mO: cute.Tensor,
-     stream: cuda.CUstream,
-     N: cutlass.Constexpr,
-     copy_bits: cutlass.Constexpr = 128
- ):
-     vecsize = copy_bits // mX.element_type.width
-     assert N % vecsize == 0, f"Input N {N} is not divisible by vector size {vecsize}"
-     num_threads = 128 if N <= 16384 else 256
-     num_warps = num_threads // cute.arch.WARP_SIZE
-     assert num_threads % cute.arch.WARP_SIZE == 0
-     threads_per_row = 8 if N <= 64 else (16 if N <= 128 else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256))))
-     if cutlass.const_expr(mX.element_type.width == 16):
-         cluster_n = 1 if N <= 16 * 1024 else (2 if N <= 32 * 1024 else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16)))
-     else:  # fp32
-         cluster_n = 1 if N <= 32 * 1024 else (2 if N <= 64 * 1024 else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16)))
-
-     num_blocks_N = cute.ceil_div(N // vecsize, threads_per_row * cluster_n)
-     cols_per_block = num_threads // threads_per_row
-     tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)  # This rounds up N
-     tv_layout = cute.make_layout(
-         ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
-         stride=((vecsize * cols_per_block, 1), (cols_per_block, cols_per_block * vecsize * threads_per_row))
-     )
+         super().__init__(
+             dtype,
+             N,
+             stage=2 if not online_softmax else 1,
+             reduction_dtype=cutlass.Float32 if not online_softmax else cutlass.Int64,
+         )
+         self.online_softmax = online_softmax

-     smem_allocated = cute.size_in_bytes(mX.element_type, cute.make_layout(tiler_mn)) + 2 * num_warps * cluster_n * (cutlass.Float32.width // 8) + 2 * (cutlass.Int64.width // 8)
-     softmax_kernel(mX, mO, tv_layout, tiler_mn, cluster_n).launch(
-         grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), cluster_n, 1],
-         block=[cute.size(tv_layout, mode=[0]), 1, 1],
-         # Launching with cluster=[1, 1, 1] instead of None slows down the kernel by ~8us
-         cluster=[1, cluster_n, 1] if cluster_n > 1 else None,
-         smem=smem_allocated,
-         stream=stream,
-     )
+     def _calculate_threads_per_row(self):
+         N = self.N
+         return (
+             8
+             if N <= 64
+             else (
+                 16
+                 if N <= 128
+                 else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+             )
+         )

+     def _set_cluster_n(self):
+         N = self.N
+         if cutlass.const_expr(self.dtype.width == 16):
+             cluster_n = (
+                 1
+                 if N <= 16 * 1024
+                 else (
+                     2
+                     if N <= 32 * 1024
+                     else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+                 )
+             )
+         else:  # fp32
+             cluster_n = (
+                 1
+                 if N <= 32 * 1024
+                 else (
+                     2
+                     if N <= 64 * 1024
+                     else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+                 )
+             )
+         self.cluster_n = cluster_n

- torch2cute_dtype_map = {
-     torch.float16: cutlass.Float16,
-     torch.bfloat16: cutlass.BFloat16,
-     torch.float32: cutlass.Float32,
- }
+     @cute.jit
+     def __call__(
+         self,
+         mX: cute.Tensor,
+         mO: cute.Tensor,
+         stream: cuda.CUstream,
+     ):
+         assert mX.element_type == self.dtype
+         assert mO.element_type == self.dtype
+         self._set_cluster_n()
+         tiler_mn, tv_layout = self._get_tv_layout()
+         num_threads = cute.size(tv_layout, mode=[0])
+         num_warps = num_threads // cute.arch.WARP_SIZE
+         self.kernel(mX, mO, tv_layout, tiler_mn).launch(
+             grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+             block=[num_threads, 1, 1],
+             cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
+             smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+             stream=stream,
+         )

+     @cute.kernel
+     def kernel(
+         self,
+         mX: cute.Tensor,
+         mO: cute.Tensor,
+         tv_layout: cute.Layout,
+         tiler_mn: cute.Shape,
+     ):
+         tidx, _, _ = cute.arch.thread_idx()
+         bidx, _, _ = cute.arch.block_idx()
+         if cutlass.const_expr(self.cluster_n > 1):
+             cluster_y = cute.arch.block_idx()[1]
+         else:
+             cluster_y = cutlass.const_expr(0)

- def softmax(x: torch.Tensor) -> torch.Tensor:
+         shape = mX.shape
+         idX = cute.make_identity_tensor(shape)
+         # slice for CTAs
+         gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
+
+         smem = cutlass.utils.SmemAllocator()
+         sX = smem.allocate_tensor(
+             mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+         )
+         reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+         # declare the atoms which will be used later for memory copy
+         copy_atom_load_X = cute.make_copy_atom(
+             cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128
+         )
+         copy_atom_store_O = cute.make_copy_atom(
+             cute.nvgpu.CopyUniversalOp(), gO.element_type, num_bits_per_copy=128
+         )
+
+         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+         thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
+
+         tXgX = thr_copy_X.partition_S(gX)
+         tXsX = thr_copy_X.partition_D(sX)
+         tXgO = thr_copy_O.partition_D(gO)
+         tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+
+         # allocate fragments for gmem->rmem
+         tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+
+         num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+         self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+         is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+         tXpX = (
+             utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+             if cutlass.const_expr(not is_even_N)
+             else None
+         )
+         if tXcX[0][0] < shape[0]:
+             cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
+         cute.arch.cp_async_commit_group()
+         cute.arch.cp_async_wait_group(0)
+         # Fill OOB values with -inf
+         if cutlass.const_expr(not is_even_N):
+             utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+
+         cute.autovec_copy(tXsX, tXrX)
+         x = tXrX.load().to(cute.Float32)
+         threads_per_row = tv_layout.shape[0][0]
+         if cutlass.const_expr(not self.online_softmax):
+             max_x = utils.row_reduce(
+                 x,
+                 cute.ReductionOp.MAX,
+                 threads_per_row,
+                 reduction_buffer[None, None, 0],
+                 mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
+                 init_val=-cutlass.Float32.inf,
+                 hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+             )
+             log2_e = math.log2(math.e)
+             exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
+             denom = utils.row_reduce(
+                 exp_x,
+                 cute.ReductionOp.ADD,
+                 threads_per_row,
+                 reduction_buffer[None, None, 1],
+                 mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
+                 init_val=0.0,
+             )
+         else:
+             max_x, denom, exp_x = utils.online_softmax_reduce(
+                 x,
+                 threads_per_row,
+                 reduction_buffer[None, None, 0],
+                 mbar_ptr,
+                 hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+                 return_exp_x=True,
+             )
+         y = exp_x * (1.0 / denom)
+         tXrO.store(y.to(tXrO.element_type))
+         tOpO = (
+             utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
+             if cutlass.const_expr(not is_even_N)
+             else None
+         )
+         if tXcX[0][0] < shape[0]:
+             cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+
+
+ def _softmax_fwd(x: torch.Tensor) -> torch.Tensor:
      """Softmax forward pass.
      Args:
          x: Input tensor of shape (M, N)
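Note on the `online_softmax=True` default of the new `Softmax` class: instead of one reduction pass for the row max and a second for the sum of exponentials, `utils.online_softmax_reduce` maintains both in a single pass (which is why the class switches to `stage=1` and an `Int64` reduction buffer, presumably so the running max/sum pair fits in one reduction slot). That helper is not part of this file; the sketch below is only a plain-Python illustration of the standard single-pass recurrence it is based on, not the quack implementation.

# Illustrative only: a scalar rendition of the single-pass ("online") softmax
# recurrence. The real utils.online_softmax_reduce operates on CuTe fragments
# across threads/CTAs and is not shown in this diff.
import math

def online_softmax_row(row):
    """One pass over the row keeps a running max and a rescaled running sum."""
    running_max = float("-inf")
    running_sum = 0.0
    for v in row:
        new_max = max(running_max, v)
        # Rescale the partial sum whenever the running max increases.
        running_sum = running_sum * math.exp(running_max - new_max) + math.exp(v - new_max)
        running_max = new_max
    return [math.exp(v - running_max) / running_sum for v in row]

assert abs(sum(online_softmax_row([1.0, 2.0, 3.0])) - 1.0) < 1e-9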
@@ -174,22 +196,261 @@ def softmax(x: torch.Tensor) -> torch.Tensor:
      assert x.is_cuda, "Tensor must be on CUDA device"
      assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
      M, N = x.shape
-     device = x.device
      out = torch.empty_like(x)
      dtype = torch2cute_dtype_map[x.dtype]
      convert_from_dlpack = lambda tensor: (
-         from_dlpack(tensor.detach(), assumed_align=16)
-         .mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+         from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
+             mode=0, stride_order=(0, 1)
+         )
      )
      x_tensor, out_tensor = [convert_from_dlpack(tensor) for tensor in (x, out)]
      current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
      compile_key = (dtype, N)
-     if compile_key not in softmax.compile_cache:
-         softmax.compile_cache[compile_key] = cute.compile(
-             softmax_interface, x_tensor, out_tensor, current_stream, N
+     if compile_key not in _softmax_fwd.compile_cache:
+         softmax_op = Softmax(dtype, N)
+         _softmax_fwd.compile_cache[compile_key] = cute.compile(
+             softmax_op, x_tensor, out_tensor, current_stream
          )
-     softmax.compile_cache[compile_key](x_tensor, out_tensor, current_stream)
+     _softmax_fwd.compile_cache[compile_key](x_tensor, out_tensor, current_stream)
      return out


- softmax.compile_cache = {}
+ _softmax_fwd.compile_cache = {}
+
+
+ class SoftmaxBackward(ReductionBase):
+     def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+         # 1 stage for computing dot product
+         super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Float32)
+
+     def _calculate_threads_per_row(self):
+         N = self.N
+         return (
+             8
+             if N <= 64
+             else (
+                 16
+                 if N <= 128
+                 else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 8192 else 256)))
+             )
+         )
+
+     def _set_cluster_n(self):
+         N = self.N
+         if cutlass.const_expr(self.dtype.width == 16):
+             cluster_n = (
+                 1
+                 if N <= 16 * 1024
+                 else (
+                     2
+                     if N <= 32 * 1024
+                     else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+                 )
+             )
+         else:  # fp32
+             cluster_n = (
+                 1
+                 if N <= 16 * 1024
+                 else (
+                     2
+                     if N <= 32 * 1024
+                     else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+                 )
+             )
+         self.cluster_n = cluster_n
+
+     def _get_num_threads(self):
+         return 128 if self.N <= 8192 else 256
+
+     def _smem_size_in_bytes(self, tiler_mn, num_warps):
+         return (
+             # Multiply by 2 since we need space for Y and dY
+             cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
+             + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+             + self.stage * (cutlass.Int64.width // 8)
+         )
+
+     @cute.jit
+     def __call__(
+         self,
+         mdY: cute.Tensor,
+         mY: cute.Tensor,
+         mdX: cute.Tensor,
+         stream: cuda.CUstream,
+     ):
+         assert mdY.element_type == self.dtype
+         assert mY.element_type == self.dtype
+         assert mdX.element_type == self.dtype
+         self._set_cluster_n()
+         tiler_mn, tv_layout = self._get_tv_layout()
+         num_threads = cute.size(tv_layout, mode=[0])
+         num_warps = num_threads // cute.arch.WARP_SIZE
+         self.kernel(mdY, mY, mdX, tv_layout, tiler_mn).launch(
+             grid=[cute.ceil_div(mdY.shape[0], tiler_mn[0]), self.cluster_n, 1],
+             block=[num_threads, 1, 1],
+             cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
+             smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+             stream=stream,
+         )
+
+     @cute.kernel
+     def kernel(
+         self,
+         mdY: cute.Tensor,
+         mY: cute.Tensor,
+         mdX: cute.Tensor,
+         tv_layout: cute.Layout,
+         tiler_mn: cute.Shape,
+     ):
+         tidx, _, _ = cute.arch.thread_idx()
+         bidx, _, _ = cute.arch.block_idx()
+         if cutlass.const_expr(self.cluster_n > 1):
+             cluster_y = cute.arch.block_idx()[1]
+         else:
+             cluster_y = cutlass.const_expr(0)
+
+         shape = mdY.shape
+         idX = cute.make_identity_tensor(shape)
+         # slice for CTAs
+         gdY, gY, gdX, cX = [
+             cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mdY, mY, mdX, idX)
+         ]
+
+         smem = cutlass.utils.SmemAllocator()
+         sdY = smem.allocate_tensor(
+             mdY.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+         )
+         sY = smem.allocate_tensor(
+             mY.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+         )
+         reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+         # declare the atoms which will be used later for memory copy
+         copy_atom_load = cute.make_copy_atom(
+             cute.nvgpu.cpasync.CopyG2SOp(), mdY.element_type, num_bits_per_copy=128
+         )
+         copy_atom_store = cute.make_copy_atom(
+             cute.nvgpu.CopyUniversalOp(), gdX.element_type, num_bits_per_copy=128
+         )
+
+         thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
+         thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+
+         tdYgdY = thr_copy_load.partition_S(gdY)
+         tdYsdY = thr_copy_load.partition_D(sdY)
+         tYgY = thr_copy_load.partition_S(gY)
+         tYsY = thr_copy_load.partition_D(sY)
+         tdXgdX = thr_copy_store.partition_D(gdX)
+         tXcX = thr_copy_load.partition_S(cX)[(0, None), None, None]
+
+         # allocate fragments for gmem->rmem
+         tdYrdY, tYrY, tdXrdX = [cute.make_fragment_like(thr) for thr in (tdYgdY, tYgY, tdXgdX)]
+
+         num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+         self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+         is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+         tdYpdY = (
+             utils.predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
+             if cutlass.const_expr(not is_even_N)
+             else None
+         )
+
+         if tXcX[0][0] < shape[0]:
+             cute.copy(copy_atom_load, tdYgdY, tdYsdY, pred=tdYpdY)
+             cute.copy(copy_atom_load, tYgY, tYsY, pred=tdYpdY)
+         cute.arch.cp_async_commit_group()
+         cute.arch.cp_async_wait_group(0)
+
+         cute.autovec_copy(tdYsdY, tdYrdY)
+         cute.autovec_copy(tYsY, tYrY)
+         dy = tdYrdY.load().to(cute.Float32)
+         y = tYrY.load().to(cute.Float32)
+
+         # Compute dot product: dot = Σⱼ dy_j × y_j
+         threads_per_row = tv_layout.shape[0][0]
+         dot = utils.row_reduce(
+             dy * y,
+             cute.ReductionOp.ADD,
+             threads_per_row,
+             reduction_buffer[None, None, 0],
+             mbar_ptr if cutlass.const_expr(self.cluster_n > 1) else None,
+             init_val=0.0,
+             hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+         )
+
+         # Compute gradient: dx_i = y_i × (dy_i - dot)
+         dx = y * (dy - dot)
+         tdXrdX.store(dx.to(tdXrdX.element_type))
+         tdXpdX = (
+             utils.predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
+             if cutlass.const_expr(not is_even_N)
+             else None
+         )
+         if tXcX[0][0] < shape[0]:
+             cute.copy(copy_atom_store, tdXrdX, tdXgdX, pred=tdXpdX)
+
+
+ def _softmax_backward(dy: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+     """Softmax backward pass.
+     Args:
+         dy: Upstream gradients tensor of shape (M, N)
+         y: Softmax output tensor of shape (M, N)
+     Returns:
+         Input gradients tensor of same shape as dy and y
+     """
+     assert dy.dim() == 2, "dy must be 2D"
+     assert y.dim() == 2, "y must be 2D"
+     assert dy.shape == y.shape, "dy and y must have same shape"
+     assert dy.is_cuda and y.is_cuda, "Tensors must be on CUDA device"
+     assert dy.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
+     assert y.dtype == dy.dtype, "dy and y must have same dtype"
+
+     M, N = dy.shape
+     dx = torch.empty_like(dy)
+     dtype = torch2cute_dtype_map[dy.dtype]
+     convert_from_dlpack = lambda tensor: (
+         from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
+             mode=0, stride_order=(0, 1)
+         )
+     )
+     dy_tensor, y_tensor, dx_tensor = [convert_from_dlpack(tensor) for tensor in (dy, y, dx)]
+     current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+     compile_key = (dtype, N)
+     if compile_key not in _softmax_backward.compile_cache:
+         softmax_backward_op = SoftmaxBackward(dtype, N)
+         _softmax_backward.compile_cache[compile_key] = cute.compile(
+             softmax_backward_op, dy_tensor, y_tensor, dx_tensor, current_stream
+         )
+     _softmax_backward.compile_cache[compile_key](dy_tensor, y_tensor, dx_tensor, current_stream)
+     return dx
+
+
+ _softmax_backward.compile_cache = {}
+
+
+ class SoftmaxFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, x):
+         y = _softmax_fwd(x)
+         ctx.save_for_backward(y)
+         return y
+
+     @staticmethod
+     def backward(ctx, dy):
+         (y,) = ctx.saved_tensors
+         dx = _softmax_backward(dy, y)
+         return dx
+
+
+ def softmax(x: torch.Tensor) -> torch.Tensor:
+     """Softmax forward pass with automatic differentiation support.
+
+     Args:
+         x: Input tensor of shape (M, N)
+
+     Returns:
+         Softmax output tensor of same shape as x
+     """
+     return SoftmaxFunction.apply(x)
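`SoftmaxFunction` wires the forward and backward kernels into torch.autograd; the backward kernel computes the standard softmax Jacobian-vector product, dx_i = y_i * (dy_i - Σⱼ dy_j y_j), per row, matching the comments in `SoftmaxBackward.kernel`. A rough end-to-end usage sketch follows; it assumes a CUDA device and that the module is importable as `quack.softmax`, and the tolerances are illustrative rather than part of the package.

# Usage sketch only: not part of the released package.
import torch
from quack.softmax import softmax

x = torch.randn(4, 8192, device="cuda", dtype=torch.bfloat16, requires_grad=True)
g = torch.randn(4, 8192, device="cuda", dtype=torch.bfloat16)

y = softmax(x)   # forward: Softmax via _softmax_fwd
y.backward(g)    # backward: SoftmaxBackward via SoftmaxFunction.backward

# Cross-check against the PyTorch reference in float32.
x_ref = x.detach().float().requires_grad_()
y_ref = torch.softmax(x_ref, dim=-1)
y_ref.backward(g.float())

print(torch.allclose(y.float(), y_ref, atol=1e-2),
      torch.allclose(x.grad.float(), x_ref.grad, atol=1e-2))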