PyPI - quack-kernels - Versions diffs - 0.1.11__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

quack-kernels 0.1.11py3-none-any.whl → 0.2.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

quack/__init__.py +7 -3
quack/activation.py +279 -0
quack/autotuner.py +2 -1
quack/cross_entropy.py +330 -184
quack/cute_dsl_utils.py +83 -4
quack/dense_gemm_sm100.py +1 -1
quack/dense_gemm_sm90.py +911 -1140
quack/fast_math.py +10 -27
quack/gemm_act_sm90.py +368 -0
quack/gemm_config.py +43 -35
quack/gemm_dact_sm90.py +150 -0
quack/gemm_interface.py +491 -243
quack/gemm_wrapper_utils.py +158 -0
quack/layernorm.py +6 -4
quack/linear.py +128 -64
quack/linear_cross_entropy.py +275 -0
quack/mlp.py +30 -160
quack/pipeline.py +2 -17
quack/reduce.py +240 -0
quack/reduction_base.py +2 -11
quack/rmsnorm.py +614 -228
quack/softmax.py +28 -16
quack/symmetric_dense_gemm_sm90.py +6 -3
quack/tensormap_manager.py +1 -0
quack/tile_scheduler.py +64 -61
quack/topk.py +14 -8
quack/utils.py +14 -322
quack/varlen_utils.py +22 -0
{quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/METADATA +3 -3
quack_kernels-0.2.1.dist-info/RECORD +37 -0
quack/lse.py +0 -62
quack_kernels-0.1.11.dist-info/RECORD +0 -31
{quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/WHEEL +0 -0
{quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.1.11.dist-info → quack_kernels-0.2.1.dist-info}/top_level.txt +0 -0

quack/cross_entropy.py CHANGED Viewed

@@ -1,17 +1,22 @@
 # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
 import math
-from typing import Optional, Type
+from typing import Optional, Type, Literal
+import torch
+from torch import Tensor
 import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Float32, Boolean, const_expr
+from cutlass.cute.runtime import from_dlpack
 import quack.utils as utils
-import torch
-from cutlass.cute.runtime import from_dlpack
-from quack.reduction_base import ReductionBase, torch2cute_dtype_map
+from quack.reduce import row_reduce, online_softmax_reduce
+from quack.reduction_base import ReductionBase
+from quack.cute_dsl_utils import torch2cute_dtype_map
 class CrossEntropy(ReductionBase):
@@ -21,7 +26,7 @@ class CrossEntropy(ReductionBase):
             dtype,
             N,
             stage=2 if not online_softmax else 1,
-            reduction_dtype=cutlass.Float32 if not online_softmax else cutlass.Int64,
+            reduction_dtype=Float32 if not online_softmax else cutlass.Int64,
         )
         self.online_softmax = online_softmax
         self.reload_from = None if N <= 16384 or online_softmax else "smem"
@@ -40,7 +45,7 @@ class CrossEntropy(ReductionBase):
     def _set_cluster_n(self):
         N = self.N
-        if cutlass.const_expr(self.dtype.width == 16):
+        if const_expr(self.dtype.width == 16):
             cluster_n = (
                 1
                 if N <= 16 * 1024
@@ -65,21 +70,30 @@ class CrossEntropy(ReductionBase):
     @cute.jit
     def __call__(
         self,
-        mX: cute.Tensor,
-        mTarget: cute.Tensor,
-        mLoss: cute.Tensor,
-        mLSE: Optional[cute.Tensor],
+        mX: cute.Tensor,  # (M, N)
+        mTarget: cute.Tensor,  # (M,)
+        mTargetLogit: Optional[cute.Tensor],  # (M, K) or (M,). If None, we use mX
+        mLoss: cute.Tensor,  # (M,)
+        mLSE: Optional[cute.Tensor],  # (M,)
+        mdX: Optional[cute.Tensor],  # (M, N) - if provided, compute gradient
+        ignore_index: Int32,  # Index to ignore in loss computation
         stream: cuda.CUstream,
     ):
         assert mX.element_type == self.dtype
+        if const_expr(mTargetLogit is None):
+            mTargetLogit = mX
         self._set_cluster_n()
-        tiler_mn, tv_layout = self._get_tv_layout()
+        # e.g. if self.N isn't divisible by 8 for bf16, we might use 64 bits (4 elements) copy
+        num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+        tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
         num_threads = cute.size(tv_layout, mode=[0])
         num_warps = num_threads // cute.arch.WARP_SIZE
-        self.kernel(mX, mTarget, mLoss, mLSE, tv_layout, tiler_mn).launch(
+        self.kernel(
+            mX, mTarget, mTargetLogit, mLoss, mLSE, mdX, ignore_index, tv_layout, tiler_mn
+        ).launch(
             grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
             block=[num_threads, 1, 1],
-            cluster=([1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None),
+            cluster=([1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None),
             smem=self._smem_size_in_bytes(tiler_mn, num_warps),
             stream=stream,
         )
@@ -89,17 +103,20 @@ class CrossEntropy(ReductionBase):
         self,
         mX: cute.Tensor,  # (M, N)
         mTarget: cute.Tensor,  # (M,)
+        mTargetLogit: cute.Tensor,  # (M, K) or (M,)
         mLoss: cute.Tensor,  # (M,)
         mLSE: Optional[cute.Tensor],  # (M,)
+        mdX: Optional[cute.Tensor],  # (M, N) - if provided, compute gradient
+        ignore_index: Int32,  # Index to ignore in loss computation
         tv_layout: cute.Layout,
         tiler_mn: cute.Shape,
     ):
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
-        if cutlass.const_expr(self.cluster_n > 1):
+        if const_expr(self.cluster_n > 1):
             cluster_y = cute.arch.block_idx()[1]
         else:
-            cluster_y = cutlass.const_expr(0)
+            cluster_y = const_expr(0)
         shape: cute.Shape = mX.shape
         idX = cute.make_identity_tensor(shape)
@@ -118,12 +135,14 @@ class CrossEntropy(ReductionBase):
         reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
         # declare the atoms which will be used later for memory copy
+        num_copy_elems_X = tv_layout.shape[1][0]
+        num_copy_bits_X = mX.element_type.width * num_copy_elems_X
         copy_atom_load_X = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), gX.element_type, num_bits_per_copy=128
+            cute.nvgpu.cpasync.CopyG2SOp(), gX.element_type, num_bits_per_copy=num_copy_bits_X
         )
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
-        #### Thread View
+        #### Partition to get thread view
         tXgX = thr_copy_X.partition_S(gX)
         tXsX = thr_copy_X.partition_D(sX)
         tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
@@ -133,14 +152,14 @@ class CrossEntropy(ReductionBase):
         self._initialize_cluster(tidx, mbar_ptr, num_warps)
         row = tXcX[0][0]
-        target = cute.Int32.zero
-        if row < shape[0] and tXcX[0][1] == 0:
-            target = cute.Int32(mTarget[row])
+        target = Int32.zero
+        if row < shape[0]:
+            target = Int32(mTarget[row])
-        is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+        is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
         tXpX = (
             utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
-            if cutlass.const_expr(not is_even_N)
+            if const_expr(not is_even_N)
             else None
         )
         if row < shape[0]:
@@ -148,99 +167,148 @@ class CrossEntropy(ReductionBase):
         cute.arch.cp_async_commit_group()
         cute.arch.cp_async_wait_group(0)
         # Fill OOB values with -inf
-        if cutlass.const_expr(not is_even_N):
+        if const_expr(not is_even_N):
             utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
         cute.autovec_copy(tXsX, tXrX)
-        x = tXrX.load().to(cute.Float32)
-        target_logit = cute.Float32.zero
-        if row < shape[0] and tXcX[0][1] == 0:
-            # Use Int64 for indexing to deal with large tensors
-            mX_off = utils.domain_offset_i64((row, 0), mX)
-            target_logit = cute.Float32(mX_off[0, target])
+        x = tXrX.load().to(Float32)
+        target_logit = Float32.zero
+        should_ignore = Boolean(target == ignore_index)
+        if row < shape[0] and tXcX[0][1] == 0 and not should_ignore:
+            # Only load target logit if not ignoring this index
+            if const_expr(cute.rank(mTargetLogit.shape) == 2):
+                # Use Int64 for indexing to deal with large tensors
+                mTargetLogit_off = utils.domain_offset_i64((row, 0), mTargetLogit)
+                target_logit = Float32(mTargetLogit_off[0, target])
+            else:
+                assert cute.rank(mTargetLogit.shape) == 1
+                target_logit = Float32(mTargetLogit[row])
         threads_per_row = tv_layout.shape[0][0]
-        if cutlass.const_expr(not self.online_softmax):
-            max_x = utils.row_reduce(
+        if const_expr(not self.online_softmax):
+            max_x = row_reduce(
                 x,
                 cute.ReductionOp.MAX,
                 threads_per_row,
                 reduction_buffer[None, None, 0],
-                mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
-                init_val=-cutlass.Float32.inf,
-                hook_fn=(
-                    cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None
-                ),
+                mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
+                init_val=-Float32.inf,
+                hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
             )
-            if cutlass.const_expr(self.reload_from == "smem"):
+            if const_expr(self.reload_from == "smem"):
                 cute.autovec_copy(tXsX, tXrX)
-                x = tXrX.load().to(cute.Float32)
+                x = tXrX.load().to(Float32)
             log2_e = math.log2(math.e)
-            # exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
-            # a bit faster, probably because it's calling ex2.approx.ftz instead of ex2.approx?
-            # exp_x = utils.exp2f((x - max_x) * log2_e)
             # This would use ffma instead of fadd then fmul
-            exp_x = utils.exp2f(x * log2_e - (max_x * log2_e))
-            denom = utils.row_reduce(
+            exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=False)
+            denom = row_reduce(
                 exp_x,
                 cute.ReductionOp.ADD,
                 threads_per_row,
                 reduction_buffer[None, None, 1],
-                mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
+                mbar_ptr + 1 if const_expr(self.cluster_n > 1) else None,
                 init_val=0.0,
             )
         else:
-            max_x, denom, _ = utils.online_softmax_reduce(
+            max_x, denom, exp_x = online_softmax_reduce(
                 x,
                 threads_per_row,
                 reduction_buffer[None, None, 0],
                 mbar_ptr,
-                hook_fn=(
-                    cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None
-                ),
+                hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+                return_exp_x=const_expr(mdX is not None),
             )
+        # Write loss and lse to gmem
         if (
             tXcX[0][1] == 0
             and row < shape[0]
             and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
         ):
-            ln_2 = math.log(2.0)
-            lse = max_x + utils.log2f(denom) * ln_2
-            loss_val = lse - target_logit
-            mLoss[row] = loss_val.to(mLoss.element_type)
-            if cutlass.const_expr(mLSE is not None):
+            lse = max_x + cute.math.log(denom, fastmath=True)
+            # Set loss to 0 if this index should be ignored, otherwise compute normally
+            loss_val = (lse - target_logit) if not should_ignore else Float32.zero
+            mLoss[row] = mLoss.element_type(loss_val)
+            if const_expr(mLSE is not None):
                 mLSE[row] = lse
-def _cross_entropy(
-    x: torch.Tensor,
-    target: torch.Tensor,
-    return_lse: bool = False,
-) -> torch.Tensor:
+        # Compute gradient if mdX is provided
+        if const_expr(mdX is not None):
+            # Compute probabilities: exp(x) / sum(exp(x))
+            # If ignored, gradient should be zero
+            denom_inv = (
+                1.0 / denom
+                if not (denom == 0.0 or denom != denom or should_ignore)
+                else Float32.zero
+            )
+            probs = exp_x * denom_inv
+            mdX_off = utils.domain_offset_i64((bidx * tiler_mn[0], 0), mdX)
+            gdX = cute.local_tile(mdX_off, tiler_mn, (0, cluster_y))
+            # Setup copy atom for storing gradient
+            copy_atom_store = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(), mdX.element_type, num_bits_per_copy=num_copy_bits_X
+            )
+            thr_copy_dX = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
+            tXgdX = thr_copy_dX.partition_D(gdX)
+            tXrdX = cute.make_fragment_like(tXgdX)
+            tXcFull = thr_copy_X.partition_S(cX)
+            # Compute gradient: probs for all classes, (probs - 1) for target class
+            # If ignored, gradient is already zero
+            tXrdX_f32 = cute.make_fragment_like(tXrX, Float32)
+            tXrdX_f32.store(probs)
+            if not should_ignore:
+                for i in cutlass.range(cute.size(tXrX), unroll_full=True):
+                    tXrdX_f32[i] = tXrdX_f32[i] if tXcFull[i][1] != target else tXrdX_f32[i] - 1.0
+            tXrdX.store(tXrdX_f32.load().to(tXrdX.element_type))
+            tXpdX = (
+                utils.predicate_k(thr_copy_dX.partition_S(cX), limit=shape[1])
+                if not is_even_N
+                else None
+            )
+            if row < shape[0]:
+                cute.copy(copy_atom_store, tXrdX, tXgdX, pred=tXpdX)
+@torch.library.custom_op("quack::cross_entropy_fwd_out", mutates_args={"loss", "lse", "dx"})
+def cross_entropy_fwd_out(
+    x: Tensor,
+    target: Tensor,
+    target_logit: Optional[Tensor],
+    loss: Tensor,
+    lse: Optional[Tensor],
+    dx: Optional[Tensor],
+    ignore_index: int = -100,
+) -> None:
     """Cross entropy forward pass.
     Args:
         x: Input logits tensor of shape (M, N)
         target: Target class indices tensor of shape (M,)
+        target_logit: (M, K) or (M,).
+            If provided, the target logit will be read from this tensor instead of x.
+        loss: Output loss tensor of shape (M,)
+        lse: Optional output log-sum-exp tensor of shape (M,)
+        dx: Optional output gradient tensor of shape (M, N)
+        ignore_index: Index to ignore in loss computation
     Returns:
-        Cross entropy loss tensor of shape (M,)
+        None (mutates loss, lse, and optionally dx in-place)
     """
     assert x.dim() == 2, "Input must be 2D"
     assert target.dim() == 1, "Target must be 1D"
     assert x.shape[0] == target.shape[0], "Batch dimensions must match"
     assert x.is_cuda and target.is_cuda, "Tensors must be on CUDA device"
-    assert x.dtype in [
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ], "Unsupported input dtype"
+    assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
     assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
-    M, N = x.shape
-    device = x.device
-    loss = torch.empty(M, device=device, dtype=torch.float32)
-    lse = torch.empty(M, device=device, dtype=torch.float32) if return_lse else None
+    if target_logit is not None:
+        assert target_logit.shape[0] == x.shape[0]
+        assert target_logit.is_cuda, "Target logits must be on CUDA device"
+        assert target_logit.dtype in [torch.float16, torch.bfloat16, torch.float32]
+    if dx is not None:
+        assert dx.shape == x.shape, "dx must have same shape as x"
+        assert dx.is_cuda, "dx must be on CUDA device"
+        assert dx.dtype == x.dtype, "dx must have same dtype as x"
+    N = x.size(1)
     dtype = torch2cute_dtype_map[x.dtype]
     convert_from_dlpack = lambda tensor: (
         from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
@@ -248,28 +316,86 @@ def _cross_entropy(
         )
     )
     x_tensor = convert_from_dlpack(x)
-    loss_tensor = from_dlpack(loss.detach(), assumed_align=4).mark_compact_shape_dynamic(mode=0)
+    loss_tensor = from_dlpack(loss.detach(), assumed_align=4).mark_layout_dynamic()
     lse_tensor = (
-        from_dlpack(lse.detach(), assumed_align=4).mark_compact_shape_dynamic(mode=0)
+        from_dlpack(lse.detach(), assumed_align=4).mark_layout_dynamic()
         if lse is not None
         else None
     )
-    target_tensor = from_dlpack(target.detach(), assumed_align=8).mark_compact_shape_dynamic(mode=0)
+    target_tensor = from_dlpack(target.detach(), assumed_align=8).mark_layout_dynamic()
+    target_logit_tensor = (
+        from_dlpack(target_logit.detach(), assumed_align=4).mark_layout_dynamic(
+            leading_dim=target_logit.ndim - 1
+        )
+        if target_logit is not None
+        else None
+    )
+    dx_tensor = convert_from_dlpack(dx) if dx is not None else None
     stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    compile_key = (dtype, N, lse is not None)
-    if compile_key not in _cross_entropy.compile_cache:
-        cross_entropy_op = CrossEntropy(dtype, N)
-        _cross_entropy.compile_cache[compile_key] = cute.compile(
-            cross_entropy_op, x_tensor, target_tensor, loss_tensor, lse_tensor, stream
+    compile_key = (
+        dtype,
+        N,
+        target_logit.dtype if target_logit is not None else None,
+        lse.dtype if lse is not None else None,
+        dx is not None,
+        loss.stride(),
+        lse.stride() if lse is not None else None,
+        target.stride(),
+        target_logit.stride(-1) if target_logit is not None else None,
+    )
+    if compile_key not in cross_entropy_fwd_out.compile_cache:
+        # If there's dx, it's faster to not use online softmax since we want the exp(x - max)
+        cross_entropy_op = CrossEntropy(dtype, N, online_softmax=dx is None)
+        cross_entropy_fwd_out.compile_cache[compile_key] = cute.compile(
+            cross_entropy_op,
+            x_tensor,
+            target_tensor,
+            target_logit_tensor,
+            loss_tensor,
+            lse_tensor,
+            dx_tensor,
+            Int32(ignore_index),
+            stream,
         )
-    _cross_entropy.compile_cache[compile_key](
-        x_tensor, target_tensor, loss_tensor, lse_tensor, stream
+    cross_entropy_fwd_out.compile_cache[compile_key](
+        x_tensor,
+        target_tensor,
+        target_logit_tensor,
+        loss_tensor,
+        lse_tensor,
+        dx_tensor,
+        Int32(ignore_index),
+        stream,
     )
-    return loss if not return_lse else (loss, lse)
-_cross_entropy.compile_cache = {}
+cross_entropy_fwd_out.compile_cache = {}
+def cross_entropy_fwd(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    target_logit: Optional[torch.Tensor] = None,
+    ignore_index: int = -100,
+    return_lse: bool = False,
+    return_dx: bool = False,
+    inplace_backward: bool = False,
+) -> torch.Tensor | tuple[torch.Tensor]:
+    M = x.size(0)
+    device = x.device
+    loss = torch.empty(M, device=device, dtype=torch.float32)
+    lse = torch.empty(M, device=device, dtype=torch.float32) if return_lse else None
+    dx = (torch.empty_like(x) if not inplace_backward else x) if return_dx else None
+    cross_entropy_fwd_out(x, target, target_logit, loss, lse, dx, ignore_index)
+    if return_lse and return_dx:
+        return loss, lse, dx
+    elif return_lse:
+        return loss, lse
+    elif return_dx:
+        return loss, dx
+    else:
+        return loss
 class CrossEntropyBackward:
@@ -279,7 +405,7 @@ class CrossEntropyBackward:
         self.vecsize = 128 // dtype.width
     def _calculate_threads_per_row(self):
-        N = self.N
+        N = min(self.N, 16384)  # We split by blocks of 16k
         return (
             8
             if N <= 64
@@ -290,13 +416,14 @@ class CrossEntropyBackward:
             )
         )
-    def _get_tv_layout(self):
-        N = self.N
-        vecsize = self.vecsize
+    def _get_tv_layout(self, num_copy_bits=128):
+        vecsize = num_copy_bits // self.dtype.width
+        assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+        N = min(self.N, 16384)
         num_threads = 128 if N <= 16384 else 256
         threads_per_row = self._calculate_threads_per_row()
         cols_per_block = num_threads // threads_per_row
-        num_blocks_N = cute.ceil_div(min(N, 16384) // vecsize, threads_per_row)
+        num_blocks_N = cute.ceil_div(N // vecsize, threads_per_row)
         tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
         tv_layout = cute.make_layout(
             ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
@@ -315,40 +442,27 @@ class CrossEntropyBackward:
         mDLoss: cute.Tensor,
         mdX: cute.Tensor,
         mLSE: cute.Tensor,
+        ignore_index: Int32,  # Index to ignore in gradient computation
         stream: cuda.CUstream,
     ):
         assert mX.element_type == self.dtype
         assert mdX.element_type == self.dtype
-        tiler_mn, tv_layout = self._get_tv_layout()
+        # e.g. if self.N isn't divisible by 8 for bf16, we might use 64 bits (4 elements) copy
+        num_copy_bits = math.gcd(self.N, 128 // self.dtype.width) * self.dtype.width
+        tiler_mn, tv_layout = self._get_tv_layout(num_copy_bits=num_copy_bits)
         num_threads = cute.size(tv_layout, mode=[0])
-        mDLoss = cute.make_tensor(
-            mDLoss.iterator,
-            cute.append(mDLoss.layout, cute.make_layout((self.N,), stride=(0,))),
-        )
-        mTarget = cute.make_tensor(
-            mTarget.iterator,
-            cute.append(mTarget.layout, cute.make_layout((self.N,), stride=(0,))),
-        )
-        mLSE = cute.make_tensor(
-            mLSE.iterator,
-            cute.append(mLSE.layout, cute.make_layout((self.N,), stride=(0,))),
-        )
+        # (M,) -> (M, N) with stride 0 in the N dimension
+        mDLoss, mTarget, mLSE = [
+            cute.make_tensor(
+                X.iterator, cute.append(X.layout, cute.make_layout((self.N,), stride=(0,)))
+            )
+            for X in (mDLoss, mTarget, mLSE)
+        ]
         smem_size = cute.size_in_bytes(
             mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0))
         )
         self.kernel(
-            mX,
-            mTarget,
-            mDLoss,
-            mdX,
-            mLSE,
-            mX.shape,
-            tv_layout,
-            tiler_mn,
+            mX, mTarget, mDLoss, mdX, mLSE, ignore_index, mX.shape, tv_layout, tiler_mn
         ).launch(
             grid=[
                 cute.ceil_div(mX.shape[0], tiler_mn[0]),
@@ -368,6 +482,7 @@ class CrossEntropyBackward:
         mDLoss: cute.Tensor,  # (M,)
         mdX: cute.Tensor,  # (M, N)
         mLSE: cute.Tensor,  # (M,)
+        ignore_index: Int32,  # Index to ignore in gradient computation
         shape: cute.Shape,
         tv_layout: cute.Layout,
         tiler_mn: cute.Shape,
@@ -388,76 +503,67 @@ class CrossEntropyBackward:
         gX, gdX = [cute.local_tile(mT, tiler_mn, (0, bidy)) for mT in (mX, mdX)]
         cX = cute.local_tile(idX, tiler_mn, (bidx, bidy))
+        num_copy_elems_X = tv_layout.shape[1][0]
+        num_copy_bits_X = mX.element_type.width * num_copy_elems_X
         copy_atom_load_X = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128
-        )
-        copy_atom_load_X_async = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), gX.element_type, num_bits_per_copy=128
+            cute.nvgpu.cpasync.CopyG2SOp(), gX.element_type, num_bits_per_copy=num_copy_bits_X
         )
-        copy_atom_store_O = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), gdX.element_type, num_bits_per_copy=128
+        copy_atom_store_dX = cute.make_copy_atom(
+            cute.nvgpu.CopyUniversalOp(), gdX.element_type, num_bits_per_copy=num_copy_bits_X
         )
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
-        thr_copy_X_async = cute.make_tiled_copy(
-            copy_atom_load_X_async, tv_layout, tiler_mn
-        ).get_slice(tidx)
-        thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
-        #### Thread View
-        tXgX = thr_copy_X_async.partition_S(gX)
-        tXsX = thr_copy_X_async.partition_S(sX)
+        thr_copy_dX = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
+        #### Partition to get thread view
+        tXgX = thr_copy_X.partition_S(gX)
+        tXsX = thr_copy_X.partition_S(sX)
         tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
-        tXcFull = thr_copy_X.partition_S(cX)  # improve
-        tXgO = thr_copy_O.partition_D(gdX)
+        tXcFull = thr_copy_X.partition_S(cX)
+        tXgdX = thr_copy_dX.partition_D(gdX)
         # allocate fragments for gmem->rmem
-        tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+        tXrX, tXrdX = [cute.make_fragment_like(thr) for thr in (tXgX, tXgdX)]
-        is_even_N = cutlass.const_expr(shape[1] % tiler_mn[1] == 0)
+        is_even_N = const_expr(shape[1] % tiler_mn[1] == 0)
         row = tXcX[0][0]
         tXpX = (
-            utils.predicate_k(thr_copy_X_async.partition_S(cX), limit=shape[1])
-            if not is_even_N
-            else None
+            utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
         )
         if row < shape[0]:
-            cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
+            cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
         cute.arch.cp_async_commit_group()
         cute.arch.cp_async_wait_group(0)
-        if cutlass.const_expr(not is_even_N):
+        if const_expr(not is_even_N):
             utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
         cute.autovec_copy(tXsX, tXrX)
-        x = tXrX.load().to(cute.Float32)
+        x = tXrX.load().to(Float32)
-        label = cute.Int32.zero
-        dloss = cute.Float32.zero
-        lse = cute.Float32.zero
+        target = Int32.zero
+        dloss = Float32.zero
+        lse = Float32.zero
         if row < shape[0]:
-            label = cute.Int32(mTarget[row])
-            dloss = cute.Float32(mDLoss[row])
-            lse = cute.Float32(mLSE[row])
+            target = Int32(mTarget[row])
+            should_ignore = Boolean(target == ignore_index)
+            # Set dloss to 0 if this index should be ignored
+            dloss = Float32(mDLoss[row]) if not should_ignore else Float32.zero
+            lse = Float32(mLSE[row])
         log2_e = math.log2(math.e)
-        probs = utils.exp2f((x - lse) * log2_e)
+        probs = cute.math.exp2(x * log2_e - (lse * log2_e), fastmath=True)
         prob_shifted = probs - 1.0
         mask = cute.make_fragment_like(tXrX, cutlass.Boolean)
         for i in cutlass.range(cute.size(tXcFull), unroll_full=True):
-            mask[i] = tXcFull[i][1] == label
+            mask[i] = tXcFull[i][1] == target
         grad = cute.where(mask.load(), prob_shifted, probs)
         grad = grad * dloss
-        tXrO.store(grad.to(tXrO.element_type))
-        tOpO = (
-            utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1]) if not is_even_N else None
+        tXrdX.store(grad.to(tXrdX.element_type))
+        tXpdX = (
+            utils.predicate_k(thr_copy_dX.partition_S(cX), limit=shape[1])
+            if not is_even_N
+            else None
         )
         if row < shape[0]:
-            cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+            cute.copy(copy_atom_store_dX, tXrdX, tXgdX, pred=tXpdX)
 def _cross_entropy_backward(
@@ -465,8 +571,9 @@ def _cross_entropy_backward(
     target: torch.Tensor,
     dloss: torch.Tensor,
     lse: torch.Tensor,
-    inplace_backward: bool = False,
-) -> torch.Tensor:
+    dx: torch.Tensor,
+    ignore_index=-100,
+) -> None:
     """Cross entropy backward pass.
     Args:
         x: Input logits tensor of shape (M, N)
@@ -483,18 +590,13 @@ def _cross_entropy_backward(
     assert x.shape[0] == target.shape[0], "Batch dimensions must match"
     assert x.shape[0] == dloss.shape[0], "Batch dimensions must match"
     assert x.shape[0] == lse.shape[0], "Batch dimensions must match"
-    assert (
-        x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda
-    ), "Tensors must be on CUDA device"
-    assert x.dtype in [
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ], "Unsupported input dtype"
+    assert x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda, (
+        "Tensors must be on CUDA device"
+    )
+    assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
     assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
-    M, N = x.shape
-    dx = torch.empty_like(x) if not inplace_backward else x
+    N = x.size(1)
     dtype = torch2cute_dtype_map[x.dtype]
     convert_from_dlpack = lambda tensor: (
@@ -504,14 +606,12 @@ def _cross_entropy_backward(
     )
     x_tensor = convert_from_dlpack(x)
     dx_tensor = convert_from_dlpack(dx)
-    dloss_tensor = from_dlpack(dloss.detach(), assumed_align=16).mark_compact_shape_dynamic(mode=0)
-    lse_tensor = from_dlpack(lse.detach(), assumed_align=16).mark_compact_shape_dynamic(mode=0)
-    target_tensor = from_dlpack(target.detach(), assumed_align=32).mark_compact_shape_dynamic(
-        mode=0
-    )
+    dloss_tensor = from_dlpack(dloss.detach(), assumed_align=4).mark_layout_dynamic()
+    lse_tensor = from_dlpack(lse.detach(), assumed_align=4).mark_layout_dynamic()
+    target_tensor = from_dlpack(target.detach(), assumed_align=8).mark_layout_dynamic()
     stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    compile_key = (dtype, N)
+    compile_key = (dtype, N, target.dtype, dloss.stride(), lse.stride(), target.stride())
     if compile_key not in _cross_entropy_backward.compile_cache:
         cross_entropy_backward_op = CrossEntropyBackward(dtype, N)
         _cross_entropy_backward.compile_cache[compile_key] = cute.compile(
@@ -521,48 +621,95 @@ def _cross_entropy_backward(
             dloss_tensor,
             dx_tensor,
             lse_tensor,
+            Int32(ignore_index),
             stream,
         )
     _cross_entropy_backward.compile_cache[compile_key](
-        x_tensor, target_tensor, dloss_tensor, dx_tensor, lse_tensor, stream
+        x_tensor, target_tensor, dloss_tensor, dx_tensor, lse_tensor, Int32(ignore_index), stream
     )
-    return dx
 _cross_entropy_backward.compile_cache = {}
+@torch.library.custom_op("quack::cross_entropy_bwd_out", mutates_args={"dx"})
+def cross_entropy_bwd_out(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    dx: torch.Tensor,
+    ignore_index: int = -100,
+) -> None:
+    _cross_entropy_backward(x, target, dloss, lse, dx, ignore_index)
+def cross_entropy_bwd(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    ignore_index: int = -100,
+    inplace_backward: bool = False,
+) -> None:
+    if inplace_backward and not torch.compiler.is_compiling():
+        dx = x
+        _cross_entropy_backward(
+            x=x, target=target, dloss=dloss, lse=lse, dx=x, ignore_index=ignore_index
+        )
+    else:
+        dx = torch.empty_like(x)
+        cross_entropy_bwd_out(
+            x=x, target=target, dloss=dloss, lse=lse, dx=dx, ignore_index=ignore_index
+        )
+    return dx
 class CrossEntropyFunction(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, x, target, inplace_backward=False):
-        loss, lse = _cross_entropy(x, target, return_lse=True)
+    def forward(ctx, x, target, lse_partial=None, ignore_index=-100, inplace_backward=False):
+        if lse_partial is None:
+            loss, lse = cross_entropy_fwd(x, target, ignore_index=ignore_index, return_lse=True)
+        else:
+            # if we already compute partial lse, then to compute the final lse we treat
+            # @lse_partial as @x and @x as @target_logit
+            loss, lse = cross_entropy_fwd(
+                lse_partial, target, target_logit=x, ignore_index=ignore_index, return_lse=True
+            )
         ctx.save_for_backward(x, target, lse)
+        ctx.ignore_index = ignore_index
         ctx.inplace_backward = inplace_backward
         return loss
     @staticmethod
     def backward(ctx, dloss):
         x, target, lse = ctx.saved_tensors
-        dx = _cross_entropy_backward(x, target, dloss, lse, inplace_backward=ctx.inplace_backward)
-        return dx, None, None
+        dx = cross_entropy_bwd(
+            x, target, dloss, lse, ctx.ignore_index, inplace_backward=ctx.inplace_backward
+        )
+        return dx, None, None, None, None
 def cross_entropy(
     x: torch.Tensor,
     target: torch.Tensor,
-    inplace_backward: bool = True,
-    reduction: str = "none",
+    lse_partial: Optional[torch.Tensor] = None,
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+    inplace_backward: bool = False,
 ) -> torch.Tensor:
     """Cross entropy loss with automatic differentiation support.
     Args:
         x: Input logits tensor of shape (M, N)
         target: Target class indices tensor of shape (M,)
-        inplace_backward: Whether to perform backward pass in-place
+        lse_partial: Optional precomputed log-sum-exp partial results
         reduction: Specifies the reduction to apply to the output:
             'none': no reduction will be applied (default)
             'mean': the sum of the output will be divided by the number of elements
             'sum': the output will be summed
+        inplace_backward: Whether to perform backward pass in-place
+        ignore_index: Index to ignore in loss computation (loss will be 0 for these indices)
     Returns:
         Cross entropy loss tensor:
@@ -570,10 +717,9 @@ def cross_entropy(
             - If reduction='mean': scalar tensor with mean loss
             - If reduction='sum': scalar tensor with sum of losses
     """
-    loss = CrossEntropyFunction.apply(x, target, inplace_backward)
+    loss = CrossEntropyFunction.apply(x, target, lse_partial, ignore_index, inplace_backward)
     if reduction == "mean":
-        return loss.mean()
+        return loss.sum() / (target != ignore_index).sum().float()
     elif reduction == "sum":
         return loss.sum()
     elif reduction == "none":

quack-kernels 0.1.11__py3-none-any.whl → 0.2.1__py3-none-any.whl

quack-kernels 0.1.11py3-none-any.whl → 0.2.1py3-none-any.whl