quack-kernels 0.1.10__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +8 -1
- quack/activation.py +288 -0
- quack/autotuner.py +310 -0
- quack/cross_entropy.py +325 -175
- quack/cute_dsl_utils.py +119 -0
- quack/dense_gemm_sm100.py +2562 -0
- quack/dense_gemm_sm90.py +1657 -842
- quack/fast_math.py +80 -0
- quack/gemm_act_sm90.py +368 -0
- quack/gemm_config.py +69 -0
- quack/gemm_dact_sm90.py +150 -0
- quack/gemm_interface.py +569 -0
- quack/gemm_wrapper_utils.py +158 -0
- quack/layernorm.py +5 -3
- quack/linear.py +240 -0
- quack/linear_cross_entropy.py +275 -0
- quack/mlp.py +74 -0
- quack/pipeline.py +151 -0
- quack/reduce.py +241 -0
- quack/reduction_base.py +2 -11
- quack/rmsnorm.py +583 -231
- quack/softmax.py +27 -15
- quack/sort/bitonic_sort.py +126 -0
- quack/sort/generate_sorting_networks.py +326 -0
- quack/sort/sorting_networks.py +120 -0
- quack/sort/utils.py +31 -0
- quack/symmetric_dense_gemm_sm90.py +2091 -0
- quack/tensormap_manager.py +115 -0
- quack/tile_scheduler.py +937 -0
- quack/topk.py +227 -0
- quack/utils.py +203 -230
- quack/varlen_utils.py +22 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/METADATA +2 -2
- quack_kernels-0.2.0.dist-info/RECORD +37 -0
- quack_kernels-0.1.10.dist-info/RECORD +0 -13
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/WHEEL +0 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.1.10.dist-info → quack_kernels-0.2.0.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,158 @@
# Copyright (c) 2025, Tri Dao.
from typing import Optional, Tuple, Dict, Any
from dataclasses import dataclass

from torch import Tensor

import cutlass.cute as cute
from cutlass import Int32
from cutlass.cute.runtime import from_dlpack, make_ptr

from quack.cute_dsl_utils import torch2cute_dtype_map
from quack.dense_gemm_sm90 import TileSchedulerOptions


@dataclass
class GemmTensorInfo:
    tensor: Optional[Tensor]
    dtype: Optional[Any] = None
    major: Optional[str] = None
    cute_tensor: Optional[cute.Tensor] = None


class GemmWrapperBase:
    @staticmethod
    def validate_tensor_3d(tensor: Tensor, name: str) -> None:
        assert tensor.dim() == 3 and tensor.is_cuda, f"{name} must be a 3D CUDA tensor"
        assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"

    @staticmethod
    def validate_shape(tensor: Tensor, expected_shape: Tuple[int, ...], name: str) -> None:
        assert tensor.shape == expected_shape, (
            f"{name} must have shape {expected_shape}, got {tensor.shape}"
        )

    @staticmethod
    def get_major_order(tensor: Tensor, dims: Tuple[str, str, str]) -> str:
        # Tensor is already permuted to (dims[0], dims[1], dims[2])
        # stride(1) == 1 means dims[1] is contiguous (innermost)
        return dims[1] if tensor.stride(1) == 1 else dims[0]

    @staticmethod
    def create_cute_tensor(
        tensor: Optional[Tensor],
        major: Optional[str],
        dims: Tuple[str, str, str],
        assumed_align: int = 16,
    ) -> Optional[cute.Tensor]:
        if tensor is None:
            return None
        # Tensor is already permuted to (dims[0], dims[1], dims[2])
        # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
        leading_dim = 1 if major == dims[1] else 0
        return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
            leading_dim=leading_dim
        )

    @staticmethod
    def validate_and_prepare_tensors(
        A: Tensor,
        B: Tensor,
        D: Optional[Tensor] = None,
        C: Optional[Tensor] = None,
        additional_tensors: Optional[Dict[str, Tensor]] = None,
    ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
        GemmWrapperBase.validate_tensor_3d(A, "A")
        L, M, K = A.shape
        GemmWrapperBase.validate_tensor_3d(B, "B")
        _, N, _ = B.shape
        assert B.dtype == A.dtype, "A and B must have the same dtype"
        GemmWrapperBase.validate_shape(B, (L, N, K), "B")
        tensors = {
            "A": GemmTensorInfo(A),
            "B": GemmTensorInfo(B),
            "D": GemmTensorInfo(D),
            "C": GemmTensorInfo(C),
        }
        if D is not None:
            GemmWrapperBase.validate_tensor_3d(D, "D")
            GemmWrapperBase.validate_shape(D, (L, M, N), "D")
        if C is not None:
            GemmWrapperBase.validate_tensor_3d(C, "C")
            GemmWrapperBase.validate_shape(C, (L, M, N), "C")
        if additional_tensors:
            for name, tensor in additional_tensors.items():
                if tensor is not None:
                    GemmWrapperBase.validate_tensor_3d(tensor, name)
                    GemmWrapperBase.validate_shape(tensor, (L, M, N), name)
                tensors[name] = GemmTensorInfo(tensor)

        return L, M, K, N, tensors

    @staticmethod
    def permute_tensors(tensors: Dict[str, GemmTensorInfo]) -> None:
        for info in tensors.values():
            if info.tensor is not None:
                info.tensor = info.tensor.permute(1, 2, 0)

    @staticmethod
    def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
        for info in tensors.values():
            if info.tensor is not None:
                info.dtype = torch2cute_dtype_map[info.tensor.dtype]

    @staticmethod
    def determine_major_orders(
        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
    ) -> None:
        for name, dims in major_configs.items():
            if name in tensors and tensors[name].tensor is not None:
                tensors[name].major = GemmWrapperBase.get_major_order(tensors[name].tensor, dims)

    @staticmethod
    def create_cute_tensors(
        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
    ) -> None:
        for name, info in tensors.items():
            if info.tensor is not None and name in major_configs:
                info.cute_tensor = GemmWrapperBase.create_cute_tensor(
                    info.tensor, info.major, major_configs[name]
                )

    @staticmethod
    def create_scheduler_args(
        max_active_clusters: int, tile_count_semaphore: Optional[Tensor] = None
    ) -> TileSchedulerOptions:
        return TileSchedulerOptions(
            Int32(max_active_clusters),
            tile_count_semaphore=make_ptr(
                Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
            )
            if tile_count_semaphore is not None
            else None,
        )

    @staticmethod
    def get_compile_key(
        tensors: Dict[str, GemmTensorInfo],
        activation: Optional[str],
        tile_shape_mn: Tuple[int, int],
        cluster_shape_mnk: Tuple[int, int, int],
        pingpong: bool,
        persistent: bool,
        has_semaphore: bool,
        *args,
        key_tensor_names: Tuple[str, ...] = ("A", "B", "D", "C"),
    ) -> Tuple:
        key_parts = []
        for name in key_tensor_names:
            if name in tensors:
                key_parts.append(tensors[name].dtype)
        key_parts.append(activation)
        key_parts.extend([tile_shape_mn, cluster_shape_mnk])
        for name in key_tensor_names:
            if name in tensors:
                key_parts.append(tensors[name].major)
        key_parts.extend([pingpong, persistent, has_semaphore])
        key_parts.extend(args)
        return tuple(key_parts)
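
GemmWrapperBase collects the static helpers that the GEMM wrappers run before launching a kernel: validate the 3D batched operands, permute them so the batch dimension comes last, record dtypes and major orders, wrap them as CuTe tensors, and build a compile-cache key. The sketch below is one plausible way these helpers compose; the tensor shapes and the major_configs strings are illustrative assumptions, not values taken from the package.

# Illustrative composition of the helpers above (assumed shapes and
# major_configs; requires a CUDA device and the CuTe DSL runtime).
import torch
from quack.gemm_wrapper_utils import GemmWrapperBase

A = torch.randn(2, 128, 64, device="cuda", dtype=torch.bfloat16)   # (L, M, K)
B = torch.randn(2, 256, 64, device="cuda", dtype=torch.bfloat16)   # (L, N, K)
D = torch.empty(2, 128, 256, device="cuda", dtype=torch.bfloat16)  # (L, M, N)

L, M, K, N, tensors = GemmWrapperBase.validate_and_prepare_tensors(A, B, D)
GemmWrapperBase.permute_tensors(tensors)   # (L, M, K) -> (M, K, L), batch last
GemmWrapperBase.extract_dtypes(tensors)
major_configs = {"A": ("m", "k", "l"), "B": ("n", "k", "l"), "D": ("m", "n", "l")}
GemmWrapperBase.determine_major_orders(tensors, major_configs)
GemmWrapperBase.create_cute_tensors(tensors, major_configs)
key = GemmWrapperBase.get_compile_key(
    tensors, None, (128, 256), (1, 1, 1),
    pingpong=False, persistent=True, has_semaphore=False,
)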
quack/layernorm.py
CHANGED

@@ -10,7 +10,9 @@ import cutlass
 import cutlass.cute as cute
 from cutlass.cute.runtime import from_dlpack
 import quack.utils as utils
-from quack.
+from quack.reduce import row_reduce
+from quack.reduction_base import ReductionBase
+from quack.cute_dsl_utils import torch2cute_dtype_map


 class LayerNorm(ReductionBase):

@@ -190,7 +192,7 @@ class LayerNorm(ReductionBase):
         cute.autovec_copy(tXsX, tXrX)
         x = tXrX.load().to(cute.Float32)
         threads_per_row = tv_layout.shape[0][0]
-        sum_x =
+        sum_x = row_reduce(
             x,
             cute.ReductionOp.ADD,
             threads_per_row,

@@ -207,7 +209,7 @@ class LayerNorm(ReductionBase):
         cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
         x = tXrX.load().to(cute.Float32)

-        sum_sq_x_sub_mean =
+        sum_sq_x_sub_mean = row_reduce(
             (x - mean) * (x - mean),
             cute.ReductionOp.ADD,
             threads_per_row,
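
The layernorm change is mechanical: ReductionBase, torch2cute_dtype_map, and the row reduction are now imported from quack.reduction_base, quack.cute_dsl_utils, and quack.reduce, and both reduction sites call row_reduce directly. For orientation, a plain-PyTorch reference of the math those two reductions feed, per row: the first accumulates the sum of x (for the mean), the second the sum of (x - mean)**2 (for the variance). The eps, weight, and bias handling below is assumed to follow a standard LayerNorm and is not taken from the kernel.

# Reference math only, not the CuTe kernel.
import torch

def layernorm_ref(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, eps: float = 1e-6):
    xf = x.float()
    mean = xf.mean(dim=-1, keepdim=True)                 # from sum_x / N
    var = ((xf - mean) ** 2).mean(dim=-1, keepdim=True)  # from sum_sq_x_sub_mean / N
    return ((xf - mean) * torch.rsqrt(var + eps) * w + b).to(x.dtype)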
quack/linear.py
ADDED

@@ -0,0 +1,240 @@
# Copyright (c) 2025, Tri Dao
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.amp import custom_fwd, custom_bwd


from quack.gemm_interface import gemm, gemm_add_inplace, gemm_act, gemm_dact


def linear_fwd_convert_type(*tensors):
    autocast_dtype = torch.get_autocast_dtype("cuda")
    if torch.is_autocast_enabled():
        tensors = tuple(t.to(dtype=autocast_dtype) for t in tensors)
    return tensors


def linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad):
    needs_input_grad, needs_weight_grad = needs_x_w_grad
    if not needs_input_grad:
        weight, weight_og = None, None
    if not needs_weight_grad:
        x = None
    ctx.save_for_backward(x, weight, weight_og if ctx.fuse_grad_accum else None)


def linear_bwd_compute_input_grad(ctx, dout, weight, matmul_fn):
    if ctx.needs_input_grad[0]:
        assert weight is not None
        return matmul_fn(dout, weight)
    else:
        return None


def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og, matmul_fn, matmul_inplace_fn):
    if ctx.needs_input_grad[1]:
        assert x is not None
        x = x.reshape(-1, x.shape[-1])
        # fuse_grad_accum is not compatible with torch.compile
        if not ctx.fuse_grad_accum or weight_og.grad is None or torch.compiler.is_compiling():
            dweight = matmul_fn(dout.T, x, out_dtype=ctx.weight_dtype)
        else:
            # print("Using fuse grad accum in Linear", dout.shape, x.shape, weight_og.grad.shape)
            matmul_inplace_fn(dout.T, x, weight_og.grad)
            dweight = weight_og.grad
            weight_og.grad = None  # So that pytorch doesn't add dweight to weight_og.grad again
    else:
        dweight = None
    return dweight


class LinearFunc(torch.autograd.Function):
    matmul_fwd_fn = gemm
    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True)
    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)

    # Use classmethod instead of staticmethod to allow inheritance
    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, weight, fuse_grad_accum=False):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        out: (..., out_features)
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        # out = F.linear(x, weight)
        out = cls.matmul_fwd_fn(x, weight.T)
        linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
        return out.reshape(*batch_shape, out.shape[-1])

    @classmethod
    @custom_bwd(device_type="cuda")
    def backward(cls, ctx, dout, *args):
        """
        dout: (..., out_features)
        """
        x, weight, weight_og = ctx.saved_tensors  # weight_og is None if not ctx.fuse_grad_accum
        batch_shape = dout.shape[:-1]
        dout = dout.reshape(-1, dout.shape[-1])
        dx = linear_bwd_compute_input_grad(ctx, dout, weight, cls.matmul_bwd_dx)
        dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
        dweight = linear_bwd_compute_weight_grad(
            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
        )
        # return extra Nones for other classes that inherit from LinearFunc
        return dx, dweight, *([None] * 10)


class LinearUntunedFunc(LinearFunc):
    # Passing in tuned=False to disable tuning at runtime
    matmul_fwd_fn = partial(gemm, tuned=False)
    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


def linear_func(x, weight, fuse_grad_accum=False, tuned=True):
    fn_cls = LinearFunc if tuned else LinearUntunedFunc
    return fn_cls.apply(x, weight, fuse_grad_accum)


class LinearActFunc(LinearFunc):
    matmul_fwd_fn = gemm_act

    # Use classmethod instead of staticmethod to allow inheritance
    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, weight, activation, store_preact=True, fuse_grad_accum=False):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        out: (..., out_features)
        Return both out and post-activation, but only out is differentiable.
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        out, postact = cls.matmul_fwd_fn(
            x, weight.T, activation=activation, store_preact=store_preact
        )
        linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
        if out is not None:
            out = out.reshape(*batch_shape, out.shape[-1])
        ctx.mark_non_differentiable(postact)
        ctx.set_materialize_grads(False)  # We don't want to materialize grads for postact
        return out, postact.reshape(*batch_shape, postact.shape[-1])


class LinearActUntunedFunc(LinearActFunc):
    # Passing in tuned=False to disable tuning at runtime
    matmul_fwd_fn = partial(gemm_act, tuned=False)
    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


def linear_act_func(x, weight, activation, store_preact=True, fuse_grad_accum=False, tuned=True):
    fn_cls = LinearActFunc if tuned else LinearActUntunedFunc
    return fn_cls.apply(x, weight, activation, store_preact, fuse_grad_accum)


class DActLinearFunc(LinearFunc):
    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True)

    # Use classmethod instead of staticmethod to allow inheritance
    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, preact, weight, x, activation, fuse_grad_accum=False):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        out: (..., out_features)
        Takes in an extra preact argument which is the pre-activation, to be used in the backward pass.
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        out = cls.matmul_fwd_fn(x, weight.T)
        # Store preact instead of x, we will recompute x in the backward pass
        linear_fwd_postprocess(
            ctx, preact, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2]
        )
        ctx.activation = activation
        return out.reshape(*batch_shape, out.shape[-1])

    @classmethod
    @custom_bwd(device_type="cuda")
    def backward(cls, ctx, dout):
        """
        dout: (..., out_features)
        """
        # weight_og is None if not ctx.fuse_grad_accum
        preact, weight, weight_og = ctx.saved_tensors
        batch_shape = dout.shape[:-1]
        dout = dout.reshape(-1, dout.shape[-1])
        preact = preact.reshape(-1, preact.shape[-1])
        if ctx.needs_input_grad[0]:
            assert weight is not None
            dpreact, x = cls.matmul_bwd_dx(dout, weight, preact, activation=ctx.activation)
        else:
            dpreact, x = None, None
        dpreact = dpreact.reshape(*batch_shape, dpreact.shape[-1]) if dpreact is not None else None
        dweight = linear_bwd_compute_weight_grad(
            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
        )
        return dpreact, dweight, *([None] * 3)


class DActLinearUntunedFunc(DActLinearFunc):
    # Passing in tuned=False to disable tuning at runtime
    matmul_fwd_fn = partial(gemm, tuned=False)
    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


def act_linear_func(preact, weight, x, activation, fuse_grad_accum=False, tuned=True):
    fn_cls = DActLinearFunc if tuned else DActLinearUntunedFunc
    return fn_cls.apply(preact, weight, x, activation, fuse_grad_accum)


class Linear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        device=None,
        dtype=None,
        fuse_grad_accum: bool = False,
    ) -> None:
        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
        self.fuse_grad_accum = fuse_grad_accum

    def forward(self, input: Tensor) -> Tensor:
        if (
            self.bias is None
            and input.is_cuda
            and self.in_features % 8 == 0
            and self.out_features % 8 == 0
        ):
            return linear_func(input, self.weight, fuse_grad_accum=self.fuse_grad_accum)
        else:
            return F.linear(input, self.weight, self.bias)
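
linear.py wires the quack GEMM kernels into torch.autograd: LinearFunc handles the plain matmul forward/backward, LinearActFunc fuses an activation into the forward GEMM, DActLinearFunc fuses the activation gradient into the backward GEMM, and the Linear module dispatches to linear_func only on the fast path (no bias, CUDA input, feature dims divisible by 8), falling back to F.linear otherwise. A minimal usage sketch; the shapes and dtype are assumptions for illustration.

# Assumed shapes/dtype; needs a GPU supported by the quack GEMM kernels.
import torch
from quack.linear import Linear

layer = Linear(1024, 4096, bias=False, device="cuda", dtype=torch.bfloat16)
x = torch.randn(8, 512, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
y = layer(x)         # fast path: linear_func -> quack gemm
y.sum().backward()   # dx and dweight via the custom autograd.Function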
quack/linear_cross_entropy.py
ADDED

@@ -0,0 +1,275 @@
# Copyright (c) 2025, Tri Dao
from typing import Optional, Literal

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.amp import custom_fwd, custom_bwd

from quack.cross_entropy import cross_entropy, cross_entropy_fwd_out
from quack.gemm_interface import gemm, gemm_add, gemm_add_inplace
from quack.linear import linear_fwd_convert_type


def linear_cross_entropy_func(
    x: Tensor,  # (..., d)
    weight: Tensor,  # (V, d)
    bias: Optional[Tensor],  # (V,) or None
    target: Tensor,  # (...,), int or long
    ignore_index: int = -100,
    reduction: Literal["none", "mean", "sum"] = "mean",
    inplace_backward: bool = False,
) -> Tensor:
    y = F.linear(x, weight, bias)  # (..., V)
    return cross_entropy(
        y, target, ignore_index=ignore_index, reduction=reduction, inplace_backward=inplace_backward
    )


def linear_cross_entropy_func_ref(
    x: Tensor,  # (..., d)
    weight: Tensor,  # (V, d)
    bias: Optional[Tensor],  # (V,) or None
    target: Tensor,  # (...,), int or long
    ignore_index: int = -100,
    reduction: Literal["none", "mean", "sum"] = "mean",
) -> Tensor:
    y = F.linear(x, weight, bias)  # (..., V)
    return F.cross_entropy(y, target, ignore_index=ignore_index, reduction=reduction)


def chunked_linear_cross_entropy_fwd(
    x: Tensor,  # (B*L, d) where B is batch, L is seqlen
    weight: Tensor,  # (V, d) where V is vocab size
    target: Tensor,  # (B*L,)
    chunk_size: int = 4096,
    ignore_index: int = -100,
    tuned: bool = True,
) -> tuple[Tensor, Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
    """
    Chunked forward pass for linear cross entropy.

    Splits input along batch dimension, computes matmul and cross_entropy_fwd
    for each chunk, stores dx for each chunk, and accumulates dw.

    Returns:
        loss: (B*L,) loss values
        dx: (B*L, d) gradient w.r.t. input
        dw: (V, d) gradient w.r.t. weight (accumulated across chunks except last)
        last_dlogits_chunk: (chunk_len, V) gradient of last chunk's logits (for deferred dw computation)
        last_x_chunk: (chunk_len, d) last chunk's input (for deferred dw computation)
    """
    B_L, d = x.shape
    V, _ = weight.shape
    device = x.device
    num_chunks = (B_L + chunk_size - 1) // chunk_size
    # Since we use gemm with TMA we require some alignment
    assert chunk_size % 8 == 0, "chunk_size must be multiple of 8"
    assert B_L % 8 == 0
    # Pre-allocate outputs
    loss = torch.empty(B_L, device=device, dtype=torch.float32)
    logits_chunk_preallocated = torch.empty((chunk_size, V), device=device, dtype=x.dtype)
    dx = torch.empty_like(x)
    # Last chunk of dw will be deferred to the backward pass
    dw = torch.empty_like(weight, dtype=torch.float32) if num_chunks > 1 else None
    last_dlogits_chunk = None
    last_x_chunk = None

    # Process in chunks
    for i, (x_chunk, target_chunk, loss_chunk, dx_chunk) in enumerate(
        zip(*(t.split(chunk_size) for t in (x, target, loss, dx)))
    ):
        chunk_len = x_chunk.shape[0]
        logits_chunk = logits_chunk_preallocated[:chunk_len]  # (chunk_len, V)
        torch.mm(x_chunk, weight.mT, out=logits_chunk)
        # Compute cross entropy forward with gradients
        dlogits_chunk = logits_chunk  # inplace_backward
        cross_entropy_fwd_out(
            logits_chunk,
            target_chunk,
            None,  # target_logit
            loss=loss_chunk,
            lse=None,  # we don't need lse here
            dx=dlogits_chunk,
            ignore_index=ignore_index,
        )
        # Compute dx for this chunk: dlogits @ weight
        torch.mm(dlogits_chunk, weight, out=dx_chunk)  # (chunk_len, d)
        # Compute dw for all chunks except the last
        if i == num_chunks - 1:
            # Last chunk: save for backward pass
            last_dlogits_chunk = dlogits_chunk
            last_x_chunk = x_chunk
        elif i == 0:
            # First chunk: dw = dlogits.T @ x_chunk
            gemm(dlogits_chunk.T, x_chunk, out=dw, tuned=tuned)
        else:
            # Middle chunks: dw += dlogits.T @ x_chunk
            gemm_add_inplace(dlogits_chunk.T, x_chunk, dw, tuned=tuned)
    return loss, dx, dw, last_dlogits_chunk, last_x_chunk


class ChunkedLinearCrossEntropyFunction(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type="cuda")
    def forward(
        ctx,
        x: Tensor,
        weight: Tensor,
        target: Tensor,
        ignore_index: int = -100,
        reduction: Literal["mean", "sum"] = "mean",
        chunk_size: int = 4096,
        tuned: bool = True,
    ):
        """
        Forward pass computes loss and stores dx and dw for backward.
        """
        ctx.weight_dtype = weight.dtype
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        # TODO: don't need to compute bwd if neither x nor weight requires grad, or not training
        loss, dx, dw, last_dlogits_chunk, last_x_chunk = chunked_linear_cross_entropy_fwd(
            x, weight, target, chunk_size, ignore_index, tuned=tuned
        )
        loss_sum = loss.sum()
        loss_scale = None if reduction == "sum" else 1.0 / (target != ignore_index).sum().float()
        ctx.save_for_backward(dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale)
        ctx.batch_shape = batch_shape
        ctx.ignore_index = ignore_index
        ctx.reduction = reduction
        ctx.tuned = tuned
        return loss_sum if loss_scale is None else loss_sum * loss_scale

    @staticmethod
    @custom_bwd(device_type="cuda")
    def backward(ctx, dloss):
        """
        Backward pass scales pre-computed gradients by dloss and completes
        the last chunk's dw computation.
        dloss is a scalar.
        """
        dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale = ctx.saved_tensors
        tuned = ctx.tuned
        if loss_scale is not None:
            dloss = dloss * loss_scale
        # TODO: the case where x or weight doesn't require grad
        dx.mul_(dloss)
        dx = dx.reshape(*ctx.batch_shape, dx.shape[-1])
        # Complete dw computation: dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
        if dw is None:
            # Only had one chunk, compute dw directly with dloss scaling
            dw = gemm(
                last_dlogits_chunk.T,
                last_x_chunk,
                out_dtype=ctx.weight_dtype,
                alpha=dloss,
                tuned=tuned,
            )
        else:
            # Add last chunk's contribution with dloss scaling
            # dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
            # We use alpha=dloss, beta=dloss
            if ctx.weight_dtype == dw.dtype:
                gemm_add_inplace(
                    last_dlogits_chunk.T, last_x_chunk, dw, alpha=dloss, beta=dloss, tuned=tuned
                )
            else:
                dw = gemm_add(
                    last_dlogits_chunk.T,
                    last_x_chunk,
                    dw,
                    alpha=dloss,
                    beta=dloss,
                    out_dtype=ctx.weight_dtype,
                    tuned=tuned,
                )
        return dx, dw, None, None, None, None, None


def chunked_linear_cross_entropy(
    x: Tensor,
    weight: Tensor,
    target: Tensor,
    chunk_size: int = 4096,
    ignore_index: int = -100,
    reduction: Literal["mean", "sum"] = "mean",
    tuned: bool = True,
) -> Tensor:
    """
    Chunked linear cross entropy with automatic differentiation support.

    Args:
        x: Input tensor of shape (B*L, d)
        weight: Weight tensor of shape (V, d)
        target: Target indices of shape (B*L,)
        chunk_size: Size of chunks to process
        ignore_index: Index to ignore in loss computation
        reduction: Type of reduction to apply
        tuned: Whether to use tuned kernels

    Returns:
        Loss tensor with specified reduction
    """
    if reduction not in ["mean", "sum"]:
        raise ValueError(f"Invalid reduction: {reduction}")
    loss = ChunkedLinearCrossEntropyFunction.apply(
        x, weight, target, ignore_index, reduction, chunk_size, tuned
    )
    return loss


class LinearCrossEntropy(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        ignore_index: int = -100,
        reduction: Literal["none", "mean", "sum"] = "mean",
        chunk_size: Optional[int] = None,
        inplace_backward: bool = False,
        tuned: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
        self.ignore_index = ignore_index
        self.reduction = reduction
        self.chunk_size = chunk_size
        self.inplace_backward = inplace_backward
        self.tuned = tuned

    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        if (
            self.bias is None
            and input.is_cuda
            and input.stride(-1) == 1
            and self.in_features % 8 == 0
            and self.out_features % 8 == 0
            and input.shape[:-1].numel() % 8 == 0
            and self.chunk_size is not None
            and self.chunk_size % 8 == 0
            and self.reduction in ["mean", "sum"]
        ):
            return chunked_linear_cross_entropy(
                input,
                self.weight,
                target,
                chunk_size=self.chunk_size,
                ignore_index=self.ignore_index,
                reduction=self.reduction,
                tuned=self.tuned,
            )
        else:
            return linear_cross_entropy_func(
                input,
                self.weight,
                self.bias,
                target,
                ignore_index=self.ignore_index,
                reduction=self.reduction,
                inplace_backward=self.inplace_backward,
            )