quack-kernels 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {quack_kernels-0.1.7/quack_kernels.egg-info → quack_kernels-0.1.8}/PKG-INFO +1 -1
  2. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/__init__.py +1 -1
  3. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/cross_entropy.py +56 -15
  4. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/rmsnorm.py +191 -68
  5. {quack_kernels-0.1.7 → quack_kernels-0.1.8/quack_kernels.egg-info}/PKG-INFO +1 -1
  6. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack_kernels.egg-info/top_level.txt +1 -0
  7. quack_kernels-0.1.8/tests/test_rmsnorm.py +392 -0
  8. quack_kernels-0.1.7/tests/test_rmsnorm.py +0 -183
  9. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/LICENSE +0 -0
  10. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/README.md +0 -0
  11. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/pyproject.toml +0 -0
  12. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/layernorm.py +0 -0
  13. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/reduction_base.py +0 -0
  14. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/softmax.py +0 -0
  15. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack/utils.py +0 -0
  16. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack_kernels.egg-info/SOURCES.txt +0 -0
  17. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack_kernels.egg-info/dependency_links.txt +0 -0
  18. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/quack_kernels.egg-info/requires.txt +0 -0
  19. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/setup.cfg +0 -0
  20. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/setup.py +0 -0
  21. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/tests/test_cross_entropy.py +0 -0
  22. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/tests/test_layernorm.py +0 -0
  23. {quack_kernels-0.1.7 → quack_kernels-0.1.8}/tests/test_softmax.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.1.7
+ Version: 0.1.8
  Requires-Python: >=3.9
  License-File: LICENSE
  Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0

quack/__init__.py
@@ -1,4 +1,4 @@
- __version__ = "0.1.7"
+ __version__ = "0.1.8"

  from quack.rmsnorm import rmsnorm
  from quack.softmax import softmax

quack/cross_entropy.py
@@ -1,16 +1,16 @@
  # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.

  import math
- import torch
  from typing import Optional, Type

  import cuda.bindings.driver as cuda

  import cutlass
  import cutlass.cute as cute
- from cutlass.cute.runtime import from_dlpack

  import quack.utils as utils
+ import torch
+ from cutlass.cute.runtime import from_dlpack
  from quack.reduction_base import ReductionBase, torch2cute_dtype_map


@@ -79,7 +79,7 @@ class CrossEntropy(ReductionBase):
  self.kernel(mX, mTarget, mLoss, mLSE, tv_layout, tiler_mn).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
+ cluster=([1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None),
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -111,7 +111,9 @@ class CrossEntropy(ReductionBase):

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
- mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
  )
  reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)

@@ -166,7 +168,9 @@ class CrossEntropy(ReductionBase):
  reduction_buffer[None, None, 0],
  mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
  init_val=-cutlass.Float32.inf,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ hook_fn=(
+ cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None
+ ),
  )
  if cutlass.const_expr(self.reload_from == "smem"):
  cute.autovec_copy(tXsX, tXrX)
@@ -191,7 +195,9 @@ class CrossEntropy(ReductionBase):
  threads_per_row,
  reduction_buffer[None, None, 0],
  mbar_ptr,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ hook_fn=(
+ cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None
+ ),
  )

  if (
@@ -225,7 +231,11 @@ def _cross_entropy(
  assert target.dim() == 1, "Target must be 1D"
  assert x.shape[0] == target.shape[0], "Batch dimensions must match"
  assert x.is_cuda and target.is_cuda, "Tensors must be on CUDA device"
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
+ assert x.dtype in [
+ torch.float16,
+ torch.bfloat16,
+ torch.float32,
+ ], "Unsupported input dtype"
  assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
  M, N = x.shape
  device = x.device
@@ -314,13 +324,16 @@ class CrossEntropyBackward:
  num_threads = cute.size(tv_layout, mode=[0])

  mDLoss = cute.make_tensor(
- mDLoss.iterator, cute.append(mDLoss.layout, cute.make_layout((self.N,), stride=(0,)))
+ mDLoss.iterator,
+ cute.append(mDLoss.layout, cute.make_layout((self.N,), stride=(0,))),
  )
  mTarget = cute.make_tensor(
- mTarget.iterator, cute.append(mTarget.layout, cute.make_layout((self.N,), stride=(0,)))
+ mTarget.iterator,
+ cute.append(mTarget.layout, cute.make_layout((self.N,), stride=(0,))),
  )
  mLSE = cute.make_tensor(
- mLSE.iterator, cute.append(mLSE.layout, cute.make_layout((self.N,), stride=(0,)))
+ mLSE.iterator,
+ cute.append(mLSE.layout, cute.make_layout((self.N,), stride=(0,))),
  )

  smem_size = cute.size_in_bytes(
@@ -364,7 +377,9 @@ class CrossEntropyBackward:

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
- mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
  )

  idX = cute.make_identity_tensor(shape)
@@ -474,7 +489,11 @@ def _cross_entropy_backward(
  assert (
  x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda
  ), "Tensors must be on CUDA device"
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
+ assert x.dtype in [
+ torch.float16,
+ torch.bfloat16,
+ torch.float32,
+ ], "Unsupported input dtype"
  assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"

  M, N = x.shape
@@ -532,15 +551,37 @@ class CrossEntropyFunction(torch.autograd.Function):


  def cross_entropy(
- x: torch.Tensor, target: torch.Tensor, inplace_backward: bool = False
+ x: torch.Tensor,
+ target: torch.Tensor,
+ inplace_backward: bool = True,
+ reduction: str = "none",
  ) -> torch.Tensor:
  """Cross entropy loss with automatic differentiation support.

  Args:
  x: Input logits tensor of shape (M, N)
  target: Target class indices tensor of shape (M,)
+ inplace_backward: Whether to perform backward pass in-place
+ reduction: Specifies the reduction to apply to the output:
+ 'none': no reduction will be applied (default)
+ 'mean': the sum of the output will be divided by the number of elements
+ 'sum': the output will be summed

  Returns:
- Cross entropy loss tensor of shape (M,)
+ Cross entropy loss tensor:
+ - If reduction='none': tensor of shape (M,) with per-example losses
+ - If reduction='mean': scalar tensor with mean loss
+ - If reduction='sum': scalar tensor with sum of losses
  """
- return CrossEntropyFunction.apply(x, target, inplace_backward)
+ loss = CrossEntropyFunction.apply(x, target, inplace_backward)
+
+ if reduction == "mean":
+ return loss.mean()
+ elif reduction == "sum":
+ return loss.sum()
+ elif reduction == "none":
+ return loss
+ else:
+ raise ValueError(
+ f"Invalid reduction mode: {reduction}. Expected one of 'none', 'mean', or 'sum'"
+ )
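
As the hunks above show, cross_entropy now accepts a reduction argument ('none', 'mean', or 'sum') and inplace_backward defaults to True. A minimal usage sketch of that signature follows; the shapes, dtypes, and values are illustrative assumptions, not taken from the package:

import torch
from quack.cross_entropy import cross_entropy  # module path per the file list above

# Hypothetical batch of logits and integer class targets on a CUDA device
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)
target = torch.randint(0, 4096, (8,), device="cuda", dtype=torch.int64)

loss = cross_entropy(x, target, reduction="mean")  # "none" (default) gives per-row losses of shape (8,); "sum" is also accepted
loss.backward()                                    # backward pass; inplace_backward=True is the new default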

quack/rmsnorm.py
@@ -1,14 +1,16 @@
  # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.

- import torch
  from typing import Optional

  import cuda.bindings.driver as cuda

  import cutlass
  import cutlass.cute as cute
+ from cutlass import Float32, Int32
  from cutlass.cute.runtime import from_dlpack
+
  import quack.utils as utils
+ import torch
  from quack.reduction_base import ReductionBase, torch2cute_dtype_map


@@ -19,41 +21,55 @@ class RMSNorm(ReductionBase):
  self.delay_w_load = False

  def _calculate_threads_per_row(self):
+ """Calculate the number of threads per row for the RMSNorm kernel."""
  N = self.N
- return (
- 8
- if N <= 64
- else (
- 16
- if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
- )
- )
+ if N <= 64:
+ return 8
+ elif N <= 128:
+ return 16
+ elif N <= 3072:
+ return 32
+ elif N <= 6144:
+ return 64
+ elif N <= 16384:
+ return 128
+ else:
+ return 256

  def _set_cluster_n(self):
+ """
+ Set the number of clusters for the RMSNorm kernel.
+ Stored in self.cluster_n.
+ """
  N = self.N
+
  # cluster_n = 4 is faster and cluster_n = 2 for N=64k for some reason
  # Similarly cluster_n = 8 is faster for N=128k
  if cutlass.const_expr(self.dtype.width == 16):
- cluster_n = (
- 1
- if N <= 16 * 1024
- else (
- 2
- if N <= 32 * 1024
- else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
- )
- )
- else: # fp32
- cluster_n = (
- 1
- if N <= 32 * 1024
- else (
- 2
- if N <= 64 * 1024
- else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
- )
- )
+ # 16-bit types (fp16, bf16)
+ if N <= 16 * 1024:
+ cluster_n = 1
+ elif N <= 32 * 1024:
+ cluster_n = 2
+ elif N <= 64 * 1024:
+ cluster_n = 4
+ elif N <= 128 * 1024:
+ cluster_n = 8
+ else:
+ cluster_n = 16
+ else:
+ # 32-bit types (fp32)
+ if N <= 32 * 1024:
+ cluster_n = 1
+ elif N <= 64 * 1024:
+ cluster_n = 2
+ elif N <= 128 * 1024:
+ cluster_n = 4
+ elif N <= 256 * 1024:
+ cluster_n = 8
+ else:
+ cluster_n = 16
+
  self.cluster_n = cluster_n

  @cute.jit
@@ -64,8 +80,17 @@ class RMSNorm(ReductionBase):
  mO: cute.Tensor,
  mRstd: Optional[cute.Tensor],
  stream: cuda.CUstream,
- eps: cutlass.Float32 = 1e-6,
+ eps: Float32 = 1e-6,
  ):
+ semistatic_shape = (*mX.shape[:-1], self.N) # Set last dimension to be statically N
+ new_stride = lambda t: (
+ cute.assume(t.stride[0], divby=128 // t.element_type.width),
+ t.stride[1],
+ )
+ mX, mO = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ for t in (mX, mO)
+ ]
  assert mX.element_type == self.dtype
  assert mO.element_type == self.dtype
  self._set_cluster_n()
@@ -82,7 +107,7 @@ class RMSNorm(ReductionBase):
  self.kernel(mX, mW, mO, mRstd, eps, tv_layout, tiler_mn, self.reload_from).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
+ cluster=([1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None),
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
  stream=stream,
  )
@@ -109,7 +134,9 @@ class RMSNorm(ReductionBase):

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
- mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+ mX.element_type,
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ byte_alignment=16,
  )
  reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)

@@ -184,7 +211,7 @@ class RMSNorm(ReductionBase):
  reduction_buffer[None, None, 0],
  mbar_ptr,
  init_val=0.0,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ hook_fn=(cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None),
  )
  rstd = utils.rsqrt(sum_sq_x / shape[1] + eps)
  if cutlass.const_expr(mRstd is not None):
@@ -232,25 +259,36 @@ def _rmsnorm_fwd(
  assert weight.dim() == 1, "Weight must be 1D"
  assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
  assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
- assert weight.dtype == torch.float32, "Weight must be float32"
+ assert x.dtype in [
+ torch.float16,
+ torch.bfloat16,
+ torch.float32,
+ ], "Unsupported dtype"
+
+ assert weight.dtype in [
+ torch.float32,
+ torch.bfloat16,
+ torch.float16,
+ ], "Weight must be float32, float16 or bfloat16"
+
  M, N = x.shape
  device = x.device
  out = torch.empty_like(x)
  rstd = torch.empty(M, device=device, dtype=torch.float32) if return_rstd else None
  dtype = torch2cute_dtype_map[x.dtype]
+ # convert_from_dlpack = lambda x: (
+ # from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ # mode=0, divisibility=128 // dtype.width
+ # )
+ # )
  convert_from_dlpack = lambda x: (
- from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
- mode=0, stride_order=(0, 1)
- )
+ from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
  )
- x_tensor, out_tensor = [
- # utils.convert_from_dlpack(t, leading_dim=t.ndim - 1, divisibility=128 // dtype.width)
- convert_from_dlpack(t)
- for t in (x, out)
- ]
+ x_tensor, out_tensor = [convert_from_dlpack(t) for t in (x, out)]
+ # handle weight divisibility based on weight dtype
+ weight_dtype = torch2cute_dtype_map[weight.dtype]
  weight_tensor = utils.convert_from_dlpack(
- weight.detach(), leading_dim=0, divisibility=128 // cutlass.Float32.width
+ weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
  )
  rstd_tensor = (
  from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
@@ -258,7 +296,7 @@ def _rmsnorm_fwd(
  else None
  )
  current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
- compile_key = (dtype, N, rstd is not None)
+ compile_key = (dtype, N, rstd is not None, weight.dtype)
  if compile_key not in _rmsnorm_fwd.compile_cache:
  rmsnorm_op = RMSNorm(dtype, N)
  _rmsnorm_fwd.compile_cache[compile_key] = cute.compile(
@@ -301,7 +339,8 @@ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
  class RMSNormBackward(ReductionBase):
  def __init__(self, dtype: cutlass.Numeric, N: int):
  # 2 stages for double buffering when computing mean of x_hat * wdy
- super().__init__(dtype, N, stage=2, reduction_dtype=cutlass.Float32)
+ super().__init__(dtype, N, stage=2, reduction_dtype=Float32)
+ self.reload_wdy = None if N <= 16 * 1024 else "smem"
  if self.N > 128 * 1024 and self.dtype.width >= 32:
  # Not enough smem
  raise ValueError("RMSNormBackward does not support N > 128k with dtype >= 32 bits")
@@ -348,9 +387,18 @@ class RMSNormBackward(ReductionBase):
  mRstd: cute.Tensor,
  mdX: cute.Tensor,
  mdW: cute.Tensor,
- sm_count: cutlass.Int32,
+ sm_count: Int32,
  stream: cuda.CUstream,
  ):
+ semistatic_shape = (*mX.shape[:-1], self.N) # Set last dimension to be statically N
+ new_stride = lambda t: (
+ cute.assume(t.stride[0], divby=128 // t.element_type.width),
+ t.stride[1],
+ )
+ mX, mdOut, mdX = [
+ cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
+ for t in (mX, mdOut, mdX)
+ ]
  self._set_cluster_n()
  tiler_mn, tv_layout = self._get_tv_layout()
  num_threads = cute.size(tv_layout, mode=[0])
@@ -460,7 +508,8 @@ class RMSNormBackward(ReductionBase):

  gdW = cute.local_tile(mdW, (1, tiler_mn[1]), (bidx_start, cluster_y))
  tdWgdW = thr_copy_dW.partition_D(gdW)
- tdWrdW = cute.make_fragment_like(tdWgdW, cutlass.Float32)
+ # Always compute partial weight gradients in fp32
+ tdWrdW = cute.make_fragment_like(tdWgdW, Float32)
  tXrdW = thr_copy_X.retile(tdWrdW)

  gX = cute.local_tile(mX, tiler_mn, (None, cluster_y))
@@ -513,9 +562,9 @@ class RMSNormBackward(ReductionBase):

  threads_per_row = tv_layout.shape[0][0]
  tXrdW.fill(0.0)
- stage = cutlass.Int32(0)
- producer_phase = cutlass.Int32(1)
- consumer_phase = cutlass.Int32(0)
+ stage = Int32(0)
+ producer_phase = Int32(1)
+ consumer_phase = Int32(0)
  for bidx in cutlass.range(bidx_start, cute.ceil_div(M, tiler_mn[0]), gdim):
  row = tXcX[None, None, None, bidx][0][0]
  rstd = cutlass.Float.zero
@@ -538,10 +587,14 @@ class RMSNormBackward(ReductionBase):
  )
  elif tiler_mn[0] > 1:
  utils.fill_oob(
- tXsX[None, None, None, stage ^ 1], None, fill_value=mX.element_type.zero
+ tXsX[None, None, None, stage ^ 1],
+ None,
+ fill_value=mX.element_type.zero,
  )
  utils.fill_oob(
- tXsdOut[None, None, None, stage ^ 1], None, fill_value=mdOut.element_type.zero
+ tXsdOut[None, None, None, stage ^ 1],
+ None,
+ fill_value=mdOut.element_type.zero,
  )
  cute.arch.cp_async_commit_group()
  if row < M or tiler_mn[0] == 1:
@@ -561,12 +614,13 @@ class RMSNormBackward(ReductionBase):
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, stage],
- mbar_full_ptr + stage if cutlass.const_expr(self.cluster_n > 1) else None,
+ (mbar_full_ptr + stage if cutlass.const_expr(self.cluster_n > 1) else None),
  phase=consumer_phase,
  init_val=0.0,
  )
  / shape[1]
  )
+
  if cutlass.const_expr(self.cluster_n > 1):
  # It's faster to have 1 lane per warp to signal the mbar, rather than all lanes
  # Requires adjusting the thread_count when initializing the mbar
@@ -576,12 +630,20 @@ class RMSNormBackward(ReductionBase):
  cute.arch.mbarrier_arrive(
  mbar_empty_ptr + stage, peer_cta_rank_in_cluster=lane_idx
  )
+
+ if cutlass.const_expr(self.reload_wdy == "smem"):
+ cute.autovec_copy(tXsdOut[None, None, None, stage], tXrdOut)
+ dout = tXrdOut.load().to(cute.Float32)
+ wdy = dout * weight
+
  dx = (wdy - x_hat * mean_xhat_wdy) * rstd
  tXrdX.store(dx.to(tXrdOut.element_type))
  if row < M or tiler_mn[0] == 1:
  tXgdX_cur = utils.coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
  cute.copy(copy_atom_store_dX, tXrdX, tXgdX_cur, pred=tXpX)
+ # Accumulate weight gradients in fp32
  tXrdW.store(tXrdW.load() + dout * x_hat)
+
  stage ^= 1
  if stage == 0:
  consumer_phase ^= 1
@@ -609,7 +671,9 @@ class RMSNormBackward(ReductionBase):
  cute.autovec_copy(tXsdW_other, tXrdW_other)
  tXrdW.store(tXrdW.load() + tXrdW_other.load())
  cute.copy(copy_atom_store_dW, tdWrdW, tdWgdW, pred=tdWpdW)
+
  else:
+ # dw is already in fp32, so we can directly copy to global memory
  cute.copy(copy_atom_store_dW, tdWrdW, tdWgdW, pred=tdWpdW)


@@ -634,8 +698,17 @@ def _rmsnorm_backward(
  assert weight.dim() == 1, "Weight must be 1D"
  assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
  assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
- assert weight.dtype == torch.float32, "Weight must be float32"
+ assert x.dtype in [
+ torch.float16,
+ torch.bfloat16,
+ torch.float32,
+ ], "Unsupported dtype"
+
+ assert weight.dtype in [
+ torch.float32,
+ torch.bfloat16,
+ torch.float16,
+ ], "Weight must be float32, float16 or bfloat16"

  M, N = x.shape
  dx = torch.empty_like(x)
@@ -654,28 +727,29 @@ def _rmsnorm_backward(
  sm_count = (
  sm_count * sm_count_multiple if N <= 8192 else sm_count // 2 if N <= 16384 else sm_count * 2
  )
- dw_partial = torch.empty(sm_count, N, device=device, dtype=weight.dtype)
+
+ # Always store partial gradients in fp32 for numerical accuracy
+ dw_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32)

  dtype = torch2cute_dtype_map[x.dtype]

- convert_from_dlpack = lambda tensor: (
- from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
- mode=0, stride_order=(0, 1)
- )
+ convert_from_dlpack = lambda x: (
+ from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
  )
-
  x_tensor, dout_tensor, dx_tensor = [convert_from_dlpack(tensor) for tensor in (x, dout, dx)]

+ # Handle weight div based on weight dtype
+ weight_dtype = torch2cute_dtype_map[weight.dtype]
  weight_tensor = utils.convert_from_dlpack(
- weight.detach(), leading_dim=0, divisibility=128 // cutlass.Float32.width
+ weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
  )

- dw_partial_tensor = convert_from_dlpack(dw_partial)
+ dw_partial_tensor = from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
  rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)

  current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

- compile_key = (dtype, N)
+ compile_key = (dtype, N, weight.dtype)
  if compile_key not in _rmsnorm_backward.compile_cache:
  rmsnorm_backward_op = RMSNormBackward(dtype, N)
  _rmsnorm_backward.compile_cache[compile_key] = cute.compile(
@@ -700,7 +774,7 @@ def _rmsnorm_backward(
  sm_count,
  current_stream,
  )
-
+ # we have summed the partial gradients in fp32, now we convert back to the weight dtype
  dw = dw_partial.sum(dim=0).to(weight.dtype)
  return dx, dw

@@ -711,16 +785,29 @@ _rmsnorm_backward.compile_cache = {}
  class RMSNormFunction(torch.autograd.Function):
  @staticmethod
  def forward(ctx, x, weight, eps):
+ x_shape_start = x.shape
+
+ # Flatten input
+ x = x.view(-1, x.shape[-1])
+
  out, rstd = _rmsnorm_fwd(x, weight, eps, return_rstd=True)
  ctx.save_for_backward(x, weight, rstd)
  ctx.eps = eps
- return out
+ ctx.x_shape_start = x_shape_start
+
+ return out.reshape(x_shape_start)

  @staticmethod
  def backward(ctx, dout):
  x, weight, rstd = ctx.saved_tensors
+ x_shape_start = ctx.x_shape_start
+ # Reshape dout to match the flattened shape used in forward
+ dout = dout.view(-1, dout.shape[-1])
  dx, dw = _rmsnorm_backward(x, weight, dout, rstd)
- # dw is returned for weight gradient, None for eps gradient
+ dx = dx.view(x_shape_start)
+ # dx is returned for input gradient,
+ # dw is returned for weight gradient,
+ # None for eps gradient
  return dx, dw, None


@@ -736,3 +823,39 @@ def rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.T
  Normalized output tensor of same shape as x
  """
  return RMSNormFunction.apply(x, weight, eps)
+
+
+ class QuackRMSNorm(torch.nn.Module):
+ """RMSNorm module that behaves like torch.nn.RMSNorm.
+
+ This class provides a drop-in replacement for torch.nn.RMSNorm that uses
+ the quack.rmsnorm implementation under the hood.
+
+ Args:
+ dim (int): The dimension to normalize over
+ eps (float, optional): A small constant for numerical stability. Default: 1e-6
+
+ Attributes:
+ weight (torch.nn.Parameter): The learnable weight parameter
+ eps (float): A small constant for numerical stability
+ """
+
+ def __init__(self, dim: int, eps: float = 1e-6):
+ super().__init__()
+ self.weight = torch.nn.Parameter(torch.ones(dim))
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Apply RMSNorm to the input tensor.
+
+ Args:
+ x (torch.Tensor): Input tensor
+
+ Returns:
+ torch.Tensor: Normalized tensor
+ """
+ return rmsnorm(x, self.weight, self.eps)
+
+ def reset_parameters(self):
+ """Reset the weight parameter to ones."""
+ torch.nn.init.ones_(self.weight)
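
The rmsnorm changes above add three user-visible things: inputs with more than two dimensions are flattened in RMSNormFunction.forward and reshaped back, weights may be float32, float16, or bfloat16, and QuackRMSNorm wraps the functional API as a torch.nn.Module. A minimal usage sketch based on that added code; shapes, dtypes, and tensor values are illustrative assumptions:

import torch
from quack.rmsnorm import rmsnorm, QuackRMSNorm

# Functional API: 3D input with a bf16 weight, both newly supported in 0.1.8
x = torch.randn(2, 128, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)
out = rmsnorm(x, w, eps=1e-6)   # same shape and dtype as x
out.sum().backward()            # partial weight gradients are accumulated in fp32, then cast back to w.dtype

# Module API, intended as a drop-in replacement in the spirit of torch.nn.RMSNorm
norm = QuackRMSNorm(4096, eps=1e-6).cuda()   # weight is initialized to ones (fp32)
y = norm(x.detach())                         # bf16 activations with an fp32 weight also work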

quack_kernels.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.1.7
+ Version: 0.1.8
  Requires-Python: >=3.9
  License-File: LICENSE
  Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0

quack_kernels.egg-info/top_level.txt
@@ -1,3 +1,4 @@
+ benchmarks
  dist
  media
  quack

quack_kernels-0.1.8/tests/test_rmsnorm.py (new file)
@@ -0,0 +1,392 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
+ import pytest
+ import torch
+
+ from quack.rmsnorm import rmsnorm, rmsnorm_ref, rstd_ref, _rmsnorm_fwd
+
+ @pytest.mark.parametrize("eps", [1e-5, 1e-6])
+ # @pytest.mark.parametrize("eps", [1e-5])
+ @pytest.mark.parametrize("input_dtype", [torch.bfloat16, torch.float16, torch.float32])
+ # @pytest.mark.parametrize("input_dtype", [torch.float16])
+ @pytest.mark.parametrize(
+ "N",
+ [
+ 192,
+ 256,
+ 512,
+ 760,
+ 1024,
+ 1128,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 32768,
+ 65536,
+ 131072,
+ 262144,
+ ],
+ # [262144]
+ )
+ @pytest.mark.parametrize("M", [1, 37, 199, 8 * 1024])
+ # @pytest.mark.parametrize("M", [1])
+ def test_rmsnorm_forward_backward(M, N, input_dtype, eps):
+ """Test RMSNorm forward pass against reference implementation."""
+ if N >= 256 * 1024 and input_dtype == torch.float32 and M >= 8 * 1024:
+ pytest.skip("Skipping large tensor test for float32 to avoid OOM")
+ device = "cuda"
+ # Set tolerance based on dtype
+ if input_dtype == torch.bfloat16:
+ atol = 1e-1
+ elif input_dtype == torch.float16:
+ atol = 1e-2
+ else:
+ atol = 1e-4
+ torch.random.manual_seed(0)
+ x = torch.randn(M, N, device=device, dtype=input_dtype, requires_grad=True)
+ weight = torch.randn(N, device=device, dtype=torch.float32, requires_grad=True)
+ x_ref = x.detach().clone().requires_grad_()
+ weight_ref = weight.detach().clone().requires_grad_()
+ out = rmsnorm(x, weight, eps=eps)
+ out_ref = rmsnorm_ref(x_ref, weight_ref, eps=eps)
+ # rstd_ref_val = rstd_ref(x_ref, eps=eps)
+ assert out.shape == x.shape
+ assert out.dtype == input_dtype
+ torch.testing.assert_close(out, out_ref, atol=atol, rtol=1e-3)
+ # torch.testing.assert_close(rstd, rstd_ref_val, atol=atol, rtol=1e-3)
+ # Backward pass
+ if N > 128 * 1024 and input_dtype == torch.float32:
+ # Skip backward pass for due to not enough smem
+ return
+ grad_out = torch.randn_like(out)
+ torch.cuda.synchronize()
+ out_ref.backward(grad_out)
+ out.backward(grad_out)
+ torch.testing.assert_close(x.grad, x_ref.grad, atol=atol, rtol=1e-3)
+ torch.testing.assert_close(weight.grad, weight_ref.grad, atol=atol, rtol=1e-3)
+
+
+ def test_rmsnorm_strided_tensor():
+ """Test RMSNorm with strided tensor input where shape is (8, 4096, 512) and stride is (sth, 576, 1)."""
+ device = "cuda"
+ dtype = torch.bfloat16
+ atol = 1e-1
+ eps = 1e-5
+ # Create a larger tensor with 576 features
+ full_tensor = torch.randn(8, 4096, 576, device=device, dtype=dtype)
+ # Take a slice of the top 512 dimensions - this creates a strided view
+ x = full_tensor[:, :, :512].detach().requires_grad_()
+ # Create weight tensor
+ weight = torch.randn(512, device=device, dtype=torch.float32, requires_grad=True)
+ # Reference implementation
+ x_ref = x.detach().clone().requires_grad_()
+ weight_ref = weight.detach().clone().requires_grad_()
+ out = rmsnorm(x, weight, eps=eps)
+ out_ref = rmsnorm_ref(x_ref, weight_ref, eps=eps)
+ assert out.shape == x.shape
+ torch.testing.assert_close(out, out_ref, atol=atol, rtol=1e-3)
+ grad_out = torch.randn_like(out)
+ torch.cuda.synchronize()
+ out_ref.backward(grad_out)
+ out.backward(grad_out)
+ torch.testing.assert_close(x.grad, x_ref.grad, atol=atol, rtol=1e-3)
+ torch.testing.assert_close(weight.grad, weight_ref.grad, atol=atol, rtol=1e-3)
+
+
+ @pytest.mark.parametrize("eps", [1e-5])
+ @pytest.mark.parametrize("input_dtype", [torch.bfloat16])
+ @pytest.mark.parametrize(
+ "N",
+ [131072, 262144],
+ # [262144]
+ )
+ @pytest.mark.parametrize("M", [32 * 1024])
+ def test_rmsnorm_large_tensor(M, N, input_dtype, eps):
+ """Test RMSNorm forward pass against reference implementation."""
+ device = "cuda"
+ # Set tolerance based on dtype
+ if input_dtype == torch.bfloat16:
+ atol = 1e-1
+ elif input_dtype == torch.float16:
+ atol = 1e-2
+ else:
+ atol = 1e-4
+ torch.random.manual_seed(0)
+ torch.cuda.empty_cache()
+ x = torch.randn(M, N, device=device, dtype=input_dtype, requires_grad=False)
+ weight = torch.randn(N, device=device, dtype=torch.float32, requires_grad=False)
+ out = rmsnorm(x, weight, eps=eps)
+ # Need to compile, otherwise it OOMs
+ rmsnorm_compiled = torch.compile(rmsnorm_ref)
+ # Run once with smaller input to avoid OOMs
+ rmsnorm_compiled(x[:32], weight, eps=eps)
+ out_ref = rmsnorm_compiled(x, weight, eps=eps)
+ # Need to chunk, otherwise it OOMs
+ assert all(
+ (out_c - out_ref_c).abs().max() < atol
+ for out_c, out_ref_c in zip(out.chunk(16), out_ref.chunk(16))
+ )
+
+
+ @pytest.mark.parametrize("return_rstd", [True, False])
+ def test_rmsnorm_return_rstd_option(return_rstd):
+ """Test that return_rstd option works correctly."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+
+ x = torch.randn(M, N, device=device, dtype=torch.float16)
+ weight = torch.randn(N, device=device, dtype=torch.float32)
+
+ if return_rstd:
+ out, rstd = _rmsnorm_fwd(x, weight, eps=eps, return_rstd=True)
+ assert out.shape == (M, N)
+ assert rstd.shape == (M,)
+ assert rstd.dtype == torch.float32
+ else:
+ out = _rmsnorm_fwd(x, weight, eps=eps, return_rstd=False)
+ assert out.shape == (M, N)
+ assert isinstance(out, torch.Tensor)
+
+
+ def test_rmsnorm_input_validation():
+ """Test input validation and error handling."""
+ device = "cuda"
+
+ # Test 3D input (should now work since rmsnorm was updated to accept 3D inputs)
+ x_3d = torch.randn(2, 32, 1024, device=device, dtype=torch.float16)
+ weight = torch.randn(1024, device=device, dtype=torch.float32)
+
+ # This should not raise an exception now
+ out = rmsnorm(x_3d, weight)
+ # Verify output shape matches input shape
+ assert out.shape == x_3d.shape
+ # Verify output dtype matches input dtype
+ assert out.dtype == x_3d.dtype
+
+ # Test weight dimension mismatch
+ x = torch.randn(32, 1024, device=device, dtype=torch.float16)
+ weight_wrong = torch.randn(512, device=device, dtype=torch.float32)
+
+ with pytest.raises(AssertionError, match="Last dimension of input must match weight dimension"):
+ rmsnorm(x, weight_wrong)
+
+ # Test CPU tensors (should fail)
+ x_cpu = torch.randn(32, 1024, dtype=torch.float16)
+ weight_cpu = torch.randn(1024, dtype=torch.float32)
+
+ with pytest.raises(AssertionError, match="Tensors must be on CUDA device"):
+ rmsnorm(x_cpu, weight_cpu)
+
+ # Test unsupported dtype
+ x = torch.randn(32, 1024, device=device, dtype=torch.float64)
+ weight = torch.randn(1024, device=device, dtype=torch.float32)
+
+ with pytest.raises(AssertionError, match="Unsupported dtype"):
+ rmsnorm(x, weight)
+
+ # Test wrong weight dtype
+ x = torch.randn(32, 1024, device=device, dtype=torch.float16)
+ weight_wrong_dtype = torch.randn(1024, device=device, dtype=torch.float64)
+
+ with pytest.raises(AssertionError, match="Weight must be float32, float16 or bfloat16"):
+ rmsnorm(x, weight_wrong_dtype)
+
+
+ def test_rmsnorm_bf16_weights():
+ """Test that bfloat16 weights work correctly with rmsnorm."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+
+ # Test with bfloat16 input and weights
+ x = torch.randn(M, N, device=device, dtype=torch.bfloat16)
+ weight_bf16 = torch.randn(N, device=device, dtype=torch.bfloat16)
+
+ # Run rmsnorm with bfloat16 weights
+ out_bf16 = rmsnorm(x, weight_bf16, eps=eps)
+
+ # Verify output shape and dtype
+ assert out_bf16.shape == x.shape
+ assert out_bf16.dtype == torch.bfloat16
+
+ # Convert to float32 for reference comparison
+ x_fp32 = x.to(torch.float32)
+ weight_fp32 = weight_bf16.to(torch.float32)
+
+ # Run reference implementation with float32
+ out_ref = rmsnorm_ref(x_fp32, weight_fp32, eps=eps).to(torch.bfloat16)
+
+ # Verify output values match reference implementation
+ torch.testing.assert_close(out_bf16, out_ref, atol=1e-1, rtol=1e-2)
+
+
+ def test_rmsnorm_bf16_weights_backward():
+ """Test that bfloat16 weights work correctly with rmsnorm backward pass."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+ atol = 1e-1 # Higher tolerance for bfloat16
+
+ # Create tensors with gradients
+ x = torch.randn(M, N, device=device, dtype=torch.bfloat16, requires_grad=True)
+ weight_bf16 = torch.randn(N, device=device, dtype=torch.bfloat16, requires_grad=True)
+
+ # Create reference tensors with float32 weights for comparison
+ x_ref = x.detach().clone().requires_grad_()
+ weight_fp32 = weight_bf16.to(torch.float32).detach().requires_grad_()
+
+ # Forward pass
+ out_bf16 = rmsnorm(x, weight_bf16, eps=eps)
+ out_ref = rmsnorm(x_ref, weight_fp32, eps=eps)
+
+ # Create gradient for backward pass
+ grad_out = torch.randn_like(out_bf16)
+ grad_out_ref = grad_out.clone()
+
+ # Backward pass
+ torch.cuda.synchronize()
+ out_bf16.backward(grad_out)
+ out_ref.backward(grad_out_ref)
+
+ # Verify gradients
+ torch.testing.assert_close(x.grad, x_ref.grad, atol=atol, rtol=1e-2)
+ torch.testing.assert_close(
+ weight_bf16.grad, weight_fp32.grad.to(torch.bfloat16), atol=atol, rtol=1e-2
+ )
+
+ # Test with mixed precision: bfloat16 input and float32 weights
+ x = torch.randn(M, N, device=device, dtype=torch.bfloat16, requires_grad=True)
+ weight_fp32 = torch.randn(N, device=device, dtype=torch.float32, requires_grad=True)
+
+ # Forward pass
+ out_mixed = rmsnorm(x, weight_fp32, eps=eps)
+
+ # Create gradient for backward pass
+ grad_out = torch.randn_like(out_mixed)
+
+ # Backward pass
+ torch.cuda.synchronize()
+ out_mixed.backward(grad_out)
+
+ # Just verify that backward pass completes without errors
+ assert x.grad is not None
+ assert weight_fp32.grad is not None
+
+
+ def test_rmsnorm_fp16_weights():
+ """Test that float16 weights work correctly with rmsnorm."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+
+ # Test with float16 input and weights
+ x = torch.randn(M, N, device=device, dtype=torch.float16)
+ weight_fp16 = torch.randn(N, device=device, dtype=torch.float16)
+
+ # Run rmsnorm with float16 weights
+ out_fp16 = rmsnorm(x, weight_fp16, eps=eps)
+
+ # Verify output shape and dtype
+ assert out_fp16.shape == x.shape
+ assert out_fp16.dtype == torch.float16
+
+ # Convert to float32 for reference comparison
+ x_fp32 = x.to(torch.float32)
+ weight_fp32 = weight_fp16.to(torch.float32)
+
+ # Run reference implementation with float32
+ out_ref = rmsnorm_ref(x_fp32, weight_fp32, eps=eps).to(torch.float16)
+
+ # Verify output values match reference implementation
+ torch.testing.assert_close(out_fp16, out_ref, atol=1e-2, rtol=1e-2)
+
+
+ def test_rmsnorm_fp16_weights_backward():
+ """Test that float16 weights work correctly with rmsnorm backward pass."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+ atol = 1e-2 # Tolerance for float16
+
+ # Create tensors with gradients
+ x = torch.randn(M, N, device=device, dtype=torch.float16, requires_grad=True)
+ weight_fp16 = torch.randn(N, device=device, dtype=torch.float16, requires_grad=True)
+
+ # Create reference tensors with float32 weights for comparison
+ x_ref = x.detach().clone().requires_grad_()
+ weight_fp32 = weight_fp16.to(torch.float32).detach().requires_grad_()
+
+ # Forward pass
+ out_fp16 = rmsnorm(x, weight_fp16, eps=eps)
+ out_ref = rmsnorm(x_ref, weight_fp32, eps=eps)
+
+ # Create gradient for backward pass
+ grad_out = torch.randn_like(out_fp16)
+ grad_out_ref = grad_out.clone()
+
+ # Backward pass
+ torch.cuda.synchronize()
+ out_fp16.backward(grad_out)
+ out_ref.backward(grad_out_ref)
+
+ # Verify gradients
+ torch.testing.assert_close(x.grad, x_ref.grad, atol=atol, rtol=1e-2)
+ torch.testing.assert_close(
+ weight_fp16.grad, weight_fp32.grad.to(torch.float16), atol=atol, rtol=1e-2
+ )
+
+ # Test with mixed precision: float16 input and float32 weights
+ x = torch.randn(M, N, device=device, dtype=torch.float16, requires_grad=True)
+ weight_fp32 = torch.randn(N, device=device, dtype=torch.float32, requires_grad=True)
+
+ # Forward pass
+ out_mixed = rmsnorm(x, weight_fp32, eps=eps)
+
+ # Create gradient for backward pass
+ grad_out = torch.randn_like(out_mixed)
+
+ # Backward pass
+ torch.cuda.synchronize()
+ out_mixed.backward(grad_out)
+
+ # Just verify that backward pass completes without errors
+ assert x.grad is not None
+ assert weight_fp32.grad is not None
+
+
+ def test_rmsnorm_compile_cache():
+ """Test that compile cache works correctly for repeated calls."""
+ device = "cuda"
+ M, N = 32, 1024
+ eps = 1e-6
+
+ # Clear cache
+ _rmsnorm_fwd.compile_cache.clear()
+ assert len(_rmsnorm_fwd.compile_cache) == 0
+
+ x1 = torch.randn(M, N, device=device, dtype=torch.float16)
+ weight1 = torch.randn(N, device=device, dtype=torch.float32)
+
+ # First call should compile
+ out1 = _rmsnorm_fwd(x1, weight1, eps=eps)
+ assert len(_rmsnorm_fwd.compile_cache) == 1
+
+ # Same shape should reuse cache
+ x2 = torch.randn(M, N, device=device, dtype=torch.float16)
+ weight2 = torch.randn(N, device=device, dtype=torch.float32)
+ out2 = _rmsnorm_fwd(x2, weight2, eps=eps)
+ assert len(_rmsnorm_fwd.compile_cache) == 1
+
+ # Different shape should create new cache entry
+ x3 = torch.randn(M, N * 2, device=device, dtype=torch.float16)
+ weight3 = torch.randn(N * 2, device=device, dtype=torch.float32)
+ out3 = _rmsnorm_fwd(x3, weight3, eps=eps)
+ assert len(_rmsnorm_fwd.compile_cache) == 2
+
+ # Different dtype should create new cache entry
+ x4 = torch.randn(M, N, device=device, dtype=torch.float32)
+ weight4 = torch.randn(N, device=device, dtype=torch.float32)
+ out4 = _rmsnorm_fwd(x4, weight4, eps=eps)
+ assert len(_rmsnorm_fwd.compile_cache) == 3

quack_kernels-0.1.7/tests/test_rmsnorm.py (removed)
@@ -1,183 +0,0 @@
- # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
-
- import pytest
- import torch
-
- from quack.rmsnorm import rmsnorm, rmsnorm_ref, rstd_ref
-
-
- @pytest.mark.parametrize("eps", [1e-5, 1e-6])
- # @pytest.mark.parametrize("eps", [1e-5])
- @pytest.mark.parametrize("input_dtype", [torch.bfloat16, torch.float16, torch.float32])
- # @pytest.mark.parametrize("input_dtype", [torch.float16])
- @pytest.mark.parametrize(
- "N",
- [192, 256, 512, 760, 1024, 1128, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144]
- # [262144]
- )
- @pytest.mark.parametrize("M", [1, 37, 199, 8 * 1024])
- # @pytest.mark.parametrize("M", [1])
- def test_rmsnorm_forward_backward(M, N, input_dtype, eps):
- """Test RMSNorm forward pass against reference implementation."""
- if N >= 256 * 1024 and input_dtype == torch.float32 and M >= 8 * 1024:
- pytest.skip("Skipping large tensor test for float32 to avoid OOM")
- device = "cuda"
- # Set tolerance based on dtype
- if input_dtype == torch.bfloat16:
- atol = 1e-1
- elif input_dtype == torch.float16:
- atol = 1e-2
- else:
- atol = 1e-4
- torch.random.manual_seed(0)
- x = torch.randn(M, N, device=device, dtype=input_dtype, requires_grad=True)
- weight = torch.randn(N, device=device, dtype=torch.float32, requires_grad=True)
- x_ref = x.detach().clone().requires_grad_()
- weight_ref = weight.detach().clone().requires_grad_()
- out = rmsnorm(x, weight, eps=eps)
- out_ref = rmsnorm_ref(x_ref, weight_ref, eps=eps)
- # rstd_ref_val = rstd_ref(x_ref, eps=eps)
- assert out.shape == x.shape
- assert out.dtype == input_dtype
- torch.testing.assert_close(out, out_ref, atol=atol, rtol=1e-3)
- # torch.testing.assert_close(rstd, rstd_ref_val, atol=atol, rtol=1e-3)
- # Backward pass
- if N > 128 * 1024 and input_dtype == torch.float32:
- # Skip backward pass for due to not enough smem
- return
- grad_out = torch.randn_like(out)
- torch.cuda.synchronize()
- out_ref.backward(grad_out)
- out.backward(grad_out)
- torch.testing.assert_close(x.grad, x_ref.grad, atol=atol, rtol=1e-3)
- torch.testing.assert_close(weight.grad, weight_ref.grad, atol=atol, rtol=1e-3)
-
-
- @pytest.mark.parametrize("eps", [1e-5])
- @pytest.mark.parametrize("input_dtype", [torch.bfloat16])
- @pytest.mark.parametrize(
- "N",
- [131072, 262144]
- # [262144]
- )
- @pytest.mark.parametrize("M", [32 * 1024])
- def test_rmsnorm_large_tensor(M, N, input_dtype, eps):
- """Test RMSNorm forward pass against reference implementation."""
- device = "cuda"
- # Set tolerance based on dtype
- if input_dtype == torch.bfloat16:
- atol = 1e-1
- elif input_dtype == torch.float16:
- atol = 1e-2
- else:
- atol = 1e-4
- torch.random.manual_seed(0)
- torch.cuda.empty_cache()
- x = torch.randn(M, N, device=device, dtype=input_dtype, requires_grad=False)
- weight = torch.randn(N, device=device, dtype=torch.float32, requires_grad=False)
- out = rmsnorm(x, weight, eps=eps)
- # Need to compile, otherwise it OOMs
- rmsnorm_compiled = torch.compile(rmsnorm_ref)
- # Run once with smaller input to avoid OOMs
- rmsnorm_compiled(x[:32], weight, eps=eps)
- out_ref = rmsnorm_compiled(x, weight, eps=eps)
- # Need to chunk, otherwise it OOMs
- assert all((out_c - out_ref_c).abs().max() < atol
- for out_c, out_ref_c in zip(out.chunk(16), out_ref.chunk(16)))
-
-
- @pytest.mark.parametrize("return_rstd", [True, False])
- def test_rmsnorm_return_rstd_option(return_rstd):
- """Test that return_rstd option works correctly."""
- device = "cuda"
- M, N = 32, 1024
- eps = 1e-6
-
- x = torch.randn(M, N, device=device, dtype=torch.float16)
- weight = torch.randn(N, device=device, dtype=torch.float32)
-
- if return_rstd:
- out, rstd = rmsnorm(x, weight, eps=eps, return_rstd=True)
- assert out.shape == (M, N)
- assert rstd.shape == (M,)
- assert rstd.dtype == torch.float32
- else:
- out = rmsnorm(x, weight, eps=eps, return_rstd=False)
- assert out.shape == (M, N)
- assert isinstance(out, torch.Tensor)
-
-
- def test_rmsnorm_input_validation():
- """Test input validation and error handling."""
- device = "cuda"
-
- # Test 3D input (should fail)
- x_3d = torch.randn(2, 32, 1024, device=device, dtype=torch.float16)
- weight = torch.randn(1024, device=device, dtype=torch.float32)
-
- with pytest.raises(AssertionError, match="Input must be 2D"):
- rmsnorm(x_3d, weight)
-
- # Test weight dimension mismatch
- x = torch.randn(32, 1024, device=device, dtype=torch.float16)
- weight_wrong = torch.randn(512, device=device, dtype=torch.float32)
-
- with pytest.raises(AssertionError, match="Last dimension of input must match weight dimension"):
- rmsnorm(x, weight_wrong)
-
- # Test CPU tensors (should fail)
- x_cpu = torch.randn(32, 1024, dtype=torch.float16)
- weight_cpu = torch.randn(1024, dtype=torch.float32)
-
- with pytest.raises(AssertionError, match="Tensors must be on CUDA device"):
- rmsnorm(x_cpu, weight_cpu)
-
- # Test unsupported dtype
- x = torch.randn(32, 1024, device=device, dtype=torch.float64)
- weight = torch.randn(1024, device=device, dtype=torch.float32)
-
- with pytest.raises(AssertionError, match="Unsupported dtype"):
- rmsnorm(x, weight)
-
- # Test wrong weight dtype
- x = torch.randn(32, 1024, device=device, dtype=torch.float16)
- weight_wrong_dtype = torch.randn(1024, device=device, dtype=torch.float16)
-
- with pytest.raises(AssertionError, match="Weight must be float32"):
- rmsnorm(x, weight_wrong_dtype)
-
-
- def test_rmsnorm_compile_cache():
- """Test that compile cache works correctly for repeated calls."""
- device = "cuda"
- M, N = 32, 1024
- eps = 1e-6
-
- # Clear cache
- rmsnorm.compile_cache.clear()
- assert len(rmsnorm.compile_cache) == 0
-
- x1 = torch.randn(M, N, device=device, dtype=torch.float16)
- weight1 = torch.randn(N, device=device, dtype=torch.float32)
-
- # First call should compile
- out1 = rmsnorm(x1, weight1, eps=eps)
- assert len(rmsnorm.compile_cache) == 1
-
- # Same shape should reuse cache
- x2 = torch.randn(M, N, device=device, dtype=torch.float16)
- weight2 = torch.randn(N, device=device, dtype=torch.float32)
- out2 = rmsnorm(x2, weight2, eps=eps)
- assert len(rmsnorm.compile_cache) == 1
-
- # Different shape should create new cache entry
- x3 = torch.randn(M, N * 2, device=device, dtype=torch.float16)
- weight3 = torch.randn(N * 2, device=device, dtype=torch.float32)
- out3 = rmsnorm(x3, weight3, eps=eps)
- assert len(rmsnorm.compile_cache) == 2
-
- # Different dtype should create new cache entry
- x4 = torch.randn(M, N, device=device, dtype=torch.float32)
- weight4 = torch.randn(N, device=device, dtype=torch.float32)
- out4 = rmsnorm(x4, weight4, eps=eps)
- assert len(rmsnorm.compile_cache) == 3