quack-kernels 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
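The list above shows quack/layernorm.py being removed while quack/rmsnorm.py (diffed below) absorbs LayerNorm behind a new is_layernorm flag. As a reading aid for that diff, here is a reference-level PyTorch sketch of the two normalizations the unified kernel computes; it mirrors the rmsnorm_ref helper in the diff and is an illustration only, not the package's public API.

import torch

def rmsnorm_reference(x, w=None, b=None, eps=1e-6):
    # RMSNorm: scale by 1/sqrt(mean(x^2) + eps); no mean subtraction.
    x_f32 = x.float()
    rstd = torch.rsqrt(x_f32.square().mean(dim=-1, keepdim=True) + eps)
    y = x_f32 * rstd
    if w is not None:
        y = y * w.float()
    if b is not None:
        y = y + b.float()
    return y.to(x.dtype)

def layernorm_reference(x, w=None, b=None, eps=1e-6):
    # LayerNorm: subtract the row mean first, then normalize by the variance.
    x_f32 = x.float()
    mean = x_f32.mean(dim=-1, keepdim=True)
    rstd = torch.rsqrt((x_f32 - mean).square().mean(dim=-1, keepdim=True) + eps)
    y = (x_f32 - mean) * rstd
    if w is not None:
        y = y * w.float()
    if b is not None:
        y = y + b.float()
    return y.to(x.dtype)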
quack/rmsnorm.py CHANGED
@@ -1,156 +1,91 @@
1
1
  # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
2
 
3
- from typing import Optional, Tuple
3
+ import math
4
+ from typing import Optional, Tuple, Type
4
5
  from functools import partial
5
6
 
6
7
  import cuda.bindings.driver as cuda
7
8
 
8
9
  import cutlass
9
10
  import cutlass.cute as cute
10
- from cutlass import Float32, Int32
11
- from cutlass import const_expr
12
- from cutlass.cute.runtime import from_dlpack
11
+ from cutlass import Float32, Int32, const_expr
13
12
 
14
13
  import torch
15
14
  from torch import Tensor
16
15
 
17
16
  import quack.utils as utils
17
+ import quack.copy_utils as copy_utils
18
+ import quack.layout_utils as layout_utils
19
+ from quack.compile_utils import make_fake_tensor as fake_tensor
18
20
  from quack.reduce import row_reduce
19
21
  from quack.reduction_base import ReductionBase
20
22
  from quack.cute_dsl_utils import torch2cute_dtype_map
21
23
 
22
24
 
23
25
  class RMSNorm(ReductionBase):
24
- def __init__(self, dtype: cutlass.Numeric, N: int):
25
- super().__init__(dtype, N, stage=1)
26
- self.reload_from = None if N <= 8192 else "smem"
26
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int, is_layernorm: bool = False):
27
+ super().__init__(dtype, N, stage=2 if is_layernorm else 1)
28
+ self.is_layernorm = is_layernorm
29
+ self.reload_from = None if N <= (16384 if is_layernorm else 8192) else "smem"
27
30
  self.delay_w_load = False
28
31
 
29
- def _calculate_threads_per_row(self):
30
- """Calculate the number of threads per row for the RMSNorm kernel."""
32
+ def _threads_per_row(self):
31
33
  N = self.N
32
- if N <= 64:
33
- return 8
34
- elif N <= 128:
35
- return 16
36
- elif N <= 3072:
37
- return 32
38
- elif N <= 6144:
39
- return 64
40
- elif N <= 16384:
41
- return 128
42
- else:
43
- return 256
34
+ for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (16384, 128)]:
35
+ if N <= limit:
36
+ return threads
37
+ return 256
44
38
 
45
39
  def _set_cluster_n(self):
46
- """
47
- Set the number of clusters for the RMSNorm kernel.
48
- Stored in self.cluster_n.
49
- """
50
40
  N = self.N
51
-
52
41
  # cluster_n = 4 is faster and cluster_n = 2 for N=64k for some reason
53
42
  # Similarly cluster_n = 8 is faster for N=128k
54
43
  if const_expr(self.dtype.width == 16):
55
- # 16-bit types (fp16, bf16)
56
- if N <= 16 * 1024:
57
- cluster_n = 1
58
- elif N <= 32 * 1024:
59
- cluster_n = 2
60
- elif N <= 64 * 1024:
61
- cluster_n = 4
62
- elif N <= 128 * 1024:
63
- cluster_n = 8
64
- else:
65
- cluster_n = 16
44
+ thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
66
45
  else:
67
- # 32-bit types (fp32)
68
- if N <= 32 * 1024:
69
- cluster_n = 1
70
- elif N <= 64 * 1024:
71
- cluster_n = 2
72
- elif N <= 128 * 1024:
73
- cluster_n = 4
74
- elif N <= 256 * 1024:
75
- cluster_n = 8
76
- else:
77
- cluster_n = 16
78
-
79
- self.cluster_n = cluster_n
80
-
81
- def _smem_size_in_bytes(self, tiler_mn, num_warps, dtype_res=None):
82
- return (
83
- cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
84
- + (
85
- cute.size_in_bytes(dtype_res, cute.make_layout(tiler_mn))
86
- if dtype_res is not None
87
- else 0
88
- )
89
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
90
- + self.stage * (cutlass.Int64.width // 8)
91
- )
46
+ thresholds = [(32 * 1024, 1), (64 * 1024, 2), (128 * 1024, 4), (256 * 1024, 8)]
47
+ for limit, cluster in thresholds:
48
+ if N <= limit:
49
+ self.cluster_n = cluster
50
+ return
51
+ self.cluster_n = 16
92
52
 
93
53
  @cute.jit
94
54
  def __call__(
95
55
  self,
96
56
  mX: cute.Tensor,
97
- mW: cute.Tensor,
57
+ mW: Optional[cute.Tensor],
98
58
  mB: Optional[cute.Tensor],
99
59
  mRes: Optional[cute.Tensor],
100
60
  mO: cute.Tensor,
101
61
  mResO: Optional[cute.Tensor],
102
62
  mRstd: Optional[cute.Tensor],
63
+ mMean: Optional[cute.Tensor],
64
+ eps: Float32,
103
65
  stream: cuda.CUstream,
104
- eps: Float32 = 1e-6,
105
66
  ):
106
- semistatic_shape = (*mX.shape[:-1], self.N) # Set last dimension to be statically N
107
- new_stride = lambda t: (
108
- cute.assume(t.stride[0], divby=128 // t.element_type.width),
109
- t.stride[1],
110
- )
111
- mX, mRes, mO, mResO = [
112
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
113
- if const_expr(t is not None)
114
- else None
115
- for t in (mX, mRes, mO, mResO)
116
- ]
117
67
  assert mX.element_type == self.dtype
118
- assert mO.element_type == self.dtype
119
68
  self._set_cluster_n()
120
69
  largest_dtype_width = const_expr(
121
- max(
122
- mX.element_type.width,
123
- mRes.element_type.width if mRes is not None else 0,
124
- mO.element_type.width,
125
- mResO.element_type.width if mResO is not None else 0,
126
- )
127
- )
128
- tiler_mn, tv_layout = self._get_tv_layout(
129
- num_copy_bits=128 // largest_dtype_width * mX.element_type.width
70
+ max(*(t.element_type.width for t in [mX, mRes, mW, mB, mO, mResO] if t is not None))
130
71
  )
131
- num_threads = cute.size(tv_layout, mode=[0])
132
- num_warps = num_threads // cute.arch.WARP_SIZE
133
- mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
134
- mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
135
- if const_expr(mB is not None):
136
- mB_expanded_layout = cute.prepend(
137
- mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
138
- )
139
- mB = cute.make_tensor(mB.iterator, mB_expanded_layout)
140
- if const_expr(mRstd is not None):
141
- mRstd_expanded_layout = cute.append(
142
- mRstd.layout, cute.make_layout((self.N,), stride=(0,))
143
- )
144
- mRstd = cute.make_tensor(mRstd.iterator, mRstd_expanded_layout)
72
+ vecsize = math.gcd(self.N, 128 // largest_dtype_width)
73
+ tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(vecsize=vecsize)
74
+ num_threads = tiled_copy.size
75
+ mW, mB = [
76
+ layout_utils.expand(mT, dim=0, size=tiler_mn[0]) if const_expr(mT is not None) else None
77
+ for mT in (mW, mB)
78
+ ]
79
+ mRstd, mMean = [
80
+ layout_utils.expand(mT, dim=1, size=self.N) if const_expr(mT is not None) else None
81
+ for mT in (mRstd, mMean)
82
+ ]
145
83
  self.kernel(
146
- mX, mW, mB, mRes, mO, mResO, mRstd, eps, tv_layout, tiler_mn, self.reload_from
84
+ mX, mW, mB, mRes, mO, mResO, mRstd, mMean, eps, tiler_mn, tiled_copy, threads_per_row
147
85
  ).launch(
148
86
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
149
87
  block=[num_threads, 1, 1],
150
- cluster=([1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None),
151
- smem=self._smem_size_in_bytes(
152
- tiler_mn, num_warps, dtype_res=mRes.element_type if mRes is not None else None
153
- ),
88
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
154
89
  stream=stream,
155
90
  )
156
91
 
@@ -158,30 +93,26 @@ class RMSNorm(ReductionBase):
158
93
  def kernel(
159
94
  self,
160
95
  mX: cute.Tensor,
161
- mW: cute.Tensor,
96
+ mW: Optional[cute.Tensor],
162
97
  mB: Optional[cute.Tensor],
163
98
  mRes: Optional[cute.Tensor],
164
99
  mO: cute.Tensor,
165
100
  mResO: Optional[cute.Tensor],
166
101
  mRstd: Optional[cute.Tensor],
167
- eps: cute.Float32,
168
- tv_layout: cute.Layout,
102
+ mMean: Optional[cute.Tensor],
103
+ eps: Float32,
169
104
  tiler_mn: cute.Shape,
170
- reload_from: cutlass.Constexpr = None,
171
- delay_w_load: cutlass.Constexpr = False,
105
+ tiled_copy: cute.TiledCopy,
106
+ threads_per_row: cutlass.Constexpr[int],
172
107
  ):
173
108
  tidx, _, _ = cute.arch.thread_idx()
174
109
  bidx, _, _ = cute.arch.block_idx()
175
- if const_expr(self.cluster_n > 1):
176
- cluster_y = cute.arch.block_idx()[1]
177
- else:
178
- cluster_y = const_expr(0)
110
+ cluster_y = const_expr(0) if const_expr(self.cluster_n == 1) else cute.arch.block_idx()[1]
111
+ tv_layout = tiled_copy.layout_tv_tiled
179
112
 
180
113
  smem = cutlass.utils.SmemAllocator()
181
114
  sX = smem.allocate_tensor(
182
- mX.element_type,
183
- cute.make_ordered_layout(tiler_mn, order=(1, 0)),
184
- byte_alignment=16,
115
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
185
116
  )
186
117
  if const_expr(mRes is not None):
187
118
  sRes = smem.allocate_tensor(
@@ -194,73 +125,18 @@ class RMSNorm(ReductionBase):
194
125
  shape = mX.shape
195
126
  idX = cute.make_identity_tensor(shape)
196
127
  # slice for CTAs
197
- # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
198
- mX, mRes, mO, mResO = [
199
- utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) if mT is not None else None
200
- for mT in (mX, mRes, mO, mResO)
128
+ gX, gRes, gO, gResO, gRstd, gMean, cX = [
129
+ cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) if mT is not None else None
130
+ for mT in (mX, mRes, mO, mResO, mRstd, mMean, idX)
201
131
  ]
202
- gX, gRes, gO, gResO = [
203
- cute.local_tile(mT, tiler_mn, (0, cluster_y)) if mT is not None else None
204
- for mT in (mX, mRes, mO, mResO)
132
+ gW, gB = [
133
+ cute.local_tile(mT, tiler_mn, (0, cluster_y)) if const_expr(mT is not None) else None
134
+ for mT in (mW, mB)
205
135
  ]
206
- cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
207
- gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
208
- gB = cute.local_tile(mB, tiler_mn, (0, cluster_y)) if const_expr(mB is not None) else None
209
- gRstd = (
210
- cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
211
- if const_expr(mRstd is not None)
212
- else None
213
- )
214
136
 
215
- # declare the atoms which will be used later for memory copy
216
- num_copy_elems_X = tv_layout.shape[1][0]
217
- num_copy_bits_X = mX.element_type.width * num_copy_elems_X
218
- copy_atom_load_X = cute.make_copy_atom(
219
- cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
220
- )
221
- copy_atom_load_X_async = cute.make_copy_atom(
222
- cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
223
- )
224
- num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
225
- copy_atom_load_W = cute.make_copy_atom(
226
- cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
227
- )
228
- num_bits_per_copy_B = (
229
- cutlass.const_expr(min(128, num_copy_elems_X * mB.element_type.width))
230
- if const_expr(mB is not None)
231
- else 0
232
- )
233
- copy_atom_load_B = (
234
- cute.make_copy_atom(
235
- cute.nvgpu.CopyUniversalOp(), mB.element_type, num_bits_per_copy=num_bits_per_copy_B
236
- )
237
- if const_expr(mB is not None)
238
- else None
239
- )
240
- if const_expr(mRes is not None):
241
- num_copy_bits_Res = const_expr(min(128, num_copy_elems_X * mRes.element_type.width))
242
- copy_atom_load_Res_async = cute.make_copy_atom(
243
- cute.nvgpu.cpasync.CopyG2SOp(),
244
- mRes.element_type,
245
- num_bits_per_copy=num_copy_bits_Res,
246
- )
247
- num_copy_bits_O = const_expr(min(128, num_copy_elems_X * mO.element_type.width))
248
- copy_atom_store_O = cute.make_copy_atom(
249
- cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=num_copy_bits_O
250
- )
251
- if const_expr(mResO is not None):
252
- num_copy_bits_ResO = const_expr(min(128, num_copy_elems_X * mResO.element_type.width))
253
- copy_atom_store_ResO = cute.make_copy_atom(
254
- cute.nvgpu.CopyUniversalOp(),
255
- mResO.element_type,
256
- num_bits_per_copy=num_copy_bits_ResO,
257
- )
137
+ thr_copy_X = tiled_copy.get_slice(tidx)
258
138
 
259
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X_async, tv_layout, tiler_mn).get_slice(
260
- tidx
261
- )
262
-
263
- tXgW = thr_copy_X.partition_S(gW)
139
+ tXgW = thr_copy_X.partition_S(gW) if const_expr(mW is not None) else None
264
140
  tXgB = thr_copy_X.partition_S(gB) if const_expr(mB is not None) else None
265
141
  tXgX = thr_copy_X.partition_S(gX)
266
142
  tXsX = thr_copy_X.partition_D(sX)
@@ -271,34 +147,40 @@ class RMSNorm(ReductionBase):
271
147
  if const_expr(mResO is not None):
272
148
  tXgResO = thr_copy_X.partition_D(gResO)
273
149
  tXrRstd = thr_copy_X.partition_D(gRstd) if const_expr(mRstd is not None) else None
150
+ tXrMean = thr_copy_X.partition_D(gMean) if const_expr(mMean is not None) else None
274
151
  tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
275
152
 
276
153
  # allocate fragments for gmem->rmem
277
- tXrW = cute.make_fragment_like(tXgW)
278
- tXrW.fill(0.0)
154
+ tXrW = cute.make_fragment_like(tXgW) if const_expr(mW is not None) else None
279
155
  tXrB = cute.make_fragment_like(tXgB) if const_expr(mB is not None) else None
280
156
  tXrX, tXrO = [cute.make_fragment_like(t) for t in (tXgX, tXgO)]
281
157
  if const_expr(mRes is not None):
282
158
  tXrRes = cute.make_fragment_like(tXgRes)
283
159
 
284
- num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
160
+ num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
285
161
  self._initialize_cluster(tidx, mbar_ptr, num_warps)
286
162
 
287
- is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
163
+ is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
288
164
  tXpX = (
289
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
165
+ copy_utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
166
+ if not is_even_N
167
+ else None
290
168
  )
169
+ # Each copy will use the same predicate
170
+ copy = partial(copy_utils.copy, pred=tXpX)
171
+
291
172
  row = tXcX[0][0]
292
173
  if row < shape[0]:
293
- cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
174
+ copy(tXgX, tXsX, is_async=True)
294
175
  if const_expr(mRes is not None):
295
- cute.copy(copy_atom_load_Res_async, tXgRes, tXsRes, pred=tXpX)
176
+ copy(tXgRes, tXsRes, is_async=True)
296
177
  cute.arch.cp_async_commit_group()
297
178
 
298
- if const_expr(not delay_w_load):
299
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpX)
179
+ if const_expr(not self.delay_w_load):
180
+ if const_expr(mW is not None):
181
+ copy(tXgW, tXrW)
300
182
  if const_expr(mB is not None):
301
- cute.copy(copy_atom_load_B, tXgB, tXrB, pred=tXpX)
183
+ copy(tXgB, tXrB)
302
184
 
303
185
  cute.arch.cp_async_wait_group(0)
304
186
  cute.autovec_copy(tXsX, tXrX)
@@ -310,19 +192,63 @@ class RMSNorm(ReductionBase):
310
192
  tXrResO = cute.make_fragment_like(tXgResO)
311
193
  tXrResO.store(x.to(tXrResO.element_type))
312
194
  if row < shape[0]:
313
- cute.copy(copy_atom_store_ResO, tXrResO, tXgResO, pred=tXpX)
314
-
315
- threads_per_row = tv_layout.shape[0][0]
316
- sum_sq_x = row_reduce(
317
- x * x,
318
- cute.ReductionOp.ADD,
319
- threads_per_row,
320
- reduction_buffer[None, None, 0],
321
- mbar_ptr,
322
- init_val=0.0,
323
- hook_fn=(cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None),
324
- )
325
- rstd = cute.math.rsqrt(sum_sq_x / shape[1] + eps, fastmath=True)
195
+ copy(tXrResO, tXgResO)
196
+
197
+ mean, rstd = None, None
198
+ if const_expr(self.is_layernorm):
199
+ # LayerNorm: compute mean first, then variance
200
+ sum_x = row_reduce(
201
+ x,
202
+ cute.ReductionOp.ADD,
203
+ threads_per_row,
204
+ reduction_buffer[None, None, 0],
205
+ mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
206
+ init_val=0.0,
207
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
208
+ )
209
+ mean = sum_x / shape[1]
210
+ if const_expr(mMean is not None):
211
+ # Only the thread corresponding to column 0 writes out the mean to gmem
212
+ if (
213
+ tXcX[0][1] == 0
214
+ and row < shape[0]
215
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
216
+ ):
217
+ tXrMean[0] = mean
218
+ if const_expr(self.reload_from == "smem"):
219
+ cute.autovec_copy(tXsX, tXrX)
220
+ x = tXrX.load().to(cute.Float32)
221
+ if const_expr(mRes is not None):
222
+ cute.autovec_copy(tXsRes, tXrRes)
223
+ x += tXrRes.load().to(cute.Float32)
224
+ elif const_expr(self.reload_from == "gmem"):
225
+ copy(tXgX, tXrX)
226
+ x = tXrX.load().to(cute.Float32)
227
+ if const_expr(mRes is not None):
228
+ copy(tXgRes, tXrRes)
229
+ x += tXrRes.load().to(cute.Float32)
230
+ sum_sq_x_sub_mean = row_reduce(
231
+ (x - mean) * (x - mean),
232
+ cute.ReductionOp.ADD,
233
+ threads_per_row,
234
+ reduction_buffer[None, None, 1],
235
+ mbar_ptr + 1 if const_expr(self.cluster_n > 1) else None,
236
+ init_val=0.0,
237
+ )
238
+ rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
239
+ else:
240
+ # RMSNorm: compute sum of squares directly
241
+ mean = const_expr(0.0)
242
+ sum_sq_x = row_reduce(
243
+ x * x,
244
+ cute.ReductionOp.ADD,
245
+ threads_per_row,
246
+ reduction_buffer[None, None, 0],
247
+ mbar_ptr,
248
+ init_val=0.0,
249
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
250
+ )
251
+ rstd = cute.math.rsqrt(sum_sq_x / shape[1] + eps, fastmath=True)
326
252
  if const_expr(mRstd is not None):
327
253
  # Only the thread corresponding to column 0 writes out the rstd to gmem
328
254
  if (
@@ -331,139 +257,114 @@ class RMSNorm(ReductionBase):
331
257
  and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
332
258
  ):
333
259
  tXrRstd[0] = rstd
334
- if const_expr(delay_w_load):
335
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpX)
260
+ if const_expr(self.delay_w_load):
261
+ if const_expr(mW is not None):
262
+ copy(tXgW, tXrW)
336
263
  if const_expr(mB is not None):
337
- cute.copy(copy_atom_load_B, tXgB, tXrB, pred=tXpX)
338
- if const_expr(reload_from == "smem" or reload_from == "gmem"):
339
- if const_expr(reload_from == "smem"):
264
+ copy(tXgB, tXrB)
265
+ if const_expr(self.reload_from == "smem" or self.reload_from == "gmem"):
266
+ if const_expr(self.reload_from == "smem"):
340
267
  cute.autovec_copy(tXsX, tXrX)
268
+ if const_expr(mRes is not None):
269
+ cute.autovec_copy(tXsRes, tXrRes)
341
270
  else:
342
- cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
271
+ copy(tXgX, tXrX)
272
+ if const_expr(mRes is not None):
273
+ copy(tXgRes, tXrRes)
343
274
  x = tXrX.load().to(cute.Float32)
344
275
  if const_expr(mRes is not None):
345
- cute.autovec_copy(tXsRes, tXrRes)
346
276
  x += tXrRes.load().to(cute.Float32)
347
- x_hat = x * rstd
348
- w = tXrW.load().to(cute.Float32)
349
- y = x_hat * w
277
+ x_hat = (x - mean) * rstd if const_expr(self.is_layernorm) else x * rstd
278
+ y = x_hat
279
+ if const_expr(mW is not None):
280
+ y *= tXrW.load().to(cute.Float32)
350
281
  if const_expr(mB is not None):
351
- b = tXrB.load().to(cute.Float32)
352
- y = y + b
282
+ y += tXrB.load().to(cute.Float32)
353
283
  tXrO.store(y.to(tXrO.element_type))
354
284
  if row < shape[0]:
355
- cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tXpX)
285
+ copy(tXrO, tXgO)
356
286
 
357
287
 
358
288
  @torch.library.custom_op(
359
289
  "quack::_rmsnorm_fwd",
360
- mutates_args=("out", "rstd", "residual_out"),
290
+ mutates_args=("out", "rstd", "mean", "residual_out"),
361
291
  device_types="cuda",
362
292
  # We need to specify the schema manually since we're mutating an optional tensor
363
- schema="(Tensor x, Tensor weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor? residual, Tensor(a6!)? residual_out, float eps=1e-6) -> ()",
293
+ schema="(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor(a5!)? mean, Tensor? residual, Tensor(a7!)? residual_out, float eps=1e-6, bool is_layernorm=False) -> ()",
364
294
  )
365
295
  def _rmsnorm_fwd(
366
296
  x: Tensor,
367
- weight: Tensor,
297
+ weight: Optional[Tensor],
368
298
  out: Tensor,
369
299
  bias: Optional[Tensor] = None,
370
300
  rstd: Optional[Tensor] = None,
301
+ mean: Optional[Tensor] = None,
371
302
  residual: Optional[Tensor] = None,
372
303
  residual_out: Optional[Tensor] = None,
373
304
  eps: float = 1e-6,
305
+ is_layernorm: bool = False,
374
306
  ) -> None:
375
- """RMSNorm forward pass.
307
+ """RMSNorm/LayerNorm forward pass.
376
308
  Args:
377
309
  x: Input tensor of shape (M, N)
378
- weight: Weight tensor of shape (N,)
310
+ weight: Optional weight tensor of shape (N,)
379
311
  eps: Small value for numerical stability
312
+ is_layernorm: If True, compute LayerNorm instead of RMSNorm
380
313
  Returns:
381
314
  Normalized output tensor of same shape as x
382
315
  """
383
- assert x.dim() == 2, "Input must be 2D"
384
- assert weight.dim() == 1, "Weight must be 1D"
385
- assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
386
- assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
387
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
388
- assert weight.dtype in [
389
- torch.float32,
390
- torch.bfloat16,
391
- torch.float16,
392
- ], "Weight must be float32, float16 or bfloat16"
316
+ # Don't need to check is_cuda since torch.library ensures that
317
+ supported_types = {torch.float16, torch.bfloat16, torch.float32}
318
+ assert x.dtype in supported_types, "Unsupported dtype"
319
+ if weight is not None:
320
+ assert weight.dtype in supported_types, "Weight must be float32, float16 or bfloat16"
393
321
  if residual is not None:
394
- assert residual.shape == x.shape
395
- assert residual.is_cuda
396
- assert residual.dtype in [
397
- torch.float16,
398
- torch.bfloat16,
399
- torch.float32,
400
- ], "Residual must be float16, bfloat16, or float32"
322
+ assert residual.dtype in supported_types, "Residual must be float16, bfloat16, or float32"
401
323
 
402
324
  _, N = x.shape
403
- device = x.device
404
- dtype = torch2cute_dtype_map[x.dtype]
405
- # convert_from_dlpack = lambda x: (
406
- # from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
407
- # mode=0, divisibility=128 // dtype.width
408
- # )
409
- # )
410
- convert_from_dlpack = lambda x: (
411
- from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
412
- )
413
- x_tensor, res_tensor, out_tensor, res_out_tensor = [
414
- convert_from_dlpack(t) if t is not None else None for t in (x, residual, out, residual_out)
325
+ dtype, out_dtype, weight_dtype, bias_dtype, res_dtype, res_out_dtype = [
326
+ torch2cute_dtype_map[t.dtype] if t is not None else None
327
+ for t in [x, out, weight, bias, residual, residual_out]
415
328
  ]
416
- # handle weight divisibility based on weight dtype
417
- weight_dtype = torch2cute_dtype_map[weight.dtype]
418
- weight_tensor = utils.convert_from_dlpack(
419
- weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
420
- )
421
- if bias is not None:
422
- bias_dtype = torch2cute_dtype_map[bias.dtype]
423
- bias_tensor = utils.convert_from_dlpack(
424
- bias.detach(), leading_dim=0, divisibility=128 // bias_dtype.width
425
- )
426
- else:
427
- bias_tensor = None
428
- rstd_tensor = (
429
- from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
430
- if rstd is not None
431
- else None
432
- )
433
- current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
434
329
  compile_key = (
435
- N,
436
330
  dtype,
437
- res_tensor.element_type if residual is not None else None,
438
- weight_tensor.element_type,
439
- bias_tensor.element_type if bias is not None else None,
440
- res_out_tensor.element_type if residual_out is not None else None,
331
+ out_dtype,
332
+ res_dtype,
333
+ weight_dtype,
334
+ bias_dtype,
335
+ res_out_dtype,
336
+ N,
441
337
  rstd is not None,
338
+ mean is not None,
339
+ is_layernorm,
442
340
  )
443
341
  if compile_key not in _rmsnorm_fwd.compile_cache:
444
- rmsnorm_op = RMSNorm(dtype, N)
342
+ batch_sym = cute.sym_int()
343
+ all_dtypes = [dtype, out_dtype, res_dtype, weight_dtype, bias_dtype, res_out_dtype]
344
+ div = math.gcd(N, *(128 // dt.width for dt in all_dtypes if dt is not None))
345
+ x_cute, out_cute, res_cute, res_out_cute = [
346
+ fake_tensor(dt, (batch_sym, N), div)
347
+ for dt in [dtype, out_dtype, res_dtype, res_out_dtype]
348
+ ]
349
+ weight_cute, bias_cute = [fake_tensor(dt, (N,), div) for dt in [weight_dtype, bias_dtype]]
350
+ rstd_cute = fake_tensor(Float32, (batch_sym,)) if rstd is not None else None
351
+ mean_cute = fake_tensor(Float32, (batch_sym,)) if mean is not None else None
445
352
  _rmsnorm_fwd.compile_cache[compile_key] = cute.compile(
446
- rmsnorm_op,
447
- x_tensor,
448
- weight_tensor,
449
- bias_tensor,
450
- res_tensor,
451
- out_tensor,
452
- res_out_tensor,
453
- rstd_tensor,
454
- current_stream,
455
- eps,
353
+ RMSNorm(dtype, N, is_layernorm=is_layernorm),
354
+ x_cute,
355
+ weight_cute,
356
+ bias_cute,
357
+ res_cute,
358
+ out_cute,
359
+ res_out_cute,
360
+ rstd_cute,
361
+ mean_cute,
362
+ Float32(0), # eps, just for compilation
363
+ cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
364
+ options="--enable-tvm-ffi",
456
365
  )
457
366
  _rmsnorm_fwd.compile_cache[compile_key](
458
- x_tensor,
459
- weight_tensor,
460
- bias_tensor,
461
- res_tensor,
462
- out_tensor,
463
- res_out_tensor,
464
- rstd_tensor,
465
- current_stream,
466
- eps,
367
+ x, weight, bias, residual, out, residual_out, rstd, mean, eps
467
368
  )
468
369
 
469
370
 
@@ -472,7 +373,7 @@ _rmsnorm_fwd.compile_cache = {}
472
373
 
473
374
  def rmsnorm_fwd(
474
375
  x: Tensor,
475
- weight: Tensor,
376
+ weight: Optional[Tensor] = None,
476
377
  bias: Optional[Tensor] = None,
477
378
  residual: Optional[Tensor] = None,
478
379
  out_dtype: Optional[torch.dtype] = None,
@@ -494,19 +395,20 @@ def rmsnorm_fwd(
494
395
  )
495
396
  else:
496
397
  residual_out = None
497
- _rmsnorm_fwd(x, weight, out, bias, rstd, residual, residual_out, eps=eps)
398
+ _rmsnorm_fwd(x, weight, out, bias, rstd, None, residual, residual_out, eps, False)
498
399
  # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
499
400
  if residual_out is None:
500
401
  residual_out = x
501
402
  return out, residual_out, rstd
502
403
 
503
404
 
504
- def rmsnorm_ref(x, w, bias=None, residual=None, eps=1e-6):
405
+ def rmsnorm_ref(x, w=None, bias=None, residual=None, eps=1e-6):
505
406
  x_f32 = x.float()
506
407
  if residual is not None:
507
408
  residual_f32 = residual.float()
508
409
  x_f32 += residual_f32
509
- out = x_f32 / (torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps)) * w
410
+ x_norm = x_f32 / (torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps))
411
+ out = x_norm * w if w is not None else x_norm
510
412
  if bias is not None:
511
413
  out = out + bias.float()
512
414
  if residual is None:
@@ -519,13 +421,19 @@ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
519
421
  """Reference implementation for RMSNorm backward pass."""
520
422
  x_f32 = x.float()
521
423
  x_hat = x_f32 * rstd.unsqueeze(1)
522
- wdy = dout * w
424
+ if w is not None:
425
+ wdy = dout * w
426
+ else:
427
+ wdy = dout
523
428
  c1 = (x_hat * wdy).mean(dim=-1, keepdim=True)
524
429
  dx = (wdy - x_hat * c1) * rstd.unsqueeze(1)
525
430
 
526
431
  # dL/dW
527
- dw = (dout * x_hat).sum(dim=0)
528
- return dx.to(x.dtype), dw.to(w.dtype)
432
+ if w is not None:
433
+ dw = (dout * x_hat).sum(dim=0)
434
+ return dx.to(x.dtype), dw.to(w.dtype)
435
+ else:
436
+ return dx.to(x.dtype), None
529
437
 
530
438
 
531
439
  class RMSNormBackward(ReductionBase):
@@ -537,91 +445,57 @@ class RMSNormBackward(ReductionBase):
537
445
  # Not enough smem
538
446
  raise ValueError("RMSNormBackward does not support N > 128k with dtype >= 32 bits")
539
447
 
540
- def _get_num_threads(self):
448
+ def _num_threads(self):
541
449
  return 128 if self.N <= 4096 else 256
542
450
 
543
- def _calculate_threads_per_row(self):
451
+ def _threads_per_row(self):
544
452
  N = self.N
545
- return (
546
- 8
547
- if N <= 64
548
- else (
549
- 16
550
- if N <= 128
551
- else (32 if N <= 256 else (64 if N <= 512 else (128 if N <= 4096 else 256)))
552
- )
553
- )
453
+ for limit, threads in [(64, 8), (128, 16), (256, 32), (512, 64), (4096, 128)]:
454
+ if N <= limit:
455
+ return threads
456
+ return 256
554
457
 
555
458
  def _set_cluster_n(self):
556
459
  N = self.N
557
- cluster_n = (
558
- 1
559
- if N <= 8 * 1024
560
- else (2 if N <= 16 * 1024 else (4 if N <= 32 * 1024 else (8 if N <= 64 * 1024 else 16)))
561
- )
562
- self.cluster_n = cluster_n
563
-
564
- def _smem_size_in_bytes(self, tiler_mn, num_warps, do_dtype=None):
565
- if do_dtype is None:
566
- do_dtype = self.dtype
567
- return (
568
- # We need space for X and dO, and multiply by 2 due to double buffering
569
- cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
570
- + cute.size_in_bytes(do_dtype, cute.make_layout(tiler_mn)) * 2
571
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
572
- + self.stage * (cutlass.Int64.width // 8) * 2 # mult 2 as we need 2 mbar per stage
573
- )
460
+ for limit, cluster in [(8 * 1024, 1), (16 * 1024, 2), (32 * 1024, 4), (64 * 1024, 8)]:
461
+ if N <= limit:
462
+ self.cluster_n = cluster
463
+ return
464
+ self.cluster_n = 16
574
465
 
575
466
  @cute.jit
576
467
  def __call__(
577
468
  self,
578
469
  mX: cute.Tensor,
579
- mW: cute.Tensor,
470
+ mW: Optional[cute.Tensor],
580
471
  mdO: cute.Tensor,
581
472
  mdResO: Optional[cute.Tensor],
582
473
  mRstd: cute.Tensor,
583
474
  mdX: cute.Tensor,
584
- mdW: cute.Tensor,
475
+ mdW: Optional[cute.Tensor],
585
476
  mdRes: Optional[cute.Tensor],
586
477
  mdB: Optional[cute.Tensor],
587
478
  sm_count: Int32,
588
479
  stream: cuda.CUstream,
589
480
  ):
590
- semistatic_shape = (*mX.shape[:-1], self.N) # Set last dimension to be statically N
591
- new_stride = lambda t: (
592
- cute.assume(t.stride[0], divby=128 // t.element_type.width),
593
- t.stride[1],
594
- )
595
- mX, mdO, mdResO, mdX, mdRes = [
596
- cute.make_tensor(t.iterator, cute.make_layout(semistatic_shape, stride=new_stride(t)))
597
- if const_expr(t is not None)
598
- else None
599
- for t in (mX, mdO, mdResO, mdX, mdRes)
600
- ]
481
+ assert mX.element_type == self.dtype
601
482
  self._set_cluster_n()
602
483
  largest_dtype_width = const_expr(
603
- max(
604
- mX.element_type.width,
605
- mdO.element_type.width,
606
- mdX.element_type.width,
607
- mdResO.element_type.width if mdResO is not None else 0,
608
- mdRes.element_type.width if mdRes is not None else 0,
609
- )
484
+ max(*(t.element_type.width for t in [mX, mW, mdO, mdResO, mdX, mdRes] if t is not None))
610
485
  )
611
- tiler_mn, tv_layout = self._get_tv_layout(
612
- num_copy_bits=128 // largest_dtype_width * mX.element_type.width
486
+ vecsize = math.gcd(self.N, 128 // largest_dtype_width)
487
+ tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(vecsize=vecsize)
488
+ num_threads = tiled_copy.size
489
+ mW = (
490
+ layout_utils.expand(mW, dim=0, size=tiler_mn[0]) if const_expr(mW is not None) else None
613
491
  )
614
- num_threads = cute.size(tv_layout, mode=[0])
615
- num_warps = num_threads // cute.arch.WARP_SIZE
616
- mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
617
- mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
618
-
619
492
  num_blocks = sm_count
620
- self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn).launch(
493
+ self.kernel(
494
+ mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tiler_mn, tiled_copy, threads_per_row
495
+ ).launch(
621
496
  grid=[num_blocks, self.cluster_n, 1],
622
497
  block=[num_threads, 1, 1],
623
498
  cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
624
- smem=self._smem_size_in_bytes(tiler_mn, num_warps, do_dtype=mdO.element_type),
625
499
  stream=stream,
626
500
  )
627
501
 
@@ -629,24 +503,23 @@ class RMSNormBackward(ReductionBase):
629
503
  def kernel(
630
504
  self,
631
505
  mX: cute.Tensor,
632
- mW: cute.Tensor,
506
+ mW: Optional[cute.Tensor],
633
507
  mdO: cute.Tensor,
634
508
  mdResO: Optional[cute.Tensor],
635
509
  mRstd: cute.Tensor,
636
510
  mdX: cute.Tensor,
637
- mdW: cute.Tensor,
511
+ mdW: Optional[cute.Tensor],
638
512
  mdB: Optional[cute.Tensor],
639
513
  mdRes: Optional[cute.Tensor],
640
- tv_layout: cute.Layout,
641
514
  tiler_mn: cute.Shape,
515
+ tiled_copy: cute.TiledCopy,
516
+ threads_per_row: cutlass.Constexpr[int],
642
517
  ):
643
518
  tidx, _, _ = cute.arch.thread_idx()
644
519
  bidx_start, _, _ = cute.arch.block_idx()
645
520
  gdim, _, _ = cute.arch.grid_dim()
646
- if const_expr(self.cluster_n > 1):
647
- cluster_y = cute.arch.block_idx()[1]
648
- else:
649
- cluster_y = const_expr(0)
521
+ cluster_y = const_expr(0) if const_expr(self.cluster_n == 1) else cute.arch.block_idx()[1]
522
+ tv_layout = tiled_copy.layout_tv_tiled
650
523
 
651
524
  shape = mX.shape
652
525
  M, N = shape[0], shape[1]
@@ -666,103 +539,20 @@ class RMSNormBackward(ReductionBase):
666
539
  else:
667
540
  mbar_full_ptr, mbar_empty_ptr = None, None
668
541
 
669
- num_copy_elems_X = tv_layout.shape[1][0]
670
- num_copy_bits_X = mX.element_type.width * num_copy_elems_X
671
- copy_atom_load_X = cute.make_copy_atom(
672
- cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
673
- )
674
- copy_atom_load_X_async = cute.make_copy_atom(
675
- cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
676
- )
677
- num_copy_bits_dO = const_expr(min(128, num_copy_elems_X * mdO.element_type.width))
678
- copy_atom_load_dO_async = cute.make_copy_atom(
679
- cute.nvgpu.cpasync.CopyG2SOp(), mdO.element_type, num_bits_per_copy=num_copy_bits_dO
680
- )
681
- num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
682
- copy_atom_load_W = cute.make_copy_atom(
683
- cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
684
- )
685
- if const_expr(mdResO is not None):
686
- num_copy_bits_dResO = const_expr(min(128, num_copy_elems_X * mdResO.element_type.width))
687
- copy_atom_load_dResO = cute.make_copy_atom(
688
- cute.nvgpu.CopyUniversalOp(),
689
- mdResO.element_type,
690
- num_bits_per_copy=num_copy_bits_dResO,
691
- )
692
- num_copy_bits_dX = const_expr(min(128, num_copy_elems_X * mdX.element_type.width))
693
- copy_atom_store_dX = cute.make_copy_atom(
694
- cute.nvgpu.CopyUniversalOp(), mdX.element_type, num_bits_per_copy=num_copy_bits_dX
695
- )
696
- num_copy_bits_dW = const_expr(min(128, num_copy_elems_X * mdW.element_type.width))
697
- copy_atom_store_dW = cute.make_copy_atom(
698
- cute.nvgpu.CopyUniversalOp(), mdW.element_type, num_bits_per_copy=num_copy_bits_dW
699
- )
700
- if const_expr(mdB is not None):
701
- num_copy_bits_dB = const_expr(min(128, num_copy_elems_X * mdB.element_type.width))
702
- copy_atom_store_dB = cute.make_copy_atom(
703
- cute.nvgpu.CopyUniversalOp(), mdB.element_type, num_bits_per_copy=num_copy_bits_dB
704
- )
705
- if const_expr(mdRes is not None):
706
- num_copy_bits_dRes = const_expr(min(128, num_copy_elems_X * mdRes.element_type.width))
707
- copy_atom_load_dRes = cute.make_copy_atom(
708
- cute.nvgpu.CopyUniversalOp(),
709
- mdRes.element_type,
710
- num_bits_per_copy=num_copy_bits_dRes,
711
- )
712
-
713
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
714
-
715
- gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
716
- tXgW = thr_copy_X.partition_S(gW)
717
- tXrW = cute.make_fragment_like(tXgW)
718
- # Need this, otherwise rW can have arbitrary values that changes the reduction
719
- if not is_even_N:
720
- tXrW.fill(0.0)
721
-
722
- gW_coord = cute.local_tile(idX, tiler_mn, (0, cluster_y))
723
- tXpW = (
724
- utils.predicate_k(thr_copy_X.partition_S(gW_coord), limit=shape[1])
725
- if not is_even_N
726
- else None
727
- )
728
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpW)
729
- weight = tXrW.load().to(cute.Float32)
730
-
731
- num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
732
-
733
- self._initialize_cluster(tidx, mbar_ptr, num_warps, is_persistent=True)
734
-
735
- dw_coord = cute.local_tile(idX, tiler_mn, (0, cluster_y))
736
- tXpdW = (
737
- utils.predicate_k(thr_copy_X.partition_S(dw_coord), limit=shape[1])
738
- if not is_even_N
739
- else None
740
- )
741
- if const_expr(mdB is not None):
742
- db_coord = cute.local_tile(idX, tiler_mn, (0, cluster_y))
743
- tXpdB = (
744
- utils.predicate_k(thr_copy_X.partition_S(db_coord), limit=shape[1])
745
- if not is_even_N
746
- else None
747
- )
748
-
749
- gdW = cute.local_tile(mdW, (1, tiler_mn[1]), (bidx_start, cluster_y))
750
- tXgdW = thr_copy_X.partition_S(gdW)
751
- # Always compute partial weight gradients in fp32
752
- tXrdW = cute.make_fragment_like(tXgdW, Float32)
753
-
754
- gdB = (
755
- cute.local_tile(mdB, (1, tiler_mn[1]), (bidx_start, cluster_y))
756
- if const_expr(mdB is not None)
757
- else None
758
- )
759
- tXgdB = thr_copy_X.partition_S(gdB) if const_expr(mdB is not None) else None
760
- tXrdB = cute.make_fragment_like(tXgdB, Float32) if const_expr(mdB is not None) else None
542
+ thr_copy_X = tiled_copy.get_slice(tidx)
761
543
 
762
544
  gX, gdO, gdResO, gdX, gdRes, cX = [
763
545
  cute.local_tile(mT, tiler_mn, (None, cluster_y)) if mT is not None else None
764
546
  for mT in (mX, mdO, mdResO, mdX, mdRes, idX)
765
547
  ]
548
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y)) if mW is not None else None
549
+ gdW, gdB = [
550
+ cute.local_tile(mT, (1, tiler_mn[1]), (bidx_start, cluster_y))
551
+ if const_expr(mT is not None)
552
+ else None
553
+ for mT in (mdW, mdB)
554
+ ]
555
+
766
556
  tXgX = thr_copy_X.partition_S(gX)
767
557
  tXsX = thr_copy_X.partition_D(sX)
768
558
  tXgdO = thr_copy_X.partition_S(gdO)
@@ -773,12 +563,6 @@ class RMSNormBackward(ReductionBase):
773
563
  if const_expr(mdRes is not None):
774
564
  tXgdRes = thr_copy_X.partition_D(gdRes)
775
565
  tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None, None]
776
- # This doesn't change across iterations
777
- tXpX = (
778
- utils.predicate_k(thr_copy_X.partition_S(cX[None, None, 0]), limit=shape[1])
779
- if not is_even_N
780
- else None
781
- )
782
566
 
783
567
  tXrX, tXrdO, tXrdX = [
784
568
  cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdO, tXgdX)
@@ -790,28 +574,57 @@ class RMSNormBackward(ReductionBase):
790
574
  if const_expr(mdRes is not None):
791
575
  tXrdRes = cute.make_fragment_like(tXgdRes[None, None, None, 0])
792
576
 
793
- copy_X = partial(cute.copy, copy_atom_load_X_async, pred=tXpX)
794
- copy_dO = partial(cute.copy, copy_atom_load_dO_async, pred=tXpX)
577
+ # This doesn't change across iterations
578
+ tXpX = (
579
+ None
580
+ if is_even_N
581
+ else copy_utils.predicate_k(thr_copy_X.partition_S(cX[None, None, 0]), limit=shape[1])
582
+ )
583
+ # Each copy will use the same number of elements as X
584
+ copy = partial(copy_utils.copy, pred=tXpX)
585
+
586
+ tXgdW, tXrdW = None, None
587
+ tXgdB, tXrdB = None, None
588
+ if const_expr(mdW is not None):
589
+ tXgdW = thr_copy_X.partition_S(gdW)
590
+ # Always compute partial weight gradients in fp32
591
+ tXrdW = cute.make_fragment_like(tXgdW, Float32)
592
+ if const_expr(mdB is not None):
593
+ tXgdB = thr_copy_X.partition_S(gdB)
594
+ # Always compute partial bias gradients in fp32
595
+ tXrdB = cute.make_fragment_like(tXgdB, Float32)
596
+
597
+ num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
598
+
599
+ self._initialize_cluster(tidx, mbar_ptr, num_warps, is_persistent=True)
600
+
601
+ tXrW = None
602
+ if const_expr(mW is not None):
603
+ tXgW = thr_copy_X.partition_S(gW)
604
+ tXrW = cute.make_fragment_like(tXgW)
605
+ # Need this, otherwise rW can have arbitrary values that changes the reduction
606
+ if const_expr(not is_even_N):
607
+ tXrW.fill(0.0)
608
+ copy(tXgW, tXrW)
795
609
 
796
610
  # Prefetch the first batch
797
611
  row = tXcX[None, None, None, bidx_start][0][0]
798
612
  if row < M:
799
- tXgX_cur = utils.coord_offset_i64(bidx_start, tXgX, dim=3)[None, None, None, 0]
800
- tXgdO_cur = utils.coord_offset_i64(bidx_start, tXgdO, dim=3)[None, None, None, 0]
801
- copy_X(tXgX_cur, tXsX[None, None, None, 0])
802
- copy_dO(tXgdO_cur, tXsdO[None, None, None, 0])
803
- elif tiler_mn[0] > 1:
804
- # Fill with zero, otherwise smem will be uninitialized, and we could read this back
805
- # later into registers, causing wrong dW.
806
- utils.fill_oob(tXsX[None, None, None, 0], None, fill_value=mX.element_type.zero)
807
- utils.fill_oob(tXsdO[None, None, None, 0], None, fill_value=mdO.element_type.zero)
613
+ copy(tXgX[None, None, None, bidx_start], tXsX[None, None, None, 0], is_async=True)
614
+ copy(tXgdO[None, None, None, bidx_start], tXsdO[None, None, None, 0], is_async=True)
615
+ else:
616
+ if const_expr(tiler_mn[0] > 1):
617
+ # Fill with zero, otherwise smem will be uninitialized, and we could read this back
618
+ # later into registers, causing wrong dW.
619
+ utils.fill_oob(tXsX[None, None, None, 0], None, fill_value=mX.element_type.zero)
620
+ utils.fill_oob(tXsdO[None, None, None, 0], None, fill_value=mdO.element_type.zero)
808
621
  cute.arch.cp_async_commit_group()
809
622
 
810
623
  if const_expr(self.cluster_n > 1):
811
624
  cute.arch.cluster_wait()
812
625
 
813
- threads_per_row = tv_layout.shape[0][0]
814
- tXrdW.fill(0.0)
626
+ if const_expr(mdW is not None):
627
+ tXrdW.fill(0.0)
815
628
  if const_expr(mdB is not None):
816
629
  tXrdB.fill(0.0)
817
630
  stage = Int32(0)
@@ -820,29 +633,31 @@ class RMSNormBackward(ReductionBase):
820
633
  for bidx in cutlass.range(bidx_start, cute.ceil_div(M, tiler_mn[0]), gdim):
821
634
  row = tXcX[None, None, None, bidx][0][0]
822
635
  if row + gdim * tiler_mn[0] < M: # Prefetch the next batch
823
- tXgX_cur = utils.coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
824
- tXgdO_cur = utils.coord_offset_i64(bidx + gdim, tXgdO, dim=3)[None, None, None, 0]
825
- copy_X(tXgX_cur, tXsX[None, None, None, stage ^ 1])
826
- copy_dO(tXgdO_cur, tXsdO[None, None, None, stage ^ 1])
827
- elif tiler_mn[0] > 1:
828
- utils.fill_oob(
636
+ copy(
637
+ tXgX[None, None, None, bidx + gdim],
829
638
  tXsX[None, None, None, stage ^ 1],
830
- None,
831
- fill_value=mX.element_type.zero,
639
+ is_async=True,
832
640
  )
833
- utils.fill_oob(
641
+ copy(
642
+ tXgdO[None, None, None, bidx + gdim],
834
643
  tXsdO[None, None, None, stage ^ 1],
835
- None,
836
- fill_value=mdO.element_type.zero,
644
+ is_async=True,
837
645
  )
646
+ else:
647
+ if const_expr(tiler_mn[0] > 1):
648
+ utils.fill_oob(
649
+ tXsX[None, None, None, stage ^ 1], None, fill_value=mX.element_type.zero
650
+ )
651
+ utils.fill_oob(
652
+ tXsdO[None, None, None, stage ^ 1], None, fill_value=mdO.element_type.zero
653
+ )
838
654
  cute.arch.cp_async_commit_group()
839
655
  rstd = cutlass.Float.zero
840
656
  if row < M or tiler_mn[0] == 1:
841
657
  rstd = mRstd[row]
842
658
  if const_expr(mdResO is not None):
843
- tXgdResO_cur = utils.coord_offset_i64(bidx, tXgdResO, dim=3)[None, None, None, 0]
844
659
  if row < M or tiler_mn[0] == 1:
845
- cute.copy(copy_atom_load_dResO, tXgdResO_cur, tXrdResO, pred=tXpX)
660
+ copy(tXgdResO[None, None, None, bidx], tXrdResO)
846
661
  elif tiler_mn[0] > 1:
847
662
  tXrdResO.fill(0.0)
848
663
  cute.arch.cp_async_wait_group(1)
@@ -850,10 +665,10 @@ class RMSNormBackward(ReductionBase):
850
665
  x = tXrX.load().to(cute.Float32)
851
666
  cute.autovec_copy(tXsdO[None, None, None, stage], tXrdO)
852
667
  dout = tXrdO.load().to(cute.Float32)
853
- if const_expr(mdResO is not None):
854
- dout += tXrdResO.load().to(cute.Float32)
855
668
  x_hat = x * rstd
856
- wdy = dout * weight
669
+ wdy = dout
670
+ if const_expr(mW is not None):
671
+ wdy *= tXrW.load().to(Float32)
857
672
  if const_expr(self.cluster_n > 1):
858
673
  cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
859
674
  mean_xhat_wdy = (
@@ -870,6 +685,10 @@ class RMSNormBackward(ReductionBase):
870
685
  )
871
686
 
872
687
  if const_expr(self.cluster_n > 1):
688
+ # Need this fence since the STAS from the producer is using the async proxy.
689
+ cute.arch.fence_proxy(
690
+ cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
691
+ )
873
692
  # It's faster to have 1 lane per warp to signal the mbar, rather than all lanes
874
693
  # Requires adjusting the thread_count when initializing the mbar
875
694
  cute.arch.sync_warp()
@@ -882,22 +701,22 @@ class RMSNormBackward(ReductionBase):
882
701
  if const_expr(self.reload_wdy == "smem"):
883
702
  cute.autovec_copy(tXsdO[None, None, None, stage], tXrdO)
884
703
  dout = tXrdO.load().to(cute.Float32)
885
- if const_expr(mdResO is not None):
886
- dout += tXrdResO.load().to(cute.Float32)
887
- wdy = dout * weight
704
+ wdy = dout
705
+ if const_expr(mW is not None):
706
+ wdy *= tXrW.load().to(Float32)
888
707
 
889
708
  dx = (wdy - x_hat * mean_xhat_wdy) * rstd
709
+ if const_expr(mdResO is not None):
710
+ dx += tXrdResO.load().to(cute.Float32)
890
711
  tXrdX.store(dx.to(tXrdX.element_type))
891
712
  if row < M or tiler_mn[0] == 1:
892
- tXgdX_cur = utils.coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
893
- cute.copy(copy_atom_store_dX, tXrdX, tXgdX_cur, pred=tXpX)
713
+ copy(tXrdX, tXgdX[None, None, None, bidx])
894
714
  if const_expr(mdRes is not None):
895
715
  tXrdRes.store(dx.to(tXrdRes.element_type))
896
- tXgdRes_cur = utils.coord_offset_i64(bidx, tXgdRes, dim=3)[None, None, None, 0]
897
716
  if row < M or tiler_mn[0] == 1:
898
- cute.copy(copy_atom_load_dRes, tXrdRes, tXgdRes_cur, pred=tXpX)
899
- # Accumulate weight gradients in fp32
900
- tXrdW.store(tXrdW.load() + dout * x_hat)
717
+ copy(tXrdRes, tXgdRes[None, None, None, bidx])
718
+ if const_expr(mdW is not None):
719
+ tXrdW.store(tXrdW.load() + dout * x_hat)
901
720
  if const_expr(mdB is not None):
902
721
  tXrdB.store(tXrdB.load() + dout)
903
722
 
@@ -906,29 +725,29 @@ class RMSNormBackward(ReductionBase):
906
725
  consumer_phase ^= 1
907
726
  producer_phase ^= 1
908
727
 
909
- if const_expr(self.cluster_n > 1): # Prevent cluster from exiting early
910
- cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
911
-
912
728
  if const_expr(tiler_mn[0] > 1):
913
- # reduction of dw_partial within the same threadblock
914
- sdW = cute.make_tensor(
915
- cute.recast_ptr(sX.iterator, dtype=cute.Float32),
916
- cute.make_ordered_layout(tiler_mn, order=(1, 0)),
917
- )
918
- tXsdW = thr_copy_X.partition_D(sdW)
919
- cute.arch.barrier()
920
- row = tXcX[None, None, None, 0][0][0]
921
- if row > 0:
922
- cute.autovec_copy(tXrdW, tXsdW)
923
- cute.arch.barrier()
924
- if row == 0:
925
- for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
926
- tXrdW_other = cute.make_fragment_like(tXrdW)
927
- tXsdW_other = cute.make_tensor(tXsdW.iterator + i * sdW.stride[0], tXsdW.layout)
928
- cute.autovec_copy(tXsdW_other, tXrdW_other)
929
- tXrdW.store(tXrdW.load() + tXrdW_other.load())
930
- cute.copy(copy_atom_store_dW, tXrdW, tXgdW, pred=tXpdW)
931
- cute.arch.barrier()
729
+ if const_expr(mdW is not None):
730
+ # reduction of dw_partial within the same threadblock
731
+ sdW = cute.make_tensor(
732
+ cute.recast_ptr(sX.iterator, dtype=cute.Float32),
733
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
734
+ )
735
+ tXsdW = thr_copy_X.partition_D(sdW)
736
+ cute.arch.barrier()
737
+ row = tXcX[None, None, None, 0][0][0]
738
+ if row > 0:
739
+ cute.autovec_copy(tXrdW, tXsdW)
740
+ cute.arch.barrier()
741
+ if row == 0:
742
+ for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
743
+ tXrdW_other = cute.make_fragment_like(tXrdW)
744
+ tXsdW_other = cute.make_tensor(
745
+ tXsdW.iterator + i * sdW.stride[0], tXsdW.layout
746
+ )
747
+ cute.autovec_copy(tXsdW_other, tXrdW_other)
748
+ tXrdW.store(tXrdW.load() + tXrdW_other.load())
749
+ copy(tXrdW, tXgdW)
750
+ cute.arch.barrier()
932
751
  if const_expr(mdB is not None):
933
752
  sdB = cute.make_tensor(
934
753
  cute.recast_ptr(sX.iterator, dtype=cute.Float32),
@@ -948,12 +767,21 @@ class RMSNormBackward(ReductionBase):
948
767
  )
949
768
  cute.autovec_copy(tXsdB_other, tXrdB_other)
950
769
  tXrdB.store(tXrdB.load() + tXrdB_other.load())
951
- cute.copy(copy_atom_store_dB, tXrdB, tXgdB, pred=tXpdB)
770
+ copy(tXrdB, tXgdB)
952
771
  else:
953
772
  # dw is already in fp32, so we can directly copy to global memory
954
- cute.copy(copy_atom_store_dW, tXrdW, tXgdW, pred=tXpdW)
773
+ if const_expr(mdW is not None):
774
+ copy(tXrdW, tXgdW)
955
775
  if const_expr(mdB is not None):
956
- cute.copy(copy_atom_store_dB, tXrdB, tXgdB, pred=tXpdB)
776
+ copy(tXrdB, tXgdB)
777
+
778
+ if const_expr(self.cluster_n > 1): # Prevent cluster from exiting early
779
+ # Assume state contains that next useful buffer
780
+ # So we only need to advance to num_stages - 1 times to last used buffer
781
+ stage ^= 1
782
+ if stage == 0:
783
+ producer_phase ^= 1
784
+ cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
957
785
 
958
786
 
959
787
  def _get_sm_count(N: int, device: torch.device) -> int:
@@ -978,120 +806,103 @@ def _get_sm_count(N: int, device: torch.device) -> int:
978
806
  mutates_args={"dx", "dw_partial", "db_partial", "dresidual"},
979
807
  device_types="cuda",
980
808
  # We need to specify the schema manually since we're mutating an optional tensor
981
- schema="(Tensor x, Tensor weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!) dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual) -> ()",
809
+ schema="(Tensor x, Tensor? weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!)? dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual, int? sm_count) -> ()",
982
810
  )
983
811
  def _rmsnorm_bwd(
984
812
  x: Tensor,
985
- weight: Tensor,
813
+ weight: Optional[Tensor],
986
814
  dout: Tensor,
987
815
  rstd: Tensor,
988
816
  dx: Tensor,
989
- dw_partial: Tensor,
817
+ dw_partial: Optional[Tensor],
990
818
  db_partial: Optional[Tensor] = None,
991
819
  dresidual_out: Optional[Tensor] = None,
992
820
  dresidual: Optional[Tensor] = None,
821
+ sm_count: Optional[int] = None,
993
822
  ) -> None:
994
823
  """RMSNorm backward pass.
995
824
  Args:
996
825
  x: Input tensor of shape (M, N)
997
- weight: Weight tensor of shape (N,)
826
+ weight: Optional weight tensor of shape (N,)
998
827
  dout: Upstream gradients tensor of shape (M, N)
999
828
  rstd: Reciprocal standard deviation tensor of shape (M,)
1000
829
  Returns:
1001
830
  Tuple of (dx, dw) where:
1002
831
  - dx: Input gradients tensor of same shape as x
1003
- - dw: Weight gradients tensor of same shape as weight
832
+ - dw: Weight gradients tensor of same shape as weight (or None if weight is None)
1004
833
  """
1005
834
  assert x.dim() == 2, "Input must be 2D"
1006
- assert weight.dim() == 1, "Weight must be 1D"
1007
- assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
1008
- assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
1009
- assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
1010
- assert weight.dtype in [
1011
- torch.float32,
1012
- torch.bfloat16,
1013
- torch.float16,
1014
- ], "Weight must be float32, float16 or bfloat16"
835
+ assert x.is_cuda, "Input tensor must be on CUDA device"
+ supported_types = {torch.float16, torch.bfloat16, torch.float32}
+ assert x.dtype in supported_types, "Unsupported dtype"
+ if weight is not None:
+ assert weight.dim() == 1, "Weight must be 1D"
+ assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
+ assert weight.is_cuda, "Weight tensor must be on CUDA device"
+ assert weight.dtype in supported_types, "Weight must be float32, float16 or bfloat16"
  if dresidual_out is not None:
  assert dresidual_out.shape == x.shape
  assert dresidual_out.is_cuda
- assert dresidual_out.dtype in [
- torch.float16,
- torch.bfloat16,
- torch.float32,
- ], "Residual must be float16, bfloat16, or float32"
+ assert dresidual_out.dtype in supported_types, (
+ "Residual must be float16, bfloat16, or float32"
+ )
  if dresidual is not None:
  assert dresidual.shape == x.shape
  assert dresidual.is_cuda
- assert dresidual.dtype in [
- torch.float16,
- torch.bfloat16,
- torch.float32,
- ], "Residual must be float16, bfloat16, or float32"
+ assert dresidual.dtype in supported_types, "Residual must be float16, bfloat16, or float32"
 
  N = x.size(1)
- device = x.device
- sm_count = dw_partial.shape[0]
- convert_from_dlpack = lambda x: (
- from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
- )
- x_tensor, dout_tensor, dres_out_tensor, dx_tensor, dres_tensor = [
- convert_from_dlpack(t) if t is not None else None
- for t in (x, dout, dresidual_out, dx, dresidual)
+ if dw_partial is None and db_partial is None:
+ assert sm_count is not None
+ else:
+ sm_count = dw_partial.shape[0] if dw_partial is not None else db_partial.shape[0]
+ dtype, dout_dtype, dx_dtype, weight_dtype, dres_dtype, dres_out_dtype = [
+ torch2cute_dtype_map[t.dtype] if t is not None else None
+ for t in [x, dout, dx, weight, dresidual, dresidual_out]
  ]
- # Handle weight div based on weight dtype
- weight_dtype = torch2cute_dtype_map[weight.dtype]
- weight_tensor = utils.convert_from_dlpack(
- weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
- )
-
- dw_partial_tensor = from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
- db_partial_tensor = (
- from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
- if db_partial is not None
- else None
- )
- rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
-
- current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-
  compile_key = (
  N,
- x_tensor.element_type,
- weight_tensor.element_type,
- db_partial.dtype if db_partial is not None else None,
- dresidual.dtype if dresidual is not None else None,
- dresidual_out.dtype if dresidual_out is not None else None,
+ dtype,
+ dout_dtype,
+ dx_dtype,
+ weight_dtype,
+ db_partial is not None,
+ dres_dtype,
+ dres_out_dtype,
  )
  if compile_key not in _rmsnorm_bwd.compile_cache:
- rmsnorm_backward_op = RMSNormBackward(x_tensor.element_type, N)
+ batch_sym, batch_partial_sym = cute.sym_int(), cute.sym_int()
+ all_dtypes = [dtype, dout_dtype, dx_dtype, dres_dtype, dres_out_dtype]
+ div = math.gcd(N, *(128 // dt.width for dt in all_dtypes if dt is not None))
+ x_cute, dout_cute, dx_cute, dres_out_cute, dres_cute = [
+ fake_tensor(dt, (batch_sym, N), div)
+ for dt in [dtype, dout_dtype, dx_dtype, dres_out_dtype, dres_dtype]
+ ]
+ weight_cute = fake_tensor(weight_dtype, (N,), div)
+ rstd_cute = fake_tensor(Float32, (batch_sym,))
+ dw_partial_cute = (
+ fake_tensor(Float32, (batch_partial_sym, N), div) if dw_partial is not None else None
+ )
+ db_partial_cute = (
+ fake_tensor(Float32, (batch_partial_sym, N), div) if db_partial is not None else None
+ )
  _rmsnorm_bwd.compile_cache[compile_key] = cute.compile(
- rmsnorm_backward_op,
- x_tensor,
- weight_tensor,
- dout_tensor,
- dres_out_tensor,
- rstd_tensor,
- dx_tensor,
- dw_partial_tensor,
- dres_tensor,
- db_partial_tensor,
+ RMSNormBackward(dtype, N),
+ x_cute,
+ weight_cute,
+ dout_cute,
+ dres_out_cute,
+ rstd_cute,
+ dx_cute,
+ dw_partial_cute,
+ dres_cute,
+ db_partial_cute,
  sm_count,
- current_stream,
+ cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+ options="--enable-tvm-ffi",
  )
-
  _rmsnorm_bwd.compile_cache[compile_key](
- x_tensor,
- weight_tensor,
- dout_tensor,
- dres_out_tensor,
- rstd_tensor,
- dx_tensor,
- dw_partial_tensor,
- dres_tensor,
- db_partial_tensor,
- sm_count,
- current_stream,
+ x, weight, dout, dresidual_out, rstd, dx, dw_partial, dresidual, db_partial, sm_count
  )
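Note on the new compile path above: the cache key holds only the feature size N, the element dtypes, and whether a bias gradient is requested, while the batch and partial-reduction extents stay symbolic (cute.sym_int()), so one compiled kernel is reused across batch sizes. A minimal sketch of that caching pattern, with hypothetical names (get_or_compile, compile_fn) that are not part of quack:

    _cache = {}

    def get_or_compile(N, dtypes, has_bias, compile_fn):
        # The batch size is deliberately absent from the key: the kernel is
        # compiled against a symbolic batch dimension and reused across sizes.
        key = (N, *dtypes, has_bias)
        if key not in _cache:
            _cache[key] = compile_fn(N, dtypes, has_bias)
        return _cache[key]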
 
 
@@ -1100,30 +911,37 @@ _rmsnorm_bwd.compile_cache = {}
 
  def rmsnorm_bwd(
  x: Tensor,
- weight: Tensor,
+ weight: Optional[Tensor],
  dout: Tensor,
  rstd: Tensor,
  dresidual_out: Optional[Tensor] = None, # grad wrt residual_out
  has_bias: bool = False,
- ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ has_residual: bool = False,
+ ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
  device = x.device
  N = x.size(1)
- sm_count = _get_sm_count(N, device)
  dx = torch.empty_like(x)
-
  if dresidual_out is not None and dresidual_out.dtype != dx.dtype:
  dresidual = torch.empty_like(x, dtype=dresidual_out.dtype)
  else:
  dresidual = None
- # Always store partial gradients in fp32 for numerical accuracy
- dw_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ sm_count = _get_sm_count(N, device)
+ if weight is not None:
+ # Always store partial gradients in fp32 for numerical accuracy
+ dw_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32)
+ else:
+ dw_partial = None
  db_partial = torch.empty(sm_count, N, device=device, dtype=torch.float32) if has_bias else None
- _rmsnorm_bwd(x, weight, dout, rstd, dx, dw_partial, db_partial, dresidual_out, dresidual)
+
+ _rmsnorm_bwd(
+ x, weight, dout, rstd, dx, dw_partial, db_partial, dresidual_out, dresidual, sm_count
+ )
+
  # we have summed the partial gradients in fp32, now we convert back to the weight dtype
- dw = dw_partial.sum(dim=0).to(weight.dtype)
+ dw = dw_partial.sum(dim=0).to(weight.dtype) if weight is not None else None
  db = db_partial.sum(dim=0).to(weight.dtype) if has_bias else None
  # dresidual is the same as dx in this case
- if dresidual_out is not None and dresidual_out.dtype == dx.dtype:
+ if has_residual and dresidual is None:
  dresidual = dx
  return dx, dw, db, dresidual
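The weight/bias gradients above are produced by a split reduction: the kernel writes one float32 partial row per SM into dw_partial / db_partial of shape (sm_count, N), and the host finishes with a single sum over the SM axis before casting to the parameter dtype. A rough sketch of that final step under assumed sizes (illustrative only):

    import torch

    sm_count, N = 132, 4096  # assumed values for illustration
    dw_partial = torch.zeros(sm_count, N, device="cuda", dtype=torch.float32)
    # ... kernel accumulates per-SM partial weight gradients into dw_partial ...
    dw = dw_partial.sum(dim=0).to(torch.bfloat16)  # accumulate in fp32, cast once at the end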
 
@@ -1180,11 +998,16 @@ class RMSNormFunction(torch.autograd.Function):
  x_shape_og = ctx.x_shape_og
  # Reshape dout to match the flattened shape used in forward
  dout = dout.view(-1, dout.shape[-1])
-
- dx, dw, db, dresidual = rmsnorm_bwd(x, weight, dout, rstd, dresidual_out, has_bias)
+ dx, dw, db, dresidual = rmsnorm_bwd(
+ x,
+ weight,
+ dout,
+ rstd,
+ dresidual_out,
+ has_bias,
+ has_residual=ctx.residual_dtype is not None,
+ )
  dx = dx.view(x_shape_og)
- if dresidual_out is not None:
- dresidual_out = dresidual_out.reshape(x_shape_og)
  if dresidual is not None:
  dresidual = dresidual.reshape(x_shape_og)
 
@@ -1193,7 +1016,7 @@ class RMSNormFunction(torch.autograd.Function):
 
  def rmsnorm(
  x: Tensor,
- weight: Tensor,
+ weight: Optional[Tensor] = None,
  bias: Optional[Tensor] = None,
  residual: Optional[Tensor] = None,
  out_dtype: Optional[torch.dtype] = None,
@@ -1205,7 +1028,7 @@ def rmsnorm(
 
  Args:
  x: Input tensor of shape (M, N)
- weight: Weight tensor of shape (N,)
+ weight: Optional weight tensor of shape (N,)
  eps: Small value for numerical stability
 
  Returns:
@@ -1214,7 +1037,7 @@ def rmsnorm(
  return RMSNormFunction.apply(x, weight, bias, residual, out_dtype, residual_dtype, eps, prenorm)
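With weight now optional, the functional entry point covers both the weighted and unweighted cases. A hedged usage sketch (the import path quack.rmsnorm is assumed from this file's location; shapes and dtypes are arbitrary):

    import torch
    from quack.rmsnorm import rmsnorm  # assumed import path

    x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
    w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
    y = rmsnorm(x, w, eps=1e-6)     # weighted RMSNorm
    y_plain = rmsnorm(x, eps=1e-6)  # weight=None is now accepted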
 
 
- class QuackRMSNorm(torch.nn.Module):
+ class QuackRMSNorm(torch.nn.RMSNorm):
  """RMSNorm module that behaves like torch.nn.RMSNorm.
 
  This class provides a drop-in replacement for torch.nn.RMSNorm that uses
@@ -1229,10 +1052,10 @@ class QuackRMSNorm(torch.nn.Module):
  eps (float): A small constant for numerical stability
  """
 
- def __init__(self, dim: int, eps: float = 1e-6):
- super().__init__()
- self.weight = torch.nn.Parameter(torch.ones(dim))
- self.eps = eps
+ def __init__(
+ self, dim: int, eps: float = 1e-6, elementwise_affine: bool = True, device=None, dtype=None
+ ):
+ super().__init__(dim, eps, elementwise_affine, device=device, dtype=dtype)
 
  def forward(self, x: Tensor) -> Tensor:
  """Apply RMSNorm to the input tensor.
@@ -1245,6 +1068,67 @@ class QuackRMSNorm(torch.nn.Module):
  """
  return rmsnorm(x, self.weight, eps=self.eps)
 
- def reset_parameters(self):
- """Reset the weight parameter to ones."""
- torch.nn.init.ones_(self.weight)
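Since QuackRMSNorm now subclasses torch.nn.RMSNorm, it inherits that module's parameter handling (including reset_parameters, which is why the custom one is removed) and can be constructed exactly like the stock module. A brief sketch (illustrative; import path and sizes are assumptions):

    import torch
    from quack.rmsnorm import QuackRMSNorm  # assumed import path

    norm = QuackRMSNorm(4096, eps=1e-6).to("cuda", torch.bfloat16)
    x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
    y = norm(x)  # forward dispatches to the fused rmsnorm kernel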
+
+ def layernorm_fwd(
+ x: Tensor,
+ weight: Tensor,
+ bias: Optional[Tensor] = None,
+ eps: float = 1e-6,
+ return_rstd: bool = False,
+ return_mean: bool = False,
+ ):
+ """LayerNorm forward pass using the unified RMSNorm/LayerNorm kernel.
+
+ Args:
+ x: Input tensor of shape (M, N)
+ weight: Weight tensor of shape (N,). Must be float32.
+ bias: Optional bias tensor of shape (N,). Must be float32.
+ eps: Small value for numerical stability
+ return_rstd: Whether to return the reciprocal standard deviation
+ return_mean: Whether to return the mean
+
+ Returns:
+ Normalized output tensor of same shape as x
+ If return_rstd is True, also returns rstd tensor of shape (M,)
+ If return_mean is True, also returns mean tensor of shape (M,)
+ """
+ assert x.dim() == 2, "Input must be 2D"
+ assert weight.dim() == 1, "Weight must be 1D"
+ assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
+ assert weight.dtype == torch.float32, "Weight must be float32"
+ if bias is not None:
+ assert bias.dim() == 1, "Bias must be 1D"
+ assert bias.dtype == torch.float32, "Bias must be float32"
+
+ M, N = x.shape
+ device = x.device
+ out = torch.empty_like(x)
+ rstd = torch.empty(M, device=device, dtype=torch.float32) if return_rstd else None
+ mean = torch.empty(M, device=device, dtype=torch.float32) if return_mean else None
+
+ _rmsnorm_fwd(x, weight, out, bias, rstd, mean, None, None, eps, True)
+
+ if return_rstd and return_mean:
+ return out, rstd, mean
+ elif return_rstd:
+ return out, rstd
+ elif return_mean:
+ return out, mean
+ return out
+
+
+ def layernorm_ref(x: Tensor, w: Tensor, eps: float = 1e-6) -> Tensor:
+ """Reference implementation for LayerNorm."""
+ x_f32 = x.float()
+ return torch.nn.functional.layer_norm(x_f32, w.shape, w, None, eps).to(x.dtype)
+
+
+ def layernorm_rstd_ref(x: torch.Tensor, eps: float = 1e-6):
+ x_f32 = x.float()
+ mean = x_f32.mean(dim=-1, keepdim=True)
+ var = ((x_f32 - mean) ** 2).mean(dim=-1)
+ return 1.0 / torch.sqrt(var + eps)
+
+
+ def layernorm_mean_ref(x: torch.Tensor) -> torch.Tensor:
+ return x.float().mean(dim=-1)
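The reference helpers above make it straightforward to sanity-check the fused LayerNorm path. A small hedged test sketch (import path, sizes, and tolerances are assumptions; note layernorm_fwd asserts a float32 weight):

    import torch
    from quack.rmsnorm import (  # assumed import path
        layernorm_fwd, layernorm_ref, layernorm_rstd_ref, layernorm_mean_ref,
    )

    x = torch.randn(256, 4096, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(4096, device="cuda", dtype=torch.float32)
    out, rstd, mean = layernorm_fwd(x, w, eps=1e-6, return_rstd=True, return_mean=True)
    torch.testing.assert_close(out.float(), layernorm_ref(x, w).float(), atol=1e-2, rtol=1e-2)
    torch.testing.assert_close(rstd, layernorm_rstd_ref(x), atol=1e-3, rtol=1e-3)
    torch.testing.assert_close(mean, layernorm_mean_ref(x), atol=1e-3, rtol=1e-3)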