quack-kernels 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
quack/rmsnorm.py CHANGED
@@ -19,6 +19,7 @@ from quack.reduce import row_reduce
  from quack.reduction_base import ReductionBase
  from quack.cute_dsl_utils import torch2cute_dtype_map

+
  class RMSNorm(ReductionBase):
  def __init__(self, dtype: cutlass.Numeric, N: int):
  super().__init__(dtype, N, stage=1)
@@ -93,7 +94,7 @@ class RMSNorm(ReductionBase):
  def __call__(
  self,
  mX: cute.Tensor,
- mW: cute.Tensor,
+ mW: Optional[cute.Tensor],
  mB: Optional[cute.Tensor],
  mRes: Optional[cute.Tensor],
  mO: cute.Tensor,
@@ -129,10 +130,15 @@ class RMSNorm(ReductionBase):
  )
  num_threads = cute.size(tv_layout, mode=[0])
  num_warps = num_threads // cute.arch.WARP_SIZE
- mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
- mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+ if const_expr(mW is not None):
+ mW_expanded_layout = cute.prepend(
+ mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+ )
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
  if const_expr(mB is not None):
- mB_expanded_layout = cute.prepend(mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mB_expanded_layout = cute.prepend(
+ mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+ )
  mB = cute.make_tensor(mB.iterator, mB_expanded_layout)
  if const_expr(mRstd is not None):
  mRstd_expanded_layout = cute.append(
@@ -155,7 +161,7 @@ class RMSNorm(ReductionBase):
  def kernel(
  self,
  mX: cute.Tensor,
- mW: cute.Tensor,
+ mW: Optional[cute.Tensor],
  mB: Optional[cute.Tensor],
  mRes: Optional[cute.Tensor],
  mO: cute.Tensor,
@@ -201,12 +207,10 @@ class RMSNorm(ReductionBase):
  for mT in (mX, mRes, mO, mResO)
  ]
  cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
- gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
- gB = (
- cute.local_tile(mB, tiler_mn, (0, cluster_y))
- if const_expr(mB is not None)
- else None
- )
+ gW, gB = [
+ cute.local_tile(mT, tiler_mn, (0, cluster_y)) if const_expr(mT is not None) else None
+ for mT in (mW, mB)
+ ]
  gRstd = (
  cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
  if const_expr(mRstd is not None)
@@ -215,47 +219,14 @@ class RMSNorm(ReductionBase):

  # declare the atoms which will be used later for memory copy
  num_copy_elems_X = tv_layout.shape[1][0]
- num_copy_bits_X = mX.element_type.width * num_copy_elems_X
- copy_atom_load_X = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
+ copy_atom_load_X_async = utils.get_copy_atom(
+ mX.element_type, num_copy_elems_X, is_async=True
  )
- copy_atom_load_X_async = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
- )
- num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
- copy_atom_load_W = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
- )
- num_bits_per_copy_B = cutlass.const_expr(
- min(128, num_copy_elems_X * mB.element_type.width)
- ) if const_expr(mB is not None) else 0
- copy_atom_load_B = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mB.element_type, num_bits_per_copy=num_bits_per_copy_B
- ) if const_expr(mB is not None) else None
- if const_expr(mRes is not None):
- num_copy_bits_Res = const_expr(min(128, num_copy_elems_X * mRes.element_type.width))
- copy_atom_load_Res_async = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(),
- mRes.element_type,
- num_bits_per_copy=num_copy_bits_Res,
- )
- num_copy_bits_O = const_expr(min(128, num_copy_elems_X * mO.element_type.width))
- copy_atom_store_O = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=num_copy_bits_O
- )
- if const_expr(mResO is not None):
- num_copy_bits_ResO = const_expr(min(128, num_copy_elems_X * mResO.element_type.width))
- copy_atom_store_ResO = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(),
- mResO.element_type,
- num_bits_per_copy=num_copy_bits_ResO,
- )
-
  thr_copy_X = cute.make_tiled_copy(copy_atom_load_X_async, tv_layout, tiler_mn).get_slice(
  tidx
  )

- tXgW = thr_copy_X.partition_S(gW)
+ tXgW = thr_copy_X.partition_S(gW) if const_expr(mW is not None) else None
  tXgB = thr_copy_X.partition_S(gB) if const_expr(mB is not None) else None
  tXgX = thr_copy_X.partition_S(gX)
  tXsX = thr_copy_X.partition_D(sX)
@@ -269,8 +240,9 @@ class RMSNorm(ReductionBase):
  tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]

  # allocate fragments for gmem->rmem
- tXrW = cute.make_fragment_like(tXgW)
- tXrW.fill(0.0)
+ tXrW = cute.make_fragment_like(tXgW) if const_expr(mW is not None) else None
+ if const_expr(mW is not None):
+ tXrW.fill(0.0)
  tXrB = cute.make_fragment_like(tXgB) if const_expr(mB is not None) else None
  tXrX, tXrO = [cute.make_fragment_like(t) for t in (tXgX, tXgO)]
  if const_expr(mRes is not None):
@@ -283,17 +255,21 @@ class RMSNorm(ReductionBase):
  tXpX = (
  utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
  )
+ # Each copy will use the same number of elements as X and same predicate
+ copy = partial(utils.copy, pred=tXpX, num_copy_elems=num_copy_elems_X)
+
  row = tXcX[0][0]
  if row < shape[0]:
- cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
+ copy(tXgX, tXsX, is_async=True)
  if const_expr(mRes is not None):
- cute.copy(copy_atom_load_Res_async, tXgRes, tXsRes, pred=tXpX)
+ copy(tXgRes, tXsRes, is_async=True)
  cute.arch.cp_async_commit_group()

  if const_expr(not delay_w_load):
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpX)
+ if const_expr(mW is not None):
+ copy(tXgW, tXrW)
  if const_expr(mB is not None):
- cute.copy(copy_atom_load_B, tXgB, tXrB, pred=tXpX)
+ copy(tXgB, tXrB)

  cute.arch.cp_async_wait_group(0)
  cute.autovec_copy(tXsX, tXrX)
@@ -305,7 +281,7 @@ class RMSNorm(ReductionBase):
  tXrResO = cute.make_fragment_like(tXgResO)
  tXrResO.store(x.to(tXrResO.element_type))
  if row < shape[0]:
- cute.copy(copy_atom_store_ResO, tXrResO, tXgResO, pred=tXpX)
+ copy(tXrResO, tXgResO)

  threads_per_row = tv_layout.shape[0][0]
  sum_sq_x = row_reduce(
@@ -317,7 +293,7 @@ class RMSNorm(ReductionBase):
  init_val=0.0,
  hook_fn=(cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None),
  )
- rstd = utils.rsqrt(sum_sq_x / shape[1] + eps)
+ rstd = cute.math.rsqrt(sum_sq_x / shape[1] + eps, fastmath=True)
  if const_expr(mRstd is not None):
  # Only the thread corresponding to column 0 writes out the rstd to gmem
  if (
@@ -327,27 +303,28 @@ class RMSNorm(ReductionBase):
  ):
  tXrRstd[0] = rstd
  if const_expr(delay_w_load):
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpX)
+ if const_expr(mW is not None):
+ copy(tXgW, tXrW)
  if const_expr(mB is not None):
- cute.copy(copy_atom_load_B, tXgB, tXrB, pred=tXpX)
+ copy(tXgB, tXrB)
  if const_expr(reload_from == "smem" or reload_from == "gmem"):
  if const_expr(reload_from == "smem"):
  cute.autovec_copy(tXsX, tXrX)
  else:
- cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ copy(tXgX, tXrX)
  x = tXrX.load().to(cute.Float32)
  if const_expr(mRes is not None):
  cute.autovec_copy(tXsRes, tXrRes)
  x += tXrRes.load().to(cute.Float32)
  x_hat = x * rstd
- w = tXrW.load().to(cute.Float32)
- y = x_hat * w
+ y = x_hat
+ if const_expr(mW is not None):
+ y *= tXrW.load().to(cute.Float32)
  if const_expr(mB is not None):
- b = tXrB.load().to(cute.Float32)
- y = y + b
+ y += tXrB.load().to(cute.Float32)
  tXrO.store(y.to(tXrO.element_type))
  if row < shape[0]:
- cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tXpX)
+ copy(tXrO, tXgO)


  @torch.library.custom_op(
@@ -355,11 +332,11 @@ class RMSNorm(ReductionBase):
  mutates_args=("out", "rstd", "residual_out"),
  device_types="cuda",
  # We need to specify the schema manually since we're mutating an optional tensor
- schema="(Tensor x, Tensor weight, Tensor(a!) out, Tensor? bias, Tensor(a!)? rstd, Tensor? residual, Tensor(a!)? residual_out, float eps=1e-6) -> ()",
+ schema="(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor? residual, Tensor(a6!)? residual_out, float eps=1e-6) -> ()",
  )
  def _rmsnorm_fwd(
  x: Tensor,
- weight: Tensor,
+ weight: Optional[Tensor],
  out: Tensor,
  bias: Optional[Tensor] = None,
  rstd: Optional[Tensor] = None,
@@ -370,21 +347,23 @@ def _rmsnorm_fwd(
  """RMSNorm forward pass.
  Args:
  x: Input tensor of shape (M, N)
- weight: Weight tensor of shape (N,)
+ weight: Optional weight tensor of shape (N,)
  eps: Small value for numerical stability
  Returns:
  Normalized output tensor of same shape as x
  """
  assert x.dim() == 2, "Input must be 2D"
- assert weight.dim() == 1, "Weight must be 1D"
- assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
- assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
+ assert x.is_cuda, "Input tensor must be on CUDA device"
  assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
- assert weight.dtype in [
- torch.float32,
- torch.bfloat16,
- torch.float16,
- ], "Weight must be float32, float16 or bfloat16"
+ if weight is not None:
+ assert weight.dim() == 1, "Weight must be 1D"
+ assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
+ assert weight.is_cuda, "Weight tensor must be on CUDA device"
+ assert weight.dtype in [
+ torch.float32,
+ torch.bfloat16,
+ torch.float16,
+ ], "Weight must be float32, float16 or bfloat16"
  if residual is not None:
  assert residual.shape == x.shape
  assert residual.is_cuda
@@ -397,11 +376,6 @@ def _rmsnorm_fwd(
  _, N = x.shape
  device = x.device
  dtype = torch2cute_dtype_map[x.dtype]
- # convert_from_dlpack = lambda x: (
- # from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
- # mode=0, divisibility=128 // dtype.width
- # )
- # )
  convert_from_dlpack = lambda x: (
  from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
  )
@@ -409,10 +383,13 @@ def _rmsnorm_fwd(
  convert_from_dlpack(t) if t is not None else None for t in (x, residual, out, residual_out)
  ]
  # handle weight divisibility based on weight dtype
- weight_dtype = torch2cute_dtype_map[weight.dtype]
- weight_tensor = utils.convert_from_dlpack(
- weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
- )
+ if weight is not None:
+ weight_dtype = torch2cute_dtype_map[weight.dtype]
+ weight_tensor = utils.convert_from_dlpack(
+ weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
+ )
+ else:
+ weight_tensor = None
  if bias is not None:
  bias_dtype = torch2cute_dtype_map[bias.dtype]
  bias_tensor = utils.convert_from_dlpack(
@@ -430,7 +407,7 @@ def _rmsnorm_fwd(
  N,
  dtype,
  res_tensor.element_type if residual is not None else None,
- weight_tensor.element_type,
+ weight_tensor.element_type if weight is not None else None,
  bias_tensor.element_type if bias is not None else None,
  res_out_tensor.element_type if residual_out is not None else None,
  rstd is not None,
@@ -467,7 +444,7 @@ _rmsnorm_fwd.compile_cache = {}

  def rmsnorm_fwd(
  x: Tensor,
- weight: Tensor,
+ weight: Optional[Tensor] = None,
  bias: Optional[Tensor] = None,
  residual: Optional[Tensor] = None,
  out_dtype: Optional[torch.dtype] = None,
@@ -496,12 +473,13 @@ def rmsnorm_fwd(
  return out, residual_out, rstd


- def rmsnorm_ref(x, w, bias=None, residual=None, eps=1e-6):
+ def rmsnorm_ref(x, w=None, bias=None, residual=None, eps=1e-6):
  x_f32 = x.float()
  if residual is not None:
  residual_f32 = residual.float()
  x_f32 += residual_f32
- out = x_f32 / (torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps)) * w
+ x_norm = x_f32 / (torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps))
+ out = x_norm * w if w is not None else x_norm
  if bias is not None:
  out = out + bias.float()
  if residual is None:
@@ -509,6 +487,7 @@ def rmsnorm_ref(x, w, bias=None, residual=None, eps=1e-6):
  else:
  return out.to(x.dtype), x_f32.to(residual.dtype)

+
  def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
  """Reference implementation for RMSNorm backward pass."""
  x_f32 = x.float()
@@ -521,6 +500,7 @@ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
  dw = (dout * x_hat).sum(dim=0)
  return dx.to(x.dtype), dw.to(w.dtype)

+
  class RMSNormBackward(ReductionBase):
  def __init__(self, dtype: cutlass.Numeric, N: int):
  # 2 stages for double buffering when computing mean of x_hat * wdy
@@ -606,8 +586,11 @@ class RMSNormBackward(ReductionBase):
  )
  num_threads = cute.size(tv_layout, mode=[0])
  num_warps = num_threads // cute.arch.WARP_SIZE
- mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
- mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+ if const_expr(mW is not None):
+ mW_expanded_layout = cute.prepend(
+ mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+ )
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)

  num_blocks = sm_count
  self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn).launch(
@@ -660,50 +643,10 @@ class RMSNormBackward(ReductionBase):
  mbar_full_ptr, mbar_empty_ptr = None, None

  num_copy_elems_X = tv_layout.shape[1][0]
- num_copy_bits_X = mX.element_type.width * num_copy_elems_X
- copy_atom_load_X = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
- )
- copy_atom_load_X_async = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
- )
- num_copy_bits_dO = const_expr(min(128, num_copy_elems_X * mdO.element_type.width))
- copy_atom_load_dO_async = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(), mdO.element_type, num_bits_per_copy=num_copy_bits_dO
- )
- num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
- copy_atom_load_W = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
- )
- if const_expr(mdResO is not None):
- num_copy_bits_dResO = const_expr(min(128, num_copy_elems_X * mdResO.element_type.width))
- copy_atom_load_dResO = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(),
- mdResO.element_type,
- num_bits_per_copy=num_copy_bits_dResO,
- )
- num_copy_bits_dX = const_expr(min(128, num_copy_elems_X * mdX.element_type.width))
- copy_atom_store_dX = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mdX.element_type, num_bits_per_copy=num_copy_bits_dX
- )
- num_copy_bits_dW = const_expr(min(128, num_copy_elems_X * mdW.element_type.width))
- copy_atom_store_dW = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mdW.element_type, num_bits_per_copy=num_copy_bits_dW
- )
- if const_expr(mdB is not None):
- num_copy_bits_dB = const_expr(min(128, num_copy_elems_X * mdB.element_type.width))
- copy_atom_store_dB = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mdB.element_type, num_bits_per_copy=num_copy_bits_dB
- )
- if const_expr(mdRes is not None):
- num_copy_bits_dRes = const_expr(min(128, num_copy_elems_X * mdRes.element_type.width))
- copy_atom_load_dRes = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(),
- mdRes.element_type,
- num_bits_per_copy=num_copy_bits_dRes,
- )
-
+ copy_atom_load_X = utils.get_copy_atom(mX.element_type, num_copy_elems_X, is_async=False)
  thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ # Each copy will use the same number of elements as X
+ copy = partial(utils.copy, num_copy_elems=num_copy_elems_X)

  gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
  tXgW = thr_copy_X.partition_S(gW)
@@ -718,7 +661,7 @@ class RMSNormBackward(ReductionBase):
  if not is_even_N
  else None
  )
- cute.copy(copy_atom_load_W, tXgW, tXrW, pred=tXpW)
+ copy(tXgW, tXrW, pred=tXpW)
  weight = tXrW.load().to(cute.Float32)

  num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
@@ -744,7 +687,11 @@ class RMSNormBackward(ReductionBase):
  # Always compute partial weight gradients in fp32
  tXrdW = cute.make_fragment_like(tXgdW, Float32)

- gdB = cute.local_tile(mdB, (1, tiler_mn[1]), (bidx_start, cluster_y)) if const_expr(mdB is not None) else None
+ gdB = (
+ cute.local_tile(mdB, (1, tiler_mn[1]), (bidx_start, cluster_y))
+ if const_expr(mdB is not None)
+ else None
+ )
  tXgdB = thr_copy_X.partition_S(gdB) if const_expr(mdB is not None) else None
  tXrdB = cute.make_fragment_like(tXgdB, Float32) if const_expr(mdB is not None) else None

@@ -772,21 +719,20 @@ class RMSNormBackward(ReductionBase):
  tXrX, tXrdO, tXrdX = [
  cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdO, tXgdX)
  ]
+ tXrdResO = None
  if const_expr(mdResO is not None):
  tXrdResO = cute.make_fragment_like(tXgdResO[None, None, None, 0])
+ tXrdRes = None
  if const_expr(mdRes is not None):
  tXrdRes = cute.make_fragment_like(tXgdRes[None, None, None, 0])

- copy_X = partial(cute.copy, copy_atom_load_X_async, pred=tXpX)
- copy_dO = partial(cute.copy, copy_atom_load_dO_async, pred=tXpX)
-
  # Prefetch the first batch
  row = tXcX[None, None, None, bidx_start][0][0]
  if row < M:
  tXgX_cur = utils.coord_offset_i64(bidx_start, tXgX, dim=3)[None, None, None, 0]
  tXgdO_cur = utils.coord_offset_i64(bidx_start, tXgdO, dim=3)[None, None, None, 0]
- copy_X(tXgX_cur, tXsX[None, None, None, 0])
- copy_dO(tXgdO_cur, tXsdO[None, None, None, 0])
+ copy(tXgX_cur, tXsX[None, None, None, 0], pred=tXpX, is_async=True)
+ copy(tXgdO_cur, tXsdO[None, None, None, 0], pred=tXpX, is_async=True)
  elif tiler_mn[0] > 1:
  # Fill with zero, otherwise smem will be uninitialized, and we could read this back
  # later into registers, causing wrong dW.
@@ -809,8 +755,8 @@ class RMSNormBackward(ReductionBase):
  if row + gdim * tiler_mn[0] < M: # Prefetch the next batch
  tXgX_cur = utils.coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
  tXgdO_cur = utils.coord_offset_i64(bidx + gdim, tXgdO, dim=3)[None, None, None, 0]
- copy_X(tXgX_cur, tXsX[None, None, None, stage ^ 1])
- copy_dO(tXgdO_cur, tXsdO[None, None, None, stage ^ 1])
+ copy(tXgX_cur, tXsX[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
+ copy(tXgdO_cur, tXsdO[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
  elif tiler_mn[0] > 1:
  utils.fill_oob(
  tXsX[None, None, None, stage ^ 1],
@@ -829,7 +775,7 @@ class RMSNormBackward(ReductionBase):
  if const_expr(mdResO is not None):
  tXgdResO_cur = utils.coord_offset_i64(bidx, tXgdResO, dim=3)[None, None, None, 0]
  if row < M or tiler_mn[0] == 1:
- cute.copy(copy_atom_load_dResO, tXgdResO_cur, tXrdResO, pred=tXpX)
+ copy(tXgdResO_cur, tXrdResO, pred=tXpX)
  elif tiler_mn[0] > 1:
  tXrdResO.fill(0.0)
  cute.arch.cp_async_wait_group(1)
@@ -877,12 +823,12 @@ class RMSNormBackward(ReductionBase):
  tXrdX.store(dx.to(tXrdX.element_type))
  if row < M or tiler_mn[0] == 1:
  tXgdX_cur = utils.coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
- cute.copy(copy_atom_store_dX, tXrdX, tXgdX_cur, pred=tXpX)
+ copy(tXrdX, tXgdX_cur, pred=tXpX)
  if const_expr(mdRes is not None):
  tXrdRes.store(dx.to(tXrdRes.element_type))
  tXgdRes_cur = utils.coord_offset_i64(bidx, tXgdRes, dim=3)[None, None, None, 0]
  if row < M or tiler_mn[0] == 1:
- cute.copy(copy_atom_load_dRes, tXrdRes, tXgdRes_cur, pred=tXpX)
+ copy(tXrdRes, tXgdRes_cur, pred=tXpX)
  # Accumulate weight gradients in fp32
  tXrdW.store(tXrdW.load() + dout * x_hat)
  if const_expr(mdB is not None):
@@ -914,7 +860,7 @@ class RMSNormBackward(ReductionBase):
  tXsdW_other = cute.make_tensor(tXsdW.iterator + i * sdW.stride[0], tXsdW.layout)
  cute.autovec_copy(tXsdW_other, tXrdW_other)
  tXrdW.store(tXrdW.load() + tXrdW_other.load())
- cute.copy(copy_atom_store_dW, tXrdW, tXgdW, pred=tXpdW)
+ copy(tXrdW, tXgdW, pred=tXpdW)
  cute.arch.barrier()
  if const_expr(mdB is not None):
  sdB = cute.make_tensor(
@@ -930,15 +876,17 @@ class RMSNormBackward(ReductionBase):
  if row == 0:
  for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
  tXrdB_other = cute.make_fragment_like(tXrdB)
- tXsdB_other = cute.make_tensor(tXsdB.iterator + i * sdB.stride[0], tXsdB.layout)
+ tXsdB_other = cute.make_tensor(
+ tXsdB.iterator + i * sdB.stride[0], tXsdB.layout
+ )
  cute.autovec_copy(tXsdB_other, tXrdB_other)
  tXrdB.store(tXrdB.load() + tXrdB_other.load())
- cute.copy(copy_atom_store_dB, tXrdB, tXgdB, pred=tXpdB)
+ copy(tXrdB, tXgdB, pred=tXpdB)
  else:
  # dw is already in fp32, so we can directly copy to global memory
- cute.copy(copy_atom_store_dW, tXrdW, tXgdW, pred=tXpdW)
+ copy(tXrdW, tXgdW, pred=tXpdW)
  if const_expr(mdB is not None):
- cute.copy(copy_atom_store_dB, tXrdB, tXgdB, pred=tXpdB)
+ copy(tXrdB, tXgdB, pred=tXpdB)


  def _get_sm_count(N: int, device: torch.device) -> int:
@@ -963,7 +911,7 @@ def _get_sm_count(N: int, device: torch.device) -> int:
  mutates_args={"dx", "dw_partial", "db_partial", "dresidual"},
  device_types="cuda",
  # We need to specify the schema manually since we're mutating an optional tensor
- schema="(Tensor x, Tensor weight, Tensor dout, Tensor rstd, Tensor(a!) dx, Tensor(a!) dw_partial, Tensor(a!)? db_partial, Tensor? dresidual_out, Tensor(a!)? dresidual) -> ()",
+ schema="(Tensor x, Tensor weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!) dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual) -> ()",
  )
  def _rmsnorm_bwd(
  x: Tensor,
@@ -1031,14 +979,23 @@ def _rmsnorm_bwd(
  )

  dw_partial_tensor = from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
- db_partial_tensor = from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0) if db_partial is not None else None
+ db_partial_tensor = (
+ from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+ if db_partial is not None
+ else None
+ )
  rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)

  current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

- compile_key = (N, x_tensor.element_type, weight_tensor.element_type, db_partial.dtype if db_partial is not None else None,
+ compile_key = (
+ N,
+ x_tensor.element_type,
+ weight_tensor.element_type,
+ db_partial.dtype if db_partial is not None else None,
  dresidual.dtype if dresidual is not None else None,
- dresidual_out.dtype if dresidual_out is not None else None)
+ dresidual_out.dtype if dresidual_out is not None else None,
+ )
  if compile_key not in _rmsnorm_bwd.compile_cache:
  rmsnorm_backward_op = RMSNormBackward(x_tensor.element_type, N)
  _rmsnorm_bwd.compile_cache[compile_key] = cute.compile(
@@ -1106,7 +1063,17 @@ def rmsnorm_bwd(

  class RMSNormFunction(torch.autograd.Function):
  @staticmethod
- def forward(ctx, x, weight, bias=None, residual=None, out_dtype=None, residual_dtype=None, eps=1e-6, prenorm=False):
+ def forward(
+ ctx,
+ x,
+ weight,
+ bias=None,
+ residual=None,
+ out_dtype=None,
+ residual_dtype=None,
+ eps=1e-6,
+ prenorm=False,
+ ):
  x_shape_og = x.shape
  # Flatten input
  x = x.reshape(-1, x.shape[-1])
@@ -1129,7 +1096,7 @@ class RMSNormFunction(torch.autograd.Function):
  ctx.x_shape_og = x_shape_og
  ctx.residual_dtype = residual.dtype if residual is not None else None
  ctx.prenorm = prenorm
- if residual_out is None or prenorm == False:
+ if residual_out is None or not prenorm:
  return out.reshape(x_shape_og)
  else:
  return out.reshape(x_shape_og), residual_out.reshape(x_shape_og)
@@ -1137,6 +1104,7 @@ class RMSNormFunction(torch.autograd.Function):
  @staticmethod
  def backward(ctx, dout, *args):
  x, weight, rstd = ctx.saved_tensors
+ assert weight is not None, "RMSNorm backward doesn't support weight=None yet"
  has_bias = ctx.has_bias
  if ctx.prenorm and ctx.residual_dtype is not None:
  dresidual_out = args[0]
@@ -1159,7 +1127,7 @@ class RMSNormFunction(torch.autograd.Function):

  def rmsnorm(
  x: Tensor,
- weight: Tensor,
+ weight: Optional[Tensor] = None,
  bias: Optional[Tensor] = None,
  residual: Optional[Tensor] = None,
  out_dtype: Optional[torch.dtype] = None,
@@ -1171,7 +1139,7 @@ def rmsnorm(

  Args:
  x: Input tensor of shape (M, N)
- weight: Weight tensor of shape (N,)
+ weight: Optional weight tensor of shape (N,)
  eps: Small value for numerical stability

  Returns:
@@ -1213,4 +1181,4 @@ class QuackRMSNorm(torch.nn.Module):

  def reset_parameters(self):
  """Reset the weight parameter to ones."""
- torch.nn.init.ones_(self.weight)
+ torch.nn.init.ones_(self.weight)
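Taken together, the rmsnorm.py changes make `weight` optional across `rmsnorm`, `rmsnorm_fwd`, and the underlying forward kernels, while the autograd backward still asserts that a weight is present. A minimal forward-only usage sketch based on the signatures shown above (the import path and tensor shapes here are assumptions for illustration, not part of the diff):

import torch
from quack.rmsnorm import rmsnorm  # assumed import path

x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)

y_weighted = rmsnorm(x, weight=w)  # weighted RMSNorm, as in 0.2.0
y_plain = rmsnorm(x)               # weight=None now runs the pure normalization path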
quack/softmax.py CHANGED
@@ -159,7 +159,7 @@ class Softmax(ReductionBase):
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
  )
  log2_e = math.log2(math.e)
- exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
+ exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
  denom = row_reduce(
  exp_x,
  cute.ReductionOp.ADD,
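The softmax.py change only reorders the shifted exponent: (x - max_x) * log2(e) and x * log2(e) - max_x * log2(e) are the same quantity algebraically, so exp2 of either reproduces exp(x - max_x). A small host-side sketch of that identity in plain Python (not the CuTe DSL, and only a check of the math, not of the kernel):

import math

log2_e = math.log2(math.e)
x, max_x = 3.7, 5.2
shifted = math.exp(x - max_x)
assert math.isclose(2 ** ((x - max_x) * log2_e), shifted)
assert math.isclose(2 ** (x * log2_e - max_x * log2_e), shifted)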