quack-kernels 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +1 -1
- quack/autotuner.py +64 -5
- quack/cute_dsl_utils.py +6 -7
- quack/dense_gemm_sm90.py +582 -287
- quack/gemm_act_sm90.py +70 -29
- quack/gemm_dact_sm90.py +43 -10
- quack/gemm_interface.py +453 -130
- quack/{dense_gemm_sm100.py → gemm_sm100.py} +443 -419
- quack/gemm_wrapper_utils.py +179 -22
- quack/rmsnorm.py +83 -149
- quack/tile_scheduler.py +34 -47
- quack/utils.py +61 -8
- quack/varlen_utils.py +1 -6
- {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.2.dist-info}/METADATA +2 -2
- {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.2.dist-info}/RECORD +18 -18
- {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.2.dist-info}/WHEEL +0 -0
- {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.2.dist-info}/top_level.txt +0 -0
quack/rmsnorm.py
CHANGED
@@ -94,7 +94,7 @@ class RMSNorm(ReductionBase):
     def __call__(
         self,
         mX: cute.Tensor,
-        mW: cute.Tensor,
+        mW: Optional[cute.Tensor],
         mB: Optional[cute.Tensor],
         mRes: Optional[cute.Tensor],
         mO: cute.Tensor,
@@ -130,8 +130,11 @@ class RMSNorm(ReductionBase):
         )
         num_threads = cute.size(tv_layout, mode=[0])
         num_warps = num_threads // cute.arch.WARP_SIZE
-        mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
-        mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+        if const_expr(mW is not None):
+            mW_expanded_layout = cute.prepend(
+                mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+            )
+            mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
         if const_expr(mB is not None):
             mB_expanded_layout = cute.prepend(
                 mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
@@ -158,7 +161,7 @@ class RMSNorm(ReductionBase):
     def kernel(
         self,
         mX: cute.Tensor,
-        mW: cute.Tensor,
+        mW: Optional[cute.Tensor],
         mB: Optional[cute.Tensor],
         mRes: Optional[cute.Tensor],
         mO: cute.Tensor,
@@ -204,8 +207,10 @@ class RMSNorm(ReductionBase):
             for mT in (mX, mRes, mO, mResO)
         ]
         cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
-        gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
-        gB = cute.local_tile(mB, tiler_mn, (0, cluster_y)) if const_expr(mB is not None) else None
+        gW, gB = [
+            cute.local_tile(mT, tiler_mn, (0, cluster_y)) if const_expr(mT is not None) else None
+            for mT in (mW, mB)
+        ]
         gRstd = (
             cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
             if const_expr(mRstd is not None)
@@ -214,53 +219,14 @@ class RMSNorm(ReductionBase):
 
         # declare the atoms which will be used later for memory copy
         num_copy_elems_X = tv_layout.shape[1][0]
-        num_copy_bits_X = const_expr(min(128, num_copy_elems_X * mX.element_type.width))
-        copy_atom_load_X = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
-        )
-        copy_atom_load_X_async = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
-        )
-        num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
-        copy_atom_load_W = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
-        )
-        num_bits_per_copy_B = (
-            cutlass.const_expr(min(128, num_copy_elems_X * mB.element_type.width))
-            if const_expr(mB is not None)
-            else 0
-        )
-        copy_atom_load_B = (
-            cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(), mB.element_type, num_bits_per_copy=num_bits_per_copy_B
-            )
-            if const_expr(mB is not None)
-            else None
-        )
-        if const_expr(mRes is not None):
-            num_copy_bits_Res = const_expr(min(128, num_copy_elems_X * mRes.element_type.width))
-            copy_atom_load_Res_async = cute.make_copy_atom(
-                cute.nvgpu.cpasync.CopyG2SOp(),
-                mRes.element_type,
-                num_bits_per_copy=num_copy_bits_Res,
-            )
-        num_copy_bits_O = const_expr(min(128, num_copy_elems_X * mO.element_type.width))
-        copy_atom_store_O = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=num_copy_bits_O
+        copy_atom_load_X_async = utils.get_copy_atom(
+            mX.element_type, num_copy_elems_X, is_async=True
         )
-        if const_expr(mResO is not None):
-            num_copy_bits_ResO = const_expr(min(128, num_copy_elems_X * mResO.element_type.width))
-            copy_atom_store_ResO = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mResO.element_type,
-                num_bits_per_copy=num_copy_bits_ResO,
-            )
-
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X_async, tv_layout, tiler_mn).get_slice(
             tidx
         )
 
-        tXgW = thr_copy_X.partition_S(gW)
+        tXgW = thr_copy_X.partition_S(gW) if const_expr(mW is not None) else None
         tXgB = thr_copy_X.partition_S(gB) if const_expr(mB is not None) else None
         tXgX = thr_copy_X.partition_S(gX)
         tXsX = thr_copy_X.partition_D(sX)
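The long block of removed cute.make_copy_atom boilerplate above is what the new utils.get_copy_atom call folds away. A rough sketch of what such a helper might look like, inferred only from the deleted lines in this diff (the real implementation in quack/utils.py may differ):

```python
import cutlass.cute as cute


def get_copy_atom(dtype, num_copy_elems, *, is_async=False):
    # Hypothetical reconstruction based on the removed code above: cap each
    # copy instruction at 128 bits and pick the async (gmem -> smem) or
    # universal copy op accordingly.
    num_bits = min(128, num_copy_elems * dtype.width)
    op = cute.nvgpu.cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    return cute.make_copy_atom(op, dtype, num_bits_per_copy=num_bits)
```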
@@ -274,8 +240,9 @@ class RMSNorm(ReductionBase):
         tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
 
         # allocate fragments for gmem->rmem
-        tXrW = cute.make_fragment_like(tXgW)
-        tXrW.fill(0.0)
+        tXrW = cute.make_fragment_like(tXgW) if const_expr(mW is not None) else None
+        if const_expr(mW is not None):
+            tXrW.fill(0.0)
         tXrB = cute.make_fragment_like(tXgB) if const_expr(mB is not None) else None
         tXrX, tXrO = [cute.make_fragment_like(t) for t in (tXgX, tXgO)]
         if const_expr(mRes is not None):
@@ -288,17 +255,21 @@ class RMSNorm(ReductionBase):
         tXpX = (
             utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
         )
+        # Each copy will use the same number of elements as X and same predicate
+        copy = partial(utils.copy, pred=tXpX, num_copy_elems=num_copy_elems_X)
+
         row = tXcX[0][0]
         if row < shape[0]:
-
+            copy(tXgX, tXsX, is_async=True)
             if const_expr(mRes is not None):
-
+                copy(tXgRes, tXsRes, is_async=True)
             cute.arch.cp_async_commit_group()
 
         if const_expr(not delay_w_load):
-
+            if const_expr(mW is not None):
+                copy(tXgW, tXrW)
             if const_expr(mB is not None):
-
+                copy(tXgB, tXrB)
 
         cute.arch.cp_async_wait_group(0)
         cute.autovec_copy(tXsX, tXrX)
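The partial(...) above pre-binds the predicate and vector width once, so every later call only names the source and destination tensors. A toy, pure-Python illustration of the same pattern (not the CuTe DSL; utils.copy's real signature is only partially visible in this diff):

```python
from functools import partial


def copy(src, dst, *, pred=None, num_copy_elems=1, is_async=False):
    # Stand-in for utils.copy: just report what a real copy would have done.
    mode = "async" if is_async else "sync"
    print(f"{mode} copy {src} -> {dst} (pred={pred}, elems={num_copy_elems})")


# Bind the arguments shared by every copy in the kernel once.
copy_bound = partial(copy, pred="tXpX", num_copy_elems=8)

copy_bound("tXgX", "tXsX", is_async=True)  # gmem -> smem prefetch
copy_bound("tXrO", "tXgO")                 # rmem -> gmem store
```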
@@ -310,7 +281,7 @@ class RMSNorm(ReductionBase):
             tXrResO = cute.make_fragment_like(tXgResO)
             tXrResO.store(x.to(tXrResO.element_type))
             if row < shape[0]:
-
+                copy(tXrResO, tXgResO)
 
         threads_per_row = tv_layout.shape[0][0]
         sum_sq_x = row_reduce(
@@ -332,27 +303,28 @@ class RMSNorm(ReductionBase):
         ):
             tXrRstd[0] = rstd
         if const_expr(delay_w_load):
-
+            if const_expr(mW is not None):
+                copy(tXgW, tXrW)
             if const_expr(mB is not None):
-
+                copy(tXgB, tXrB)
         if const_expr(reload_from == "smem" or reload_from == "gmem"):
             if const_expr(reload_from == "smem"):
                 cute.autovec_copy(tXsX, tXrX)
             else:
-
+                copy(tXgX, tXrX)
         x = tXrX.load().to(cute.Float32)
         if const_expr(mRes is not None):
             cute.autovec_copy(tXsRes, tXrRes)
             x += tXrRes.load().to(cute.Float32)
         x_hat = x * rstd
-        w = tXrW.load().to(cute.Float32)
-        y = x_hat * w
+        y = x_hat
+        if const_expr(mW is not None):
+            y *= tXrW.load().to(cute.Float32)
         if const_expr(mB is not None):
-            b = tXrB.load().to(cute.Float32)
-            y = y + b
+            y += tXrB.load().to(cute.Float32)
         tXrO.store(y.to(tXrO.element_type))
         if row < shape[0]:
-
+            copy(tXrO, tXgO)
 
 
 @torch.library.custom_op(
@@ -360,11 +332,11 @@ class RMSNorm(ReductionBase):
     mutates_args=("out", "rstd", "residual_out"),
     device_types="cuda",
     # We need to specify the schema manually since we're mutating an optional tensor
-    schema="(Tensor x, Tensor weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor? residual, Tensor(a6!)? residual_out, float eps=1e-6) -> ()",
+    schema="(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor? residual, Tensor(a6!)? residual_out, float eps=1e-6) -> ()",
 )
 def _rmsnorm_fwd(
     x: Tensor,
-    weight: Tensor,
+    weight: Optional[Tensor],
     out: Tensor,
     bias: Optional[Tensor] = None,
     rstd: Optional[Tensor] = None,
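Making weight optional also means marking it as Tensor? in the manually supplied op schema. A minimal standalone example of the same registration pattern (hypothetical op name, not part of quack):

```python
from typing import Optional

import torch


@torch.library.custom_op(
    "mylib::scale_or_copy",  # hypothetical namespace::name
    mutates_args=("out",),
    device_types="cuda",
    schema="(Tensor x, Tensor? weight, Tensor(a2!) out) -> ()",
)
def scale_or_copy(x: torch.Tensor, weight: Optional[torch.Tensor], out: torch.Tensor) -> None:
    # `weight` may legitimately be None at call time, matching `Tensor?` in the schema.
    out.copy_(x * weight if weight is not None else x)
```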
@@ -375,21 +347,23 @@ def _rmsnorm_fwd(
     """RMSNorm forward pass.
     Args:
         x: Input tensor of shape (M, N)
-        weight:
+        weight: Optional weight tensor of shape (N,)
         eps: Small value for numerical stability
     Returns:
         Normalized output tensor of same shape as x
     """
     assert x.dim() == 2, "Input must be 2D"
-    assert weight.dim() == 1, "Weight must be 1D"
-    assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
-    assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
+    assert x.is_cuda, "Input tensor must be on CUDA device"
     assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
-    assert weight.dtype in [
-        torch.float32,
-        torch.bfloat16,
-        torch.float16,
-    ], "Weight must be float32, float16 or bfloat16"
+    if weight is not None:
+        assert weight.dim() == 1, "Weight must be 1D"
+        assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
+        assert weight.is_cuda, "Weight tensor must be on CUDA device"
+        assert weight.dtype in [
+            torch.float32,
+            torch.bfloat16,
+            torch.float16,
+        ], "Weight must be float32, float16 or bfloat16"
     if residual is not None:
         assert residual.shape == x.shape
         assert residual.is_cuda
@@ -402,11 +376,6 @@ def _rmsnorm_fwd(
     _, N = x.shape
     device = x.device
     dtype = torch2cute_dtype_map[x.dtype]
-    # convert_from_dlpack = lambda x: (
-    #     from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
-    #         mode=0, divisibility=128 // dtype.width
-    #     )
-    # )
     convert_from_dlpack = lambda x: (
         from_dlpack(x.detach(), assumed_align=16).mark_layout_dynamic(leading_dim=1)
     )
@@ -414,10 +383,13 @@ def _rmsnorm_fwd(
         convert_from_dlpack(t) if t is not None else None for t in (x, residual, out, residual_out)
     ]
     # handle weight divisibility based on weight dtype
-    weight_dtype = torch2cute_dtype_map[weight.dtype]
-    weight_tensor = utils.convert_from_dlpack(
-        weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
-    )
+    if weight is not None:
+        weight_dtype = torch2cute_dtype_map[weight.dtype]
+        weight_tensor = utils.convert_from_dlpack(
+            weight.detach(), leading_dim=0, divisibility=128 // weight_dtype.width
+        )
+    else:
+        weight_tensor = None
     if bias is not None:
         bias_dtype = torch2cute_dtype_map[bias.dtype]
         bias_tensor = utils.convert_from_dlpack(
@@ -435,7 +407,7 @@ def _rmsnorm_fwd(
         N,
         dtype,
         res_tensor.element_type if residual is not None else None,
-        weight_tensor.element_type,
+        weight_tensor.element_type if weight is not None else None,
         bias_tensor.element_type if bias is not None else None,
         res_out_tensor.element_type if residual_out is not None else None,
         rstd is not None,
@@ -472,7 +444,7 @@ _rmsnorm_fwd.compile_cache = {}
 
 def rmsnorm_fwd(
     x: Tensor,
-    weight: Tensor,
+    weight: Optional[Tensor] = None,
     bias: Optional[Tensor] = None,
     residual: Optional[Tensor] = None,
     out_dtype: Optional[torch.dtype] = None,
@@ -501,12 +473,13 @@ def rmsnorm_fwd(
     return out, residual_out, rstd
 
 
-def rmsnorm_ref(x, w, bias=None, residual=None, eps=1e-6):
+def rmsnorm_ref(x, w=None, bias=None, residual=None, eps=1e-6):
     x_f32 = x.float()
     if residual is not None:
         residual_f32 = residual.float()
         x_f32 += residual_f32
-
+    x_norm = x_f32 / (torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps))
+    out = x_norm * w if w is not None else x_norm
     if bias is not None:
         out = out + bias.float()
     if residual is None:
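The reference path now treats the scale as optional: with w=None the output is simply the RMS-normalized input. A self-contained PyTorch sketch of the same semantics (standalone, not importing quack; the residual/prenorm handling is omitted):

```python
import torch


def rmsnorm_ref_sketch(x, w=None, bias=None, eps=1e-6):
    # Mirror of the updated reference above: normalize in fp32, then apply the
    # optional scale and bias.
    x_f32 = x.float()
    x_norm = x_f32 / torch.sqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + eps)
    out = x_norm * w if w is not None else x_norm
    if bias is not None:
        out = out + bias.float()
    return out


x = torch.randn(4, 8)
# Omitting the weight is equivalent to passing an all-ones weight.
assert torch.allclose(rmsnorm_ref_sketch(x), rmsnorm_ref_sketch(x, w=torch.ones(8)))
```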
@@ -613,8 +586,11 @@ class RMSNormBackward(ReductionBase):
         )
         num_threads = cute.size(tv_layout, mode=[0])
         num_warps = num_threads // cute.arch.WARP_SIZE
-        mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
-        mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+        if const_expr(mW is not None):
+            mW_expanded_layout = cute.prepend(
+                mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+            )
+            mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
 
         num_blocks = sm_count
         self.kernel(mX, mW, mdO, mdResO, mRstd, mdX, mdW, mdB, mdRes, tv_layout, tiler_mn).launch(
@@ -667,50 +643,10 @@ class RMSNormBackward(ReductionBase):
         mbar_full_ptr, mbar_empty_ptr = None, None
 
         num_copy_elems_X = tv_layout.shape[1][0]
-        num_copy_bits_X = const_expr(min(128, num_copy_elems_X * mX.element_type.width))
-        copy_atom_load_X = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
-        )
-        copy_atom_load_X_async = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=num_copy_bits_X
-        )
-        num_copy_bits_dO = const_expr(min(128, num_copy_elems_X * mdO.element_type.width))
-        copy_atom_load_dO_async = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), mdO.element_type, num_bits_per_copy=num_copy_bits_dO
-        )
-        num_copy_bits_W = const_expr(min(128, num_copy_elems_X * mW.element_type.width))
-        copy_atom_load_W = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
-        )
-        if const_expr(mdResO is not None):
-            num_copy_bits_dResO = const_expr(min(128, num_copy_elems_X * mdResO.element_type.width))
-            copy_atom_load_dResO = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mdResO.element_type,
-                num_bits_per_copy=num_copy_bits_dResO,
-            )
-        num_copy_bits_dX = const_expr(min(128, num_copy_elems_X * mdX.element_type.width))
-        copy_atom_store_dX = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mdX.element_type, num_bits_per_copy=num_copy_bits_dX
-        )
-        num_copy_bits_dW = const_expr(min(128, num_copy_elems_X * mdW.element_type.width))
-        copy_atom_store_dW = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mdW.element_type, num_bits_per_copy=num_copy_bits_dW
-        )
-        if const_expr(mdB is not None):
-            num_copy_bits_dB = const_expr(min(128, num_copy_elems_X * mdB.element_type.width))
-            copy_atom_store_dB = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(), mdB.element_type, num_bits_per_copy=num_copy_bits_dB
-            )
-        if const_expr(mdRes is not None):
-            num_copy_bits_dRes = const_expr(min(128, num_copy_elems_X * mdRes.element_type.width))
-            copy_atom_load_dRes = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mdRes.element_type,
-                num_bits_per_copy=num_copy_bits_dRes,
-            )
-
+        copy_atom_load_X = utils.get_copy_atom(mX.element_type, num_copy_elems_X, is_async=False)
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+        # Each copy will use the same number of elements as X
+        copy = partial(utils.copy, num_copy_elems=num_copy_elems_X)
 
         gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
         tXgW = thr_copy_X.partition_S(gW)
@@ -725,7 +661,7 @@ class RMSNormBackward(ReductionBase):
             if not is_even_N
             else None
         )
-
+        copy(tXgW, tXrW, pred=tXpW)
         weight = tXrW.load().to(cute.Float32)
 
         num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
@@ -790,16 +726,13 @@ class RMSNormBackward(ReductionBase):
         if const_expr(mdRes is not None):
             tXrdRes = cute.make_fragment_like(tXgdRes[None, None, None, 0])
 
-        copy_X = partial(cute.copy, copy_atom_load_X_async, pred=tXpX)
-        copy_dO = partial(cute.copy, copy_atom_load_dO_async, pred=tXpX)
-
         # Prefetch the first batch
         row = tXcX[None, None, None, bidx_start][0][0]
         if row < M:
             tXgX_cur = utils.coord_offset_i64(bidx_start, tXgX, dim=3)[None, None, None, 0]
             tXgdO_cur = utils.coord_offset_i64(bidx_start, tXgdO, dim=3)[None, None, None, 0]
-
-
+            copy(tXgX_cur, tXsX[None, None, None, 0], pred=tXpX, is_async=True)
+            copy(tXgdO_cur, tXsdO[None, None, None, 0], pred=tXpX, is_async=True)
         elif tiler_mn[0] > 1:
             # Fill with zero, otherwise smem will be uninitialized, and we could read this back
             # later into registers, causing wrong dW.
@@ -822,8 +755,8 @@ class RMSNormBackward(ReductionBase):
             if row + gdim * tiler_mn[0] < M:  # Prefetch the next batch
                 tXgX_cur = utils.coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
                 tXgdO_cur = utils.coord_offset_i64(bidx + gdim, tXgdO, dim=3)[None, None, None, 0]
-
-
+                copy(tXgX_cur, tXsX[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
+                copy(tXgdO_cur, tXsdO[None, None, None, stage ^ 1], pred=tXpX, is_async=True)
             elif tiler_mn[0] > 1:
                 utils.fill_oob(
                     tXsX[None, None, None, stage ^ 1],
@@ -842,7 +775,7 @@ class RMSNormBackward(ReductionBase):
             if const_expr(mdResO is not None):
                 tXgdResO_cur = utils.coord_offset_i64(bidx, tXgdResO, dim=3)[None, None, None, 0]
                 if row < M or tiler_mn[0] == 1:
-
+                    copy(tXgdResO_cur, tXrdResO, pred=tXpX)
                 elif tiler_mn[0] > 1:
                     tXrdResO.fill(0.0)
             cute.arch.cp_async_wait_group(1)
@@ -890,12 +823,12 @@ class RMSNormBackward(ReductionBase):
             tXrdX.store(dx.to(tXrdX.element_type))
             if row < M or tiler_mn[0] == 1:
                 tXgdX_cur = utils.coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
-
+                copy(tXrdX, tXgdX_cur, pred=tXpX)
             if const_expr(mdRes is not None):
                 tXrdRes.store(dx.to(tXrdRes.element_type))
                 tXgdRes_cur = utils.coord_offset_i64(bidx, tXgdRes, dim=3)[None, None, None, 0]
                 if row < M or tiler_mn[0] == 1:
-
+                    copy(tXrdRes, tXgdRes_cur, pred=tXpX)
             # Accumulate weight gradients in fp32
             tXrdW.store(tXrdW.load() + dout * x_hat)
             if const_expr(mdB is not None):
@@ -927,7 +860,7 @@ class RMSNormBackward(ReductionBase):
                 tXsdW_other = cute.make_tensor(tXsdW.iterator + i * sdW.stride[0], tXsdW.layout)
                 cute.autovec_copy(tXsdW_other, tXrdW_other)
                 tXrdW.store(tXrdW.load() + tXrdW_other.load())
-
+            copy(tXrdW, tXgdW, pred=tXpdW)
             cute.arch.barrier()
             if const_expr(mdB is not None):
                 sdB = cute.make_tensor(
@@ -948,12 +881,12 @@ class RMSNormBackward(ReductionBase):
                 )
                 cute.autovec_copy(tXsdB_other, tXrdB_other)
                 tXrdB.store(tXrdB.load() + tXrdB_other.load())
-
+                copy(tXrdB, tXgdB, pred=tXpdB)
         else:
             # dw is already in fp32, so we can directly copy to global memory
-
+            copy(tXrdW, tXgdW, pred=tXpdW)
             if const_expr(mdB is not None):
-
+                copy(tXrdB, tXgdB, pred=tXpdB)
 
 
 def _get_sm_count(N: int, device: torch.device) -> int:
@@ -1171,6 +1104,7 @@ class RMSNormFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dout, *args):
        x, weight, rstd = ctx.saved_tensors
+        assert weight is not None, "RMSNorm backward doesn't support weight=None yet"
        has_bias = ctx.has_bias
        if ctx.prenorm and ctx.residual_dtype is not None:
            dresidual_out = args[0]
@@ -1193,7 +1127,7 @@ class RMSNormFunction(torch.autograd.Function):
 
 def rmsnorm(
     x: Tensor,
-    weight: Tensor,
+    weight: Optional[Tensor] = None,
     bias: Optional[Tensor] = None,
     residual: Optional[Tensor] = None,
     out_dtype: Optional[torch.dtype] = None,
@@ -1205,7 +1139,7 @@ def rmsnorm(
 
     Args:
         x: Input tensor of shape (M, N)
-        weight:
+        weight: Optional weight tensor of shape (N,)
         eps: Small value for numerical stability
 
     Returns:
quack/tile_scheduler.py
CHANGED
@@ -135,7 +135,7 @@ class TileScheduler:
         ip=None,
     ):
         self._current_work_linear_idx = current_work_linear_idx
-        self._num_tiles_executed = num_tiles_executed
+        self.num_tiles_executed = num_tiles_executed
         self._tile_count = tile_count
         self._scheduler_pipeline = scheduler_pipeline
         self._pipeline_state = pipeline_state
@@ -251,7 +251,7 @@ class TileScheduler:
         )
         tile_coord_mnkl = (pid_m, pid_n, None, batch_idx)
         if const_expr(not params.is_persistent):
-            is_valid = self._num_tiles_executed == 0
+            is_valid = self.num_tiles_executed == 0
         else:
             is_valid = self._current_work_linear_idx < cute.size(params.problem_shape_ncluster_mnl)
         return cutlass.utils.WorkTileInfo(tile_coord_mnkl, is_valid)
@@ -276,38 +276,6 @@ class TileScheduler:
         current_work_linear_idx = cute.arch.shuffle_sync(current_work_linear_idx, 0)
         self._current_work_linear_idx = current_work_linear_idx
 
-    # We have to split broadcast_next_work and advance_to_next_work into two functions
-    # due to a bug in cute-dsl 4.2: https://github.com/NVIDIA/cutlass/issues/2647
-    @cute.jit
-    def broadcast_next_work(self, is_scheduler_warp: bool | Boolean = False, *, loc=None, ip=None):
-        """is_scheduler_warp should only be true for one warp in the whole cluster"""
-        params = self.params
-        if const_expr(params.is_persistent and params.tile_count_semaphore is not None):
-            current_work_linear_idx = self._current_work_linear_idx
-            if is_scheduler_warp:
-                self._scheduler_pipeline.producer_acquire(self._pipeline_state)
-                lane_idx = cute.arch.lane_idx()
-                if lane_idx < cute.size(params.cluster_shape_mn):
-                    # cute.printf("Producer bidx = {}, tidx = {}, after empty wait, idx = {}", bidx, tidx, current_work_linear_idx)
-                    if const_expr(cute.size(params.cluster_shape_mn) == 1):
-                        self._tile_count[self._pipeline_state.index] = current_work_linear_idx
-                        self._scheduler_pipeline.producer_commit(self._pipeline_state)
-                    else:
-                        peer_cta_rank_in_cluster = lane_idx
-                        mbar_ptr = self._scheduler_pipeline.producer_get_barrier(
-                            self._pipeline_state
-                        )
-                        cute.arch.mbarrier_arrive_and_expect_tx(
-                            mbar_ptr, 4, peer_cta_rank_in_cluster
-                        )
-                        utils.store_shared_remote(
-                            val=current_work_linear_idx,
-                            smem_ptr=self._tile_count.iterator + self._pipeline_state.index,
-                            mbar_ptr=mbar_ptr,
-                            peer_cta_rank_in_cluster=peer_cta_rank_in_cluster,
-                        )
-                # cute.printf("Producer bidx = {}, tidx = {}, after full arrive", bidx, tidx)
-
     @cute.jit
     def advance_to_next_work(
         self,
@@ -328,7 +296,30 @@ class TileScheduler:
         if const_expr(advance_count > 1):
             self._pipeline_state.advance_iters(advance_count - 1)
         current_work_linear_idx = self._current_work_linear_idx
-        if not is_scheduler_warp:
+        if is_scheduler_warp:
+            self._scheduler_pipeline.producer_acquire(self._pipeline_state)
+            lane_idx = cute.arch.lane_idx()
+            if lane_idx < cute.size(params.cluster_shape_mn):
+                # cute.printf("Producer bidx = {}, tidx = {}, after empty wait, idx = {}", bidx, tidx, current_work_linear_idx)
+                if const_expr(cute.size(params.cluster_shape_mn) == 1):
+                    self._tile_count[self._pipeline_state.index] = current_work_linear_idx
+                    self._scheduler_pipeline.producer_commit(self._pipeline_state)
+                else:
+                    peer_cta_rank_in_cluster = lane_idx
+                    mbar_ptr = self._scheduler_pipeline.producer_get_barrier(
+                        self._pipeline_state
+                    )
+                    cute.arch.mbarrier_arrive_and_expect_tx(
+                        mbar_ptr, 4, peer_cta_rank_in_cluster
+                    )
+                    utils.store_shared_remote(
+                        val=current_work_linear_idx,
+                        smem_ptr=self._tile_count.iterator + self._pipeline_state.index,
+                        mbar_ptr=mbar_ptr,
+                        peer_cta_rank_in_cluster=peer_cta_rank_in_cluster,
+                    )
+            # cute.printf("Producer bidx = {}, tidx = {}, after full arrive", bidx, tidx)
+        else:
             # if tidx % 64 == 0: cute.printf("bidx = {},tidx = {}, before full wait, idx = {}", bidx, tidx, current_work_linear_idx)
             self._scheduler_pipeline.consumer_wait(self._pipeline_state)
             # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after full wait, idx = {}", bidx, tidx, current_work_linear_idx)
@@ -341,21 +332,17 @@ class TileScheduler:
             # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after empty arrive", bidx, tidx)
         self._current_work_linear_idx = current_work_linear_idx
         self._pipeline_state.advance()
-        self._num_tiles_executed += Int32(advance_count)
+        self.num_tiles_executed += Int32(advance_count)
 
     def producer_tail(self):
         if const_expr(self.params.is_persistent and self.params.tile_count_semaphore is not None):
             self._scheduler_pipeline.producer_tail(self._pipeline_state)
 
-    @property
-    def num_tiles_executed(self) -> Int32:
-        return self._num_tiles_executed
-
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
         for obj in [
             self._current_work_linear_idx,
-            self._num_tiles_executed,
+            self.num_tiles_executed,
             self._tile_count,
             self._scheduler_pipeline,
             self._pipeline_state,
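With the read-only property gone, num_tiles_executed is now an ordinary attribute that the scheduler mutates directly. A minimal generic sketch of the before/after pattern (hypothetical class names, plain Python rather than the DSL):

```python
class SchedulerBefore:
    """Old shape: a private counter exposed through a read-only property."""

    def __init__(self, num_tiles_executed=0):
        self._num_tiles_executed = num_tiles_executed

    @property
    def num_tiles_executed(self):
        return self._num_tiles_executed


class SchedulerAfter:
    """New shape: a plain attribute that can be read and incremented in place."""

    def __init__(self, num_tiles_executed=0):
        self.num_tiles_executed = num_tiles_executed


s = SchedulerAfter()
s.num_tiles_executed += 1  # what advance_to_next_work now does with advance_count
assert s.num_tiles_executed == 1
```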
@@ -371,7 +358,7 @@ class TileScheduler:
         for obj, n_items in zip(
             [
                 self._current_work_linear_idx,
-                self._num_tiles_executed,
+                self.num_tiles_executed,
                 self._tile_count,
                 self._scheduler_pipeline,
                 self._pipeline_state,
@@ -562,7 +549,7 @@ class TriangularTileScheduler(TileScheduler):
             pid_n = cid_n * params.cluster_shape_mn[1] + bidx_in_cluster[1]
         tile_coord_mnkl = (pid_m, pid_n, None, bidz)
         if const_expr(not params.is_persistent):
-            is_valid = self._num_tiles_executed == 0
+            is_valid = self.num_tiles_executed == 0
         else:
             is_valid = (
                 self._current_work_linear_idx
@@ -681,7 +668,7 @@ class VarlenMTileScheduler(TileScheduler):
         ip=None,
     ):
         self._current_work_linear_idx = current_work_linear_idx
-        self._num_tiles_executed = num_tiles_executed
+        self.num_tiles_executed = num_tiles_executed
         self._current_batch_idx = current_batch_idx
         self._num_work_idx_before_cur_batch = num_work_idx_before_cur_batch
         self._tile_count = tile_count
@@ -878,7 +865,7 @@ class VarlenMTileScheduler(TileScheduler):
             pid_n = cid_n * params.cluster_shape_mn[1] + bidx_in_cluster[1]
         tile_coord_mnkl = (pid_m, pid_n, None, batch_idx)
         if const_expr(not params.is_persistent):
-            is_valid = self._num_tiles_executed == 0 and batch_idx < num_batch
+            is_valid = self.num_tiles_executed == 0 and batch_idx < num_batch
         else:
             is_valid = batch_idx < num_batch
         return cutlass.utils.WorkTileInfo(tile_coord_mnkl, is_valid)
@@ -905,7 +892,7 @@ class VarlenMTileScheduler(TileScheduler):
         values, self._values_pos = [], []
         for obj in [
             self._current_work_linear_idx,
-            self._num_tiles_executed,
+            self.num_tiles_executed,
             self._current_batch_idx,
             self._num_work_idx_before_cur_batch,
             self._tile_count,
@@ -923,7 +910,7 @@ class VarlenMTileScheduler(TileScheduler):
         for obj, n_items in zip(
             [
                 self._current_work_linear_idx,
-                self._num_tiles_executed,
+                self.num_tiles_executed,
                 self._current_batch_idx,
                 self._num_work_idx_before_cur_batch,
                 self._tile_count,