quack-kernels 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quack/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.1.5"
+ __version__ = "0.1.7"
 
  from quack.rmsnorm import rmsnorm
  from quack.softmax import softmax
quack/cross_entropy.py CHANGED
@@ -104,7 +104,10 @@ class CrossEntropy(ReductionBase):
  shape: cute.Shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, idX)]
+ # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
+ mX_off = utils.domain_offset_i64((bidx * tiler_mn[0], 0), mX)
+ gX = cute.local_tile(mX_off, tiler_mn, (0, cluster_y))
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
 
  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -150,7 +153,9 @@ class CrossEntropy(ReductionBase):
 
  target_logit = cute.Float32.zero
  if row < shape[0] and tXcX[0][1] == 0:
- target_logit = cute.Float32(mX[row, target])
+ # Use Int64 for indexing to deal with large tensors
+ mX_off = utils.domain_offset_i64((row, 0), mX)
+ target_logit = cute.Float32(mX_off[0, target])
 
  threads_per_row = tv_layout.shape[0][0]
  if cutlass.const_expr(not self.online_softmax):
@@ -363,11 +368,10 @@ class CrossEntropyBackward:
  )
 
  idX = cute.make_identity_tensor(shape)
-
- gX, gdX, cX, gTarget, gDLoss, gLse = [
- cute.local_tile(mT, tiler_mn, (bidx, bidy))
- for mT in (mX, mdX, idX, mTarget, mDLoss, mLSE)
- ]
+ # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
+ mX, mdX = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mdX)]
+ gX, gdX = [cute.local_tile(mT, tiler_mn, (0, bidy)) for mT in (mX, mdX)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, bidy))
 
  copy_atom_load_X = cute.make_copy_atom(
  cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128
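The hunks above stop indexing `mX` directly with 32-bit coordinates and instead rebase the tensor through `utils.domain_offset_i64` before tiling: the flattened element offset `row * stride` no longer fits in a signed 32-bit integer once the logits tensor holds more than 2^31 elements. A quick worked example of where that breaks (shapes are hypothetical, not taken from the diff):

```python
# Illustrative only: why 32-bit offsets wrap for large logits tensors.
M, N = 8_388_608, 512                 # 8M rows x 512 classes = 2**32 elements
row = M - 1
offset = row * N                      # exact value in Python: 4_294_966_784
offset_i32 = (offset + 2**31) % 2**32 - 2**31   # what a wrapped int32 would hold
print(offset, offset_i32)             # 4294966784 vs -512
```

Computing the offset in Int64 and folding it into the base pointer, as `domain_offset_i64` does, keeps the per-tile coordinates small while the base address stays exact.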
quack/layernorm.py ADDED
@@ -0,0 +1,351 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
+
+ import torch
+ from typing import Optional
+
+ import cuda.bindings.driver as cuda
+
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass.cute.runtime import from_dlpack
+ import quack.utils as utils
+ from quack.reduction_base import ReductionBase, torch2cute_dtype_map
+
+
+ class LayerNorm(ReductionBase):
+ def __init__(self, dtype: cutlass.Numeric, N: int):
+ super().__init__(dtype, N, stage=2) # 2 stages for mean and var
+ self.reload_from = None if N <= 16384 else "smem"
+ self.delay_w_load = False
+
+ def _calculate_threads_per_row(self):
+ N = self.N
+ return (
+ 8
+ if N <= 64
+ else (
+ 16
+ if N <= 128
+ else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ )
+ )
+
+ def _set_cluster_n(self):
+ N = self.N
+ # cluster_n = 4 is faster and cluster_n = 2 for N=64k for some reason
+ # Similarly cluster_n = 8 is faster for N=128k
+ if cutlass.const_expr(self.dtype.width == 16):
+ cluster_n = (
+ 1
+ if N <= 16 * 1024
+ else (
+ 2
+ if N <= 32 * 1024
+ else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
+ )
+ )
+ else: # fp32
+ cluster_n = (
+ 1
+ if N <= 32 * 1024
+ else (
+ 2
+ if N <= 64 * 1024
+ else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
+ )
+ )
+ self.cluster_n = cluster_n
+
+ @cute.jit
+ def __call__(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ stream: cuda.CUstream,
+ eps: cutlass.Float32 = 1e-6,
+ ):
+ assert mX.element_type == self.dtype
+ assert mO.element_type == self.dtype
+ self._set_cluster_n()
+ tiler_mn, tv_layout = self._get_tv_layout()
+ num_threads = cute.size(tv_layout, mode=[0])
+ num_warps = num_threads // cute.arch.WARP_SIZE
+ mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
+ mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
+ if cutlass.const_expr(mRstd is not None):
+ mRstd_expanded_layout = cute.append(
+ mRstd.layout, cute.make_layout((self.N,), stride=(0,))
+ )
+ mRstd = cute.make_tensor(mRstd.iterator, mRstd_expanded_layout)
+ if cutlass.const_expr(mMean is not None):
+ mMean_expanded_layout = cute.append(
+ mMean.layout, cute.make_layout((self.N,), stride=(0,))
+ )
+ mMean = cute.make_tensor(mMean.iterator, mMean_expanded_layout)
+ self.kernel(mX, mW, mO, mRstd, mMean, eps, tv_layout, tiler_mn, self.reload_from).launch(
+ grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+ block=[num_threads, 1, 1],
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
+ smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ stream=stream,
+ )
+
+ @cute.kernel
+ def kernel(
+ self,
+ mX: cute.Tensor,
+ mW: cute.Tensor,
+ mO: cute.Tensor,
+ mRstd: Optional[cute.Tensor],
+ mMean: Optional[cute.Tensor],
+ eps: cute.Float32,
+ tv_layout: cute.Layout,
+ tiler_mn: cute.Shape,
+ reload_from: cutlass.Constexpr = None,
+ delay_w_load: cutlass.Constexpr = False,
+ ):
+ tidx, _, _ = cute.arch.thread_idx()
+ bidx, _, _ = cute.arch.block_idx()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)
+
+ smem = cutlass.utils.SmemAllocator()
+ sX = smem.allocate_tensor(
+ mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+ )
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+
+ shape = mX.shape
+ idX = cute.make_identity_tensor(shape)
+ # slice for CTAs
+ # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
+ mX, mO = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
+ gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
+ gRstd = (
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
+ if cutlass.const_expr(mRstd is not None)
+ else None
+ )
+ gMean = (
+ cute.local_tile(mMean, tiler_mn, (bidx, cluster_y))
+ if cutlass.const_expr(mMean is not None)
+ else None
+ )
+
+ # declare the atoms which will be used later for memory copy
+ copy_atom_load_X = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=128
+ )
+ copy_atom_load_X_async = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128
+ )
+ copy_atom_load_W = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=128
+ )
+ copy_atom_store_O = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=128
+ )
+
+ thr_copy_X = cute.make_tiled_copy(copy_atom_load_X_async, tv_layout, tiler_mn).get_slice(
+ tidx
+ )
+ thr_copy_W = cute.make_tiled_copy(copy_atom_load_W, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
+
+ tWgW = thr_copy_W.partition_S(gW)
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXgO = thr_copy_O.partition_D(gO)
+ tXrRstd = thr_copy_O.partition_D(gRstd) if cutlass.const_expr(mRstd is not None) else None
+ tXrMean = thr_copy_O.partition_D(gMean) if cutlass.const_expr(mMean is not None) else None
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
+
+ # allocate fragments for gmem->rmem
+ tWrW = cute.make_fragment_like(tWgW)
+ tXrW = thr_copy_X.retile(tWrW)
+ tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
+
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
+ tXpX = utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ row = tXcX[0][0]
+ if row < shape[0]:
+ cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
+ cute.arch.cp_async_commit_group()
+
+ tWpW = utils.predicate_k(thr_copy_W.partition_S(cX), limit=shape[1])
+ if cutlass.const_expr(not delay_w_load):
+ cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
+
+ cute.arch.cp_async_wait_group(0)
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(cute.Float32)
+ threads_per_row = tv_layout.shape[0][0]
+ sum_x = utils.row_reduce(
+ x,
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 0],
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ )
+ mean = sum_x / shape[1]
+ if cutlass.const_expr(reload_from == "smem"):
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(cute.Float32)
+ elif cutlass.const_expr(reload_from == "gmem"):
+ cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ x = tXrX.load().to(cute.Float32)
+
+ sum_sq_x_sub_mean = utils.row_reduce(
+ (x - mean) * (x - mean),
+ cute.ReductionOp.ADD,
+ threads_per_row,
+ reduction_buffer[None, None, 1],
+ mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
+ init_val=0.0,
+ )
+ rstd = utils.rsqrt(sum_sq_x_sub_mean / shape[1] + eps)
+ if cutlass.const_expr(mRstd is not None):
+ # Only the thread corresponding to column 0 writes out the rstd to gmem
+ if (
+ tXcX[0][1] == 0
+ and row < shape[0]
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ tXrRstd[0] = rstd
+ if cutlass.const_expr(mMean is not None):
+ # Only the thread corresponding to column 0 writes out the mean to gmem
+ if (
+ tXcX[0][1] == 0
+ and row < shape[0]
+ and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+ ):
+ tXrMean[0] = mean
+ if cutlass.const_expr(delay_w_load):
+ cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
+ if cutlass.const_expr(reload_from == "smem"):
+ cute.autovec_copy(tXsX, tXrX)
+ x = tXrX.load().to(cute.Float32)
+ elif cutlass.const_expr(reload_from == "gmem"):
+ cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
+ x = tXrX.load().to(cute.Float32)
+ x_hat = (x - mean) * rstd
+ w = tXrW.load().to(cute.Float32)
+ y = x_hat * w
+ tXrO.store(y.to(tXrO.element_type))
+ tOpO = utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
+ if row < shape[0]:
+ cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+
+
+ def layernorm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ eps: float = 1e-6,
+ return_rstd: bool = False,
+ return_mean: bool = False,
+ ) -> torch.Tensor:
+ """LayerNorm forward pass.
+
+ Args:
+ x: Input tensor of shape (M, N)
+ weight: Weight tensor of shape (N,)
+ eps: Small value for numerical stability
+ return_rstd: Whether to return the reciprocal standard deviation
+ return_mean: Whether to return the mean
+
+ Returns:
+ Normalized output tensor of same shape as x
+ If return_rstd is True, also returns rstd tensor of shape (M,)
+ If return_mean is True, also returns mean tensor of shape (M,)
+ """
+ assert x.dim() == 2, "Input must be 2D"
+ assert weight.dim() == 1, "Weight must be 1D"
+ assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
+ assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
+ assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
+ assert weight.dtype == torch.float32, "Weight must be float32"
+ M, N = x.shape
+ device = x.device
+ out = torch.empty_like(x)
+ rstd = torch.empty(M, device=device, dtype=torch.float32) if return_rstd else None
+ mean = torch.empty(M, device=device, dtype=torch.float32) if return_mean else None
+ dtype = torch2cute_dtype_map[x.dtype]
+ convert_from_dlpack = lambda x: (
+ from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
+ mode=0, stride_order=(0, 1)
+ )
+ )
+ x_tensor, out_tensor = [
+ # utils.convert_from_dlpack(t, leading_dim=t.ndim - 1, divisibility=128 // dtype.width)
+ convert_from_dlpack(t)
+ for t in (x, out)
+ ]
+ weight_tensor = utils.convert_from_dlpack(
+ weight.detach(), leading_dim=0, divisibility=128 // cutlass.Float32.width
+ )
+ rstd_tensor = (
+ from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ if rstd is not None
+ else None
+ )
+ mean_tensor = (
+ from_dlpack(mean.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+ if mean is not None
+ else None
+ )
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+ compile_key = (dtype, N, rstd is not None, mean is not None)
+ if compile_key not in layernorm.compile_cache:
+ rmsnorm_op = LayerNorm(dtype, N)
+ layernorm.compile_cache[compile_key] = cute.compile(
+ rmsnorm_op,
+ x_tensor,
+ weight_tensor,
+ out_tensor,
+ rstd_tensor,
+ mean_tensor,
+ current_stream,
+ )
+ layernorm.compile_cache[compile_key](
+ x_tensor, weight_tensor, out_tensor, rstd_tensor, mean_tensor, current_stream, eps
+ )
+ return (
+ (out, rstd, mean)
+ if return_mean and return_rstd
+ else (
+ (out, rstd)
+ if return_rstd and not return_mean
+ else ((out, mean) if return_mean and not return_rstd else (out))
+ )
+ )
+
+
+ layernorm.compile_cache = {}
+
+
+ def layernorm_ref(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6):
+ x_f32 = x.float()
+ return torch.nn.functional.layer_norm(x_f32, w.shape, w, None, eps).to(x.dtype)
+
+
+ def rstd_ref(x: torch.Tensor, eps: float = 1e-6):
+ x_f32 = x.float()
+ mean = x_f32.mean(dim=-1, keepdim=True)
+ var = ((x_f32 - mean) ** 2).mean(dim=-1)
+ return 1.0 / torch.sqrt(var + eps)
+
+
+ def mean_ref(x: torch.Tensor) -> torch.Tensor:
+ return x.float().mean(dim=-1)
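For reference, a minimal usage sketch of the new `layernorm` entry point, based on the docstring and assertions above (2D CUDA input in fp16/bf16/fp32, 1D fp32 weight). Shapes, dtype, and tolerances are illustrative and assume the wheel is installed on a machine with a CUDA GPU:

```python
# Hedged usage sketch for quack.layernorm.layernorm (added in 0.1.7).
import torch
from quack.layernorm import layernorm, layernorm_ref

x = torch.randn(4096, 8192, device="cuda", dtype=torch.bfloat16)
w = torch.randn(8192, device="cuda", dtype=torch.float32)   # weight must be fp32

# Returns (out, rstd, mean) when both flags are set, per the return logic above.
out, rstd, mean = layernorm(x, w, eps=1e-6, return_rstd=True, return_mean=True)
torch.testing.assert_close(out, layernorm_ref(x, w), atol=2e-2, rtol=2e-2)
```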
quack/reduction_base.py CHANGED
@@ -68,7 +68,7 @@ class ReductionBase:
  )
 
  def _allocate_reduction_buffer_and_mbar(
- self, smem: cutlass.utils.SmemAllocator, tv_layout: cute.Layout
+ self, smem: cutlass.utils.SmemAllocator, tv_layout: cute.Layout, is_persistent: bool = False
  ) -> Tuple[cute.Tensor, Optional[cute.Pointer]]:
  reduction_buffer = smem.allocate_tensor(
  self.reduction_dtype,
@@ -76,20 +76,28 @@
  byte_alignment=4,
  )
  if cutlass.const_expr(self.cluster_n > 1):
- mbar_ptr = smem.allocate_array(cutlass.Int64, num_elems=self.stage)
+ mbar_ptr = smem.allocate_array(
+ cutlass.Int64, num_elems=self.stage if not is_persistent else self.stage * 2
+ )
  else:
  mbar_ptr = None
  return reduction_buffer, mbar_ptr
 
  @cute.jit
- def _initialize_cluster(self, tidx: cutlass.Int32, mbar_ptr: cute.Pointer, num_warps: int):
+ def _initialize_cluster(
+ self,
+ tidx: cutlass.Int32,
+ mbar_ptr: cute.Pointer,
+ num_warps: int,
+ is_persistent: bool = False,
+ ):
  if cutlass.const_expr(self.cluster_n > 1):
- if tidx < self.stage:
+ if tidx < self.stage: # Initialize full barrier
  cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
+ if cutlass.const_expr(is_persistent): # Initialize empty barrier
+ cute.arch.mbarrier_init(
+ mbar_ptr + self.stage + tidx, num_warps * self.cluster_n
+ )
  cute.arch.mbarrier_init_fence()
- if tidx < self.stage:
- cute.arch.mbarrier_arrive_and_expect_tx(
- mbar_ptr + tidx, num_warps * self.cluster_n * self.reduction_dtype.width // 8
- )
  # Cluster arrive after barrier init
  cute.arch.cluster_arrive_relaxed()
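With `is_persistent=True`, the changes above allocate and initialize two mbarriers per stage (a "full" set and an "empty" set) instead of one, and the `mbarrier_arrive_and_expect_tx` call moves out of initialization and into the reduction itself (see the `cluster_reduce` change in quack/utils.py below). The persistent RMSNormBackward kernel then reuses these barriers across a grid-stride loop by toggling a stage index and two phase bits. A conceptual, host-side sketch of that bookkeeping (plain Python, no GPU; the helper name is ours, not part of quack):

```python
# Models the `stage ^= 1` / phase-flip pattern used by RMSNormBackward.kernel.
def barrier_schedule(num_iters: int):
    stage, consumer_phase, producer_phase = 0, 0, 1
    for it in range(num_iters):
        # Per iteration: wait on empty[stage] with producer_phase before refilling
        # the buffer, and on full[stage] with consumer_phase before reading it.
        yield it, stage, consumer_phase, producer_phase
        stage ^= 1
        if stage == 0:               # both stages used once -> flip both phases
            consumer_phase ^= 1
            producer_phase ^= 1

print(list(barrier_schedule(4)))
# [(0, 0, 0, 1), (1, 1, 0, 1), (2, 0, 1, 0), (3, 1, 1, 0)]
```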
quack/rmsnorm.py CHANGED
@@ -1,6 +1,5 @@
  # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
 
-
  import torch
  from typing import Optional
 
@@ -117,7 +116,10 @@ class RMSNorm(ReductionBase):
  shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
+ # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
+ mX, mO = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
+ gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
  gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
  gRstd = (
  cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
@@ -154,6 +156,7 @@ class RMSNorm(ReductionBase):
 
  # allocate fragments for gmem->rmem
  tWrW = cute.make_fragment_like(tWgW)
+ tWrW.fill(0.0)
  tXrW = thr_copy_X.retile(tWrW)
  tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
 
@@ -297,8 +300,14 @@ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
 
  class RMSNormBackward(ReductionBase):
  def __init__(self, dtype: cutlass.Numeric, N: int):
- # 1 stage for computing mean of x_hat * wdy
- super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Float32)
+ # 2 stages for double buffering when computing mean of x_hat * wdy
+ super().__init__(dtype, N, stage=2, reduction_dtype=cutlass.Float32)
+ if self.N > 128 * 1024 and self.dtype.width >= 32:
+ # Not enough smem
+ raise ValueError("RMSNormBackward does not support N > 128k with dtype >= 32 bits")
+
+ def _get_num_threads(self):
+ return 128 if self.N <= 4096 else 256
 
  def _calculate_threads_per_row(self):
  N = self.N
@@ -308,44 +317,38 @@ class RMSNormBackward(ReductionBase):
  else (
  16
  if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
+ else (32 if N <= 256 else (64 if N <= 512 else (128 if N <= 4096 else 256)))
  )
  )
 
  def _set_cluster_n(self):
  N = self.N
- if cutlass.const_expr(self.dtype.width == 16):
- cluster_n = (
- 1
- if N <= 16 * 1024
- else (
- 2
- if N <= 32 * 1024
- else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
- )
- )
- else: # fp32
- cluster_n = (
- 1
- if N <= 32 * 1024
- else (
- 2
- if N <= 64 * 1024
- else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
- )
- )
+ cluster_n = (
+ 1
+ if N <= 8 * 1024
+ else (2 if N <= 16 * 1024 else (4 if N <= 32 * 1024 else (8 if N <= 64 * 1024 else 16)))
+ )
  self.cluster_n = cluster_n
 
+ def _smem_size_in_bytes(self, tiler_mn, num_warps):
+ return (
+ # Multiply by 2 since we need space for X and dOut,
+ # and multiply by another 2 due to double buffering
+ cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2 * 2
+ + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
+ + self.stage * (cutlass.Int64.width // 8) * 2 # mult 2 as we need 2 mbar per stage
+ )
+
  @cute.jit
  def __call__(
  self,
  mX: cute.Tensor,
  mW: cute.Tensor,
- mDout: cute.Tensor,
+ mdOut: cute.Tensor,
  mRstd: cute.Tensor,
- mDx: cute.Tensor,
- mDw: cute.Tensor,
- sm_count: cutlass.Constexpr,
+ mdX: cute.Tensor,
+ mdW: cute.Tensor,
+ sm_count: cutlass.Int32,
  stream: cuda.CUstream,
  ):
  self._set_cluster_n()
@@ -356,14 +359,8 @@ class RMSNormBackward(ReductionBase):
  mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
  mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
 
- mRstd_expanded_layout = cute.append(mRstd.layout, cute.make_layout((self.N,), stride=(0,)))
- mRstd = cute.make_tensor(mRstd.iterator, mRstd_expanded_layout)
-
- num_blocks = (
- sm_count if tiler_mn[0] == 1 else min(sm_count, cute.ceil_div(1024, tiler_mn[0]))
- )
-
- self.kernel(mX, mW, mDout, mRstd, mDx, mDw, sm_count, tv_layout, tiler_mn).launch(
+ num_blocks = sm_count
+ self.kernel(mX, mW, mdOut, mRstd, mdX, mdW, tv_layout, tiler_mn).launch(
  grid=[num_blocks, self.cluster_n, 1],
  block=[num_threads, 1, 1],
  cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
@@ -376,177 +373,244 @@ class RMSNormBackward(ReductionBase):
  self,
  mX: cute.Tensor,
  mW: cute.Tensor,
- mDout: cute.Tensor,
+ mdOut: cute.Tensor,
  mRstd: cute.Tensor,
- mDx: cute.Tensor,
- mDw: cute.Tensor,
- sm_count: cutlass.Constexpr,
+ mdX: cute.Tensor,
+ mdW: cute.Tensor,
  tv_layout: cute.Layout,
  tiler_mn: cute.Shape,
  ):
  tidx, _, _ = cute.arch.thread_idx()
- bidx, cluster_y, _ = cute.arch.block_idx()
+ bidx_start, _, _ = cute.arch.block_idx()
  gdim, _, _ = cute.arch.grid_dim()
+ if cutlass.const_expr(self.cluster_n > 1):
+ cluster_y = cute.arch.block_idx()[1]
+ else:
+ cluster_y = cutlass.const_expr(0)
 
  shape = mX.shape
  M, N = shape[0], shape[1]
+ is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
 
  idX = cute.make_identity_tensor(shape)
 
  smem = cutlass.utils.SmemAllocator()
- reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+ smem_layout = cute.make_ordered_layout((tiler_mn[0], tiler_mn[1], 2), order=(1, 0, 2))
+ sX = smem.allocate_tensor(mX.element_type, smem_layout, byte_alignment=16)
+ sdOut = smem.allocate_tensor(mdOut.element_type, smem_layout, byte_alignment=16)
+ reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(
+ smem, tv_layout, is_persistent=True
+ )
+ if cutlass.const_expr(mbar_ptr is not None):
+ mbar_full_ptr, mbar_empty_ptr = mbar_ptr, mbar_ptr + 2
+ else:
+ mbar_full_ptr, mbar_empty_ptr = None, None
 
  copy_atom_load_X = cute.make_copy_atom(
  cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=128
  )
-
+ copy_atom_load_X_async = cute.make_copy_atom(
+ cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128
+ )
  copy_atom_load_W = cute.make_copy_atom(
  cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=128
  )
-
  copy_atom_store_dX = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mDx.element_type, num_bits_per_copy=128
+ cute.nvgpu.CopyUniversalOp(), mdX.element_type, num_bits_per_copy=128
  )
-
- copy_atom_dw = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), mDw.element_type, num_bits_per_copy=128
+ copy_atom_store_dW = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(), mdW.element_type, num_bits_per_copy=128
  )
 
  thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_X_async = cute.make_tiled_copy(
+ copy_atom_load_X_async, tv_layout, tiler_mn
+ ).get_slice(tidx)
  thr_copy_W = cute.make_tiled_copy(copy_atom_load_W, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_dw = cute.make_tiled_copy(copy_atom_dw, tv_layout, tiler_mn).get_slice(tidx)
- thr_store_dx = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_dW = cute.make_tiled_copy(copy_atom_store_dW, tv_layout, tiler_mn).get_slice(tidx)
+ thr_store_dX = cute.make_tiled_copy(copy_atom_store_dX, tv_layout, tiler_mn).get_slice(tidx)
 
- gW = cute.local_tile(mW, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
  tWgW = thr_copy_W.partition_S(gW)
  tWrW = cute.make_fragment_like(tWgW)
+ # Need this, otherwise rW can have arbitrary values that changes the reduction
+ if not is_even_N:
+ tWrW.fill(0.0)
  tXrW = thr_copy_X.retile(tWrW)
 
- gW_coord = cute.local_tile(idX, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
-
- tWpW = utils.predicate_k(thr_copy_W.partition_S(gW_coord), limit=shape[1])
+ gW_coord = cute.local_tile(idX, tiler_mn, (0, cluster_y))
+ tWpW = (
+ utils.predicate_k(thr_copy_W.partition_S(gW_coord), limit=shape[1])
+ if not is_even_N
+ else None
+ )
  cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
  weight = tXrW.load().to(cute.Float32)
 
  num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
 
- self._initialize_cluster(tidx, mbar_ptr, num_warps)
-
- dw_coord = cute.local_tile(idX, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
- tDwpDw = utils.predicate_k(thr_copy_dw.partition_S(dw_coord), limit=shape[1])
+ self._initialize_cluster(tidx, mbar_ptr, num_warps, is_persistent=True)
 
- gDw = cute.local_tile(mDw, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
- tDwgDw = thr_copy_dw.partition_D(gDw)
- tDwrDw = cute.make_fragment_like(tDwgDw)
- dw_accumulator = thr_copy_X.retile(tDwrDw)
- dw_accumulator.fill(0.0)
-
- M_pad = ((M + sm_count - 1) // sm_count) * sm_count
+ dw_coord = cute.local_tile(idX, tiler_mn, (0, cluster_y))
+ tdWpdW = (
+ utils.predicate_k(thr_copy_dW.partition_S(dw_coord), limit=shape[1])
+ if not is_even_N
+ else None
+ )
 
- jump = sm_count if tiler_mn[0] == 1 else min(sm_count, cute.ceil_div(1024, tiler_mn[0]))
+ gdW = cute.local_tile(mdW, (1, tiler_mn[1]), (bidx_start, cluster_y))
+ tdWgdW = thr_copy_dW.partition_D(gdW)
+ tdWrdW = cute.make_fragment_like(tdWgdW, cutlass.Float32)
+ tXrdW = thr_copy_X.retile(tdWrdW)
 
- if cutlass.const_expr(self.cluster_n > 1):
- cute.arch.cluster_arrive()
- cute.arch.cluster_wait()
+ gX = cute.local_tile(mX, tiler_mn, (None, cluster_y))
+ gdOut = cute.local_tile(mdOut, tiler_mn, (None, cluster_y))
+ gdX = cute.local_tile(mdX, tiler_mn, (None, cluster_y))
+ cX = cute.local_tile(idX, tiler_mn, (None, cluster_y))
+ tXgX = thr_copy_X.partition_S(gX)
+ tXsX = thr_copy_X.partition_D(sX)
+ tXgdOut = thr_copy_X.partition_S(gdOut)
+ tXsdOut = thr_copy_X.partition_D(sdOut)
+ tXgdX = thr_store_dX.partition_D(gdX)
+ tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None, None]
+ # This doesn't change across iterations
+ tXpX = (
+ utils.predicate_k(thr_copy_X.partition_S(cX[None, None, 0]), limit=shape[1])
+ if not is_even_N
+ else None
+ )
 
- ## need to update range_dynamic since it will be deprecated soon
- for row_offset in cutlass.range_dynamic(bidx, M_pad, jump):
- gX = cute.local_tile(
- mX, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ tXrX, tXrdOut, tXrdX = [
+ cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdOut, tXgdX)
+ ]
+
+ # Prefetch the first batch
+ row = tXcX[None, None, None, bidx_start][0][0]
+ if row < M:
+ tXgX_cur = utils.coord_offset_i64(bidx_start, tXgX, dim=3)[None, None, None, 0]
+ tXgdOut_cur = utils.coord_offset_i64(bidx_start, tXgdOut, dim=3)[None, None, None, 0]
+ cute.copy(
+ copy_atom_load_X_async,
+ tXgX_cur,
+ tXsX[None, None, None, 0],
+ pred=tXpX,
  )
- gDout = cute.local_tile(
- mDout, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
+ cute.copy(
+ copy_atom_load_X_async,
+ tXgdOut_cur,
+ tXsdOut[None, None, None, 0],
+ pred=tXpX,
  )
- gRstd = cute.local_tile(
- mRstd, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
- )
- gDx = cute.local_tile(
- mDx, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
- )
- cX = cute.local_tile(
- idX, tiler_mn, (row_offset, 0 if self.cluster_n == 1 else cluster_y)
- )
-
- tXgX = thr_copy_X.partition_S(gX)
- thrDout = thr_copy_X.partition_S(gDout)
- tXrRstd = thr_copy_W.partition_S(gRstd)
- thrDx = thr_store_dx.partition_D(gDx)
- tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
-
- tXrX, frgDout, frgDx = [cute.make_fragment_like(thr) for thr in (tXgX, thrDout, thrDx)]
-
- tXpX = utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
+ elif tiler_mn[0] > 1:
+ # Fill with zero, otherwise smem will be uninitialized, and we could read this back
+ # later into registers, causing wrong dW.
+ utils.fill_oob(tXsX[None, None, None, 0], None, fill_value=mX.element_type.zero)
+ utils.fill_oob(tXsdOut[None, None, None, 0], None, fill_value=mdOut.element_type.zero)
+ cute.arch.cp_async_commit_group()
 
- if tXcX[0][0] < shape[0]:
- cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
- cute.copy(copy_atom_load_X, thrDout, frgDout, pred=tXpX)
+ if cutlass.const_expr(self.cluster_n > 1):
+ cute.arch.cluster_wait()
 
+ threads_per_row = tv_layout.shape[0][0]
+ tXrdW.fill(0.0)
+ stage = cutlass.Int32(0)
+ producer_phase = cutlass.Int32(1)
+ consumer_phase = cutlass.Int32(0)
+ for bidx in cutlass.range(bidx_start, cute.ceil_div(M, tiler_mn[0]), gdim):
+ row = tXcX[None, None, None, bidx][0][0]
+ rstd = cutlass.Float.zero
+ if row + gdim * tiler_mn[0] < M: # Prefetch the next batch
+ tXgX_cur = utils.coord_offset_i64(bidx + gdim, tXgX, dim=3)[None, None, None, 0]
+ tXgdOut_cur = utils.coord_offset_i64(bidx + gdim, tXgdOut, dim=3)[
+ None, None, None, 0
+ ]
+ cute.copy(
+ copy_atom_load_X_async,
+ tXgX_cur,
+ tXsX[None, None, None, stage ^ 1],
+ pred=tXpX,
+ )
+ cute.copy(
+ copy_atom_load_X_async,
+ tXgdOut_cur,
+ tXsdOut[None, None, None, stage ^ 1],
+ pred=tXpX,
+ )
+ elif tiler_mn[0] > 1:
+ utils.fill_oob(
+ tXsX[None, None, None, stage ^ 1], None, fill_value=mX.element_type.zero
+ )
+ utils.fill_oob(
+ tXsdOut[None, None, None, stage ^ 1], None, fill_value=mdOut.element_type.zero
+ )
+ cute.arch.cp_async_commit_group()
+ if row < M or tiler_mn[0] == 1:
+ rstd = mRstd[row]
+ cute.arch.cp_async_wait_group(1)
+ cute.autovec_copy(tXsX[None, None, None, stage], tXrX)
  x = tXrX.load().to(cute.Float32)
- dout = frgDout.load().to(cute.Float32)
-
- rstd = tXrRstd[0]
+ cute.autovec_copy(tXsdOut[None, None, None, stage], tXrdOut)
+ dout = tXrdOut.load().to(cute.Float32)
  x_hat = x * rstd
  wdy = dout * weight
-
- threads_per_row = tv_layout.shape[0][0]
-
- row = tXcX[0][0]
  if cutlass.const_expr(self.cluster_n > 1):
- cute.arch.cluster_arrive()
- cute.arch.cluster_wait()
- else:
- cute.arch.barrier()
-
+ cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
  mean_xhat_wdy = (
  utils.row_reduce(
  x_hat * wdy,
  cute.ReductionOp.ADD,
  threads_per_row,
- reduction_buffer[None, None, 0],
- mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
+ reduction_buffer[None, None, stage],
+ mbar_full_ptr + stage if cutlass.const_expr(self.cluster_n > 1) else None,
+ phase=consumer_phase,
  init_val=0.0,
- hook_fn=cute.arch.cluster_wait
- if cutlass.const_expr(self.cluster_n > 1)
- else None,
  )
  / shape[1]
  )
-
- dx = (wdy - x_hat * mean_xhat_wdy) * rstd
- frgDx.store(dx.to(frgDout.element_type))
-
- if row < M:
- cute.copy(copy_atom_store_dX, frgDx, thrDx, pred=tXpX)
-
  if cutlass.const_expr(self.cluster_n > 1):
- cute.arch.cluster_arrive()
- cute.arch.cluster_wait()
- else:
- cute.arch.barrier()
-
- if row < M:
- dw_row = dout * x_hat
- current_dw = dw_accumulator.load().to(cute.Float32)
- updated_dw = current_dw + dw_row
- dw_accumulator.store(updated_dw.to(dw_accumulator.element_type))
-
- """
- if cutlass.const_expr(self.cluster_n > 1):
- cute.arch.cluster_arrive()
- cute.arch.cluster_wait()
- else:
- cute.arch.barrier()
- """
- """
- if cutlass.const_expr(self.cluster_n > 1):
- cute.arch.cluster_arrive()
- cute.arch.cluster_wait()
- else:
+ # It's faster to have 1 lane per warp to signal the mbar, rather than all lanes
+ # Requires adjusting the thread_count when initializing the mbar
+ cute.arch.sync_warp()
+ lane_idx = cute.arch.lane_idx()
+ if lane_idx < self.cluster_n:
+ cute.arch.mbarrier_arrive(
+ mbar_empty_ptr + stage, peer_cta_rank_in_cluster=lane_idx
+ )
+ dx = (wdy - x_hat * mean_xhat_wdy) * rstd
+ tXrdX.store(dx.to(tXrdOut.element_type))
+ if row < M or tiler_mn[0] == 1:
+ tXgdX_cur = utils.coord_offset_i64(bidx, tXgdX, dim=3)[None, None, None, 0]
+ cute.copy(copy_atom_store_dX, tXrdX, tXgdX_cur, pred=tXpX)
+ tXrdW.store(tXrdW.load() + dout * x_hat)
+ stage ^= 1
+ if stage == 0:
+ consumer_phase ^= 1
+ producer_phase ^= 1
+
+ if cutlass.const_expr(self.cluster_n > 1): # Prevent cluster from exiting early
+ cute.arch.mbarrier_wait(mbar_empty_ptr + stage, producer_phase)
+
+ if cutlass.const_expr(tiler_mn[0] > 1):
+ # reduction of dw_partial within the same threadblock
+ sdW = cute.make_tensor(
+ cute.recast_ptr(sX.iterator, dtype=cute.Float32),
+ cute.make_ordered_layout(tiler_mn, order=(1, 0)),
+ )
+ tXsdW = thr_copy_X.partition_D(sdW)
  cute.arch.barrier()
- """
-
- cute.autovec_copy(dw_accumulator, tDwrDw)
- cute.copy(copy_atom_dw, tDwrDw, tDwgDw, pred=tDwpDw)
+ row = tXcX[None, None, None, 0][0][0]
+ if row > 0:
+ cute.autovec_copy(tXrdW, tXsdW)
+ cute.arch.barrier()
+ if row == 0:
+ for i in cutlass.range_constexpr(1, cutlass.const_expr(tiler_mn[0])):
+ tXrdW_other = cute.make_fragment_like(tXrdW)
+ tXsdW_other = cute.make_tensor(tXsdW.iterator + i * sdW.stride[0], tXsdW.layout)
+ cute.autovec_copy(tXsdW_other, tXrdW_other)
+ tXrdW.store(tXrdW.load() + tXrdW_other.load())
+ cute.copy(copy_atom_store_dW, tdWrdW, tdWgdW, pred=tdWpdW)
+ else:
+ cute.copy(copy_atom_store_dW, tdWrdW, tdWgdW, pred=tdWpdW)
 
 
  def _rmsnorm_backward(
@@ -578,8 +642,19 @@ def _rmsnorm_backward(
 
  device = x.device
 
- sm_count = torch.cuda.get_device_properties(device).multi_processor_count * 8
- dw_partial = torch.zeros((sm_count, N), device=device, dtype=weight.dtype)
+ # This should be tuned on how many CTAs can be launched on each SM
+ sm_count_multiple = (
+ 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
+ )
+ sm_count = torch.cuda.get_device_properties(device).multi_processor_count
+ # By right, if we're using cluster, this should be cluster_count not sm_count.
+ # But for cluster >= 4, due to quantization we would need to query active max cluster.
+ # Instead we just do sm_count * 2, which is reasonably larger than active_cluster_count to
+ # avoid wave quantization.
+ sm_count = (
+ sm_count * sm_count_multiple if N <= 8192 else sm_count // 2 if N <= 16384 else sm_count * 2
+ )
+ dw_partial = torch.empty(sm_count, N, device=device, dtype=weight.dtype)
 
  dtype = torch2cute_dtype_map[x.dtype]
 
@@ -622,6 +697,7 @@ def _rmsnorm_backward(
  rstd_tensor,
  dx_tensor,
  dw_partial_tensor,
+ sm_count,
  current_stream,
  )
 
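The `_rmsnorm_backward` host changes above replace the fixed `sm_count * 8` grid with an N-dependent multiple of the SM count, scaling the number of persistent CTAs with how many are expected to fit per SM and over-subscribing slightly for clustered sizes to avoid wave quantization (per the comments in the hunk). Restated as a standalone helper so the schedule is easy to inspect (thresholds copied from the diff; the function name and the 132-SM example are ours, not part of quack):

```python
# Standalone restatement of the grid-size heuristic in _rmsnorm_backward.
def rmsnorm_bwd_grid_size(sm_count: int, N: int) -> int:
    multiple = 16 if N <= 256 else (8 if N <= 1024 else (4 if N <= 2048 else (2 if N <= 4096 else 1)))
    if N <= 8192:
        return sm_count * multiple
    return sm_count // 2 if N <= 16384 else sm_count * 2

for n in (256, 2048, 8192, 16384, 65536):
    print(n, rmsnorm_bwd_grid_size(132, n))   # e.g. 256 -> 2112, 65536 -> 264
```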
quack/softmax.py CHANGED
@@ -98,7 +98,10 @@ class Softmax(ReductionBase):
  shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
+ # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
+ mX, mO = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
+ gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
 
  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -130,9 +133,7 @@ class Softmax(ReductionBase):
 
  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
  tXpX = (
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
- if cutlass.const_expr(not is_even_N)
- else None
+ utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
  )
  if tXcX[0][0] < shape[0]:
  cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
@@ -312,9 +313,11 @@ class SoftmaxBackward(ReductionBase):
  shape = mdY.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- gdY, gY, gdX, cX = [
- cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mdY, mY, mdX, idX)
+ mdY, mY, mdX = [
+ utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mdY, mY, mdX)
  ]
+ gdY, gY, gdX = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mdY, mY, mdX)]
+ cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
 
  smem = cutlass.utils.SmemAllocator()
  sdY = smem.allocate_tensor(
quack/utils.py CHANGED
@@ -120,12 +120,20 @@ def cluster_reduce(
  reduction_buffer: cute.Tensor,
  mbar_ptr: cute.Pointer,
  init_val: cute.Numeric = 0.0,
+ phase: Optional[cutlass.Int32] = None,
  ) -> cute.Numeric:
  """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
  cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
  lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
- warps_per_row, cluster_n = reduction_buffer.shape[1]
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
  row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
+ if warp_idx == 0:
+ with cute.arch.elect_one():
+ num_warps = rows_per_block * warps_per_row
+ cute.arch.mbarrier_arrive_and_expect_tx(
+ mbar_ptr,
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
+ )
  if lane_idx < cluster_n:
  store_shared_remote(
  val,
@@ -133,7 +141,7 @@
  mbar_ptr,
  peer_cta_rank_in_cluster=lane_idx,
  )
- cute.arch.mbarrier_wait(mbar_ptr, phase=0)
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
  block_reduce_val = init_val
  num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
  for i in cutlass.range_constexpr(num_iter):
@@ -149,13 +157,14 @@ def block_or_cluster_reduce(
  op: Callable,
  reduction_buffer: cute.Tensor,
  mbar_ptr: Optional[cute.Pointer],
+ phase: Optional[cutlass.Int32] = None,
  init_val: cute.Numeric = 0.0,
  ) -> cute.Numeric:
  """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
  if cutlass.const_expr(mbar_ptr is None):
  return block_reduce(val, op, reduction_buffer, init_val=init_val)
  else:
- return cluster_reduce(val, op, reduction_buffer, mbar_ptr, init_val=init_val)
+ return cluster_reduce(val, op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val)
 
 
  @cute.jit
@@ -165,6 +174,7 @@ def row_reduce(
  threads_per_row: cutlass.Constexpr[int],
  reduction_buffer: Optional[cute.Tensor] = None,
  mbar_ptr: Optional[cute.Pointer] = None,
+ phase: Optional[cutlass.Int32] = None,
  init_val: cute.Numeric = 0.0,
  hook_fn: Optional[Callable] = None,
  ) -> cute.Numeric:
@@ -193,7 +203,7 @@
  ), "mbar_ptr must be provided for cluster reduction"
  if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
  val = block_or_cluster_reduce(
- val, warp_op, reduction_buffer, mbar_ptr, init_val=init_val
+ val, warp_op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val
  )
  return val
 
@@ -205,6 +215,7 @@ def online_softmax_reduce(
  reduction_buffer: Optional[cute.Tensor] = None,
  mbar_ptr: Optional[cute.Pointer] = None,
  hook_fn: Optional[Callable] = None,
+ phase: Optional[cutlass.Int32] = None,
  return_exp_x: bool = False,
  ) -> [Float32, Float32, Optional[cute.TensorSSA]]:
  assert x.dtype == Float32, "x must be of type Float32"
@@ -225,7 +236,7 @@
  if cutlass.const_expr(hook_fn is not None):
  hook_fn()
  if cutlass.const_expr(reduction_buffer is not None):
- warps_per_row, cluster_n = reduction_buffer.shape[1]
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
  assert (
  cluster_n == 1 or mbar_ptr is not None
  ), "mbar_ptr must be provided for cluster reduction"
@@ -251,6 +262,13 @@
  max_x = max_x_final
  else:
  cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
+ if warp_idx == 0:
+ with cute.arch.elect_one():
+ num_warps = rows_per_block * warps_per_row
+ cute.arch.mbarrier_arrive_and_expect_tx(
+ mbar_ptr,
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
+ )
  if lane_idx < cluster_n:
  store_shared_remote(
  f32x2_to_i64(max_x, sum_exp_x),
@@ -258,7 +276,7 @@
  mbar_ptr,
  peer_cta_rank_in_cluster=lane_idx,
  )
- cute.arch.mbarrier_wait(mbar_ptr, phase=0)
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
  num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
  max_x_single_warp = cute.make_fragment(num_iter, Float32)
  max_x_single_warp.fill(-Float32.inf)
@@ -351,7 +369,7 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
 
 
  @cute.jit
- def fill_oob(tXsX: cute.Tensor, tXpX: cute.Tensor, fill_value: cute.Numeric) -> None:
+ def fill_oob(tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cute.Numeric) -> None:
  """Fill out-of-bounds values in shared memory tensor.
 
  Args:
@@ -361,9 +379,12 @@ def fill_oob(tXsX: cute.Tensor, tXpX: cute.Tensor, fill_value: cute.Numeric) ->
  """
  tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), 0, 0])
  tXrX_fill.fill(fill_value)
- for rest_v in cutlass.range_constexpr(tXpX.shape[0]):
- for rest_k in cutlass.range_constexpr(tXpX.shape[2]):
- if not tXpX[rest_v, 0, rest_k]:
+ for rest_v in cutlass.range_constexpr(tXsX.shape[0][1]):
+ for rest_k in cutlass.range_constexpr(tXsX.shape[2]):
+ if cutlass.const_expr(tXpX is not None):
+ if not tXpX[rest_v, 0, rest_k]:
+ cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
+ else:
  cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
 
 
@@ -390,3 +411,38 @@ def i64_to_f32x2(c: cutlass.Int64, *, loc=None, ip=None) -> Tuple[Float32, Float
  vector.extract(vec_f32x2, dynamic_position=[], static_position=[1], loc=loc, ip=ip)
  )
  return res0, res1
+
+
+ @dsl_user_op
+ def domain_offset_i64(coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
+ flat_coord_i64 = tuple(cutlass.Int64(c) for c in cute.flatten(coord))
+ flat_stride = cute.flatten_to_tuple(tensor.stride)
+ assert len(flat_coord_i64) == len(
+ flat_stride
+ ), "Coordinate and stride must have the same length"
+ offset = sum(c * s for c, s in zip(flat_coord_i64, flat_stride))
+ assert isinstance(tensor.iterator, cute.Pointer)
+ # HACK: we assume that applying the offset does not change the pointer alignment
+ new_ptr = cute.make_ptr(
+ tensor.element_type,
+ tensor.iterator.toint() + offset * tensor.element_type.width // 8,
+ tensor.memspace,
+ assumed_align=tensor.iterator.max_alignment,
+ )
+ return cute.make_tensor(new_ptr, tensor.layout)
+
+
+ @dsl_user_op
+ def coord_offset_i64(
+ idx: cute.typing.Int, tensor: cute.Tensor, dim: int, *, loc=None, ip=None
+ ) -> cute.Tensor:
+ offset = cutlass.Int64(idx) * cute.size(tensor.stride[dim])
+ assert isinstance(tensor.iterator, cute.Pointer)
+ # HACK: we assume that applying the offset does not change the pointer alignment
+ new_ptr = cute.make_ptr(
+ tensor.element_type,
+ tensor.iterator.toint() + offset * tensor.element_type.width // 8,
+ tensor.memspace,
+ assumed_align=tensor.iterator.max_alignment,
+ )
+ return cute.make_tensor(new_ptr, tensor.layout)
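Both new helpers rebase the tensor's pointer by an element offset computed in Int64 and scaled to bytes by the element width, leaving the layout untouched. A host-side model of the arithmetic (pure Python stand-in; the real helpers build a `cute.Pointer`, and the shapes below are made up):

```python
# What domain_offset_i64 computes, modeled on the host: dot(coord, stride) in
# 64-bit, then a byte offset for the new base pointer.
def byte_offset_i64(coord, stride, elem_bits):
    assert len(coord) == len(stride), "coordinate and stride must have the same length"
    elem_offset = sum(int(c) * int(s) for c, s in zip(coord, stride))  # Python ints never wrap
    return elem_offset * elem_bits // 8

# Row-major (M, N) bf16 tensor: advance the base by bidx * tile_m rows.
tile_m, bidx, N = 4, 600_000, 262_144
print(byte_offset_i64((bidx * tile_m, 0), (N, 1), 16))   # 1_258_291_200_000 bytes
```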
quack_kernels-0.1.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.1.5
+ Version: 0.1.7
  Requires-Python: >=3.9
  License-File: LICENSE
  Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0
quack_kernels-0.1.7.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ quack/__init__.py,sha256=R9cZd_vslI5oZjjS-ojfWAd9tCZAqsLUiFVqEbUaGnw,203
+ quack/cross_entropy.py,sha256=bg66wECki5I71SMPIRUa-6-oFJ93aIKpK1jqT__SCBM,19775
+ quack/layernorm.py,sha256=1WUspbr6ktPZ25O00kKs-FK_lm_Fejat72BMV8tBSfw,13504
+ quack/reduction_base.py,sha256=4nAzkZR1yoQVA4Lc-GpU0XMjS5ARAmvYdeE0Doy7UCU,3789
+ quack/rmsnorm.py,sha256=3jiwWhVmaG0n5vuUnGGrpg3StAB4lnzziNF97QVMLGQ,28870
+ quack/softmax.py,sha256=3-5P_ORBrfQ6JYTIzgDs9jwmV7Za73SogaX7q9M7GCM,16698
+ quack/utils.py,sha256=aiyzBc9BEwq8s965elfiR331hAaLLBKL9kDHjuls86Q,17791
+ quack_kernels-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ quack_kernels-0.1.7.dist-info/METADATA,sha256=9RlqUmX3-7BI2aZk88r84B8o2FzZkQgkfV1UxwN8GlE,289
+ quack_kernels-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ quack_kernels-0.1.7.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+ quack_kernels-0.1.7.dist-info/RECORD,,
quack_kernels-0.1.5.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
- quack/__init__.py,sha256=GPoImcynY5-OkMep5RhQhXrnZyxgqZG3RoHhsYQFSL4,203
- quack/cross_entropy.py,sha256=WkngPY8uk4RCjCFtHtB7h9GF_8xt4NnyvDzvw73gIL4,19320
- quack/reduction_base.py,sha256=fFuGXPR3lDq2yw_m86ujmkni6R51jzNAzy_r9R6C8tA,3563
- quack/rmsnorm.py,sha256=N9NavrR85ws4cZgkfpeRLjYkVSq2yfyzJQWvfKf98pY,23935
- quack/softmax.py,sha256=VfhlC2huRuv7olFSVFgS8LF1yF8TFV64yjjjQxYX9yk,16364
- quack/utils.py,sha256=6EyWgf0z3wcbhGUivHmWB8hVBnEzMyOhmAuZ2Te82k0,15226
- quack_kernels-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- quack_kernels-0.1.5.dist-info/METADATA,sha256=WI-2CP1mRH05V9Fjdx7HsErNOkrc6fUhheoH4ynlo-U,289
- quack_kernels-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- quack_kernels-0.1.5.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
- quack_kernels-0.1.5.dist-info/RECORD,,