quack-kernels 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/layout_utils.py ADDED
@@ -0,0 +1,287 @@
+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
+
+import cutlass
+import cutlass.cute as cute
+
+from cutlass import Int32, const_expr
+
+from quack.utils import prmt
+
+
+def transpose_view(a: cute.Tensor) -> cute.Tensor:
+    """Transpose the first two dimensions of a tensor on smem."""
+    shape = (a.shape[1], a.shape[0], *a.shape[2:])
+    order = (1, 0, *range(2, cute.rank(a)))
+    return cute.composition(a, cute.make_ordered_layout(shape, order=order))
+
+
+def select(a: cute.Tensor, mode: list[int]) -> cute.Tensor:
+    return cute.make_tensor(a.iterator, cute.select(a.layout, mode))
+
+
+def expand(a: cute.Tensor, dim: int, size: Int32 | int) -> cute.Tensor:
+    shape = (*a.shape[:dim], size, *a.shape[dim:])
+    stride = (*a.layout.stride[:dim], 0, *a.layout.stride[dim:])
+    return cute.make_tensor(a.iterator, cute.make_layout(shape, stride=stride))
+
+
+@cute.jit
+def permute_gated_Cregs_b16(t: cute.Tensor) -> None:
+    assert t.element_type.width == 16
+    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b16 permutation"
+    t_u32 = cute.recast_tensor(t, Int32)
+
+    quad_idx = cute.arch.lane_idx() % 4
+    lane_03 = quad_idx == 0 or quad_idx == 3
+    selector_upper = Int32(0x5410) if lane_03 else Int32(0x1054)
+    selector_lower = Int32(0x7632) if lane_03 else Int32(0x3276)
+    # upper_map = [0, 3, 1, 2]
+    # lower_map = [1, 2, 0, 3]
+    # upper_idx = upper_map[quad_idx]
+    # indexing isn't supported so we have to do arithmetic
+    upper_idx = quad_idx // 2 if quad_idx % 2 == 0 else 3 - quad_idx // 2
+    lower_idx = upper_idx ^ 1
+
+    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
+    width = 4
+    mask = cute.arch.WARP_SIZE - width
+    clamp = cute.arch.WARP_SIZE - 1
+    mask_and_clamp = mask << 8 | clamp
+
+    for i in cutlass.range(cute.size(t_u32.shape) // 2, unroll_full=True):
+        upper, lower = t_u32[i * 2 + 0], t_u32[i * 2 + 1]
+        upper0 = upper if lane_03 else lower
+        lower0 = lower if lane_03 else upper
+        upper0 = cute.arch.shuffle_sync(upper0, offset=upper_idx, mask_and_clamp=mask_and_clamp)
+        lower0 = cute.arch.shuffle_sync(lower0, offset=lower_idx, mask_and_clamp=mask_and_clamp)
+        t_u32[i * 2 + 0] = prmt(upper0, lower0, selector_upper)
+        t_u32[i * 2 + 1] = prmt(upper0, lower0, selector_lower)
+
+
+@cute.jit
+def permute_Cregs_b32_for_stsm(t: cute.Tensor) -> None:
+    """Permute and shuffle within 4 threads to change the layout from
+    T0 | T1 | T2 | T3
+    a b | c d | e f | g h
+    to
+    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
+    a | b | c | d | e | f | g | h
+    This is so that we can use STSM (instead of STS.64) to store C registers without bank conflict.
+    """
+
+    assert t.element_type.width == 32
+    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"
+
+    quad_idx = cute.arch.lane_idx() % 4
+    # left_map = [0, 2, 1, 3]
+    # right_map = [2, 0, 3, 1]
+    # indexing isn't supported so we have to do arithmetic
+    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
+    right_idx = left_idx ^ 0b10
+
+    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
+    width = 4
+    mask = cute.arch.WARP_SIZE - width
+    clamp = cute.arch.WARP_SIZE - 1
+    mask_and_clamp = mask << 8 | clamp
+
+    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
+        for r in cutlass.range(2, unroll_full=True):
+            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
+            # a b | c d | e f | g h -> a b | c d | f e | h g
+            left0 = left if quad_idx < 2 else right
+            right0 = right if quad_idx < 2 else left
+            # a b | c d | f e | h g -> a b | f d | c e | h g
+            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
+            # a b | f d | c e | h g -> a e | f b | c g | h d
+            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
+            # a e | f b | c g | h d -> a e | b f | c g | d h
+            t[i * 4 + r * 2 + 0] = left0 if quad_idx % 2 == 0 else right0
+            t[i * 4 + r * 2 + 1] = right0 if quad_idx % 2 == 0 else left0
+        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
+
+
+@cute.jit
+def permute_Cregs_b32_for_ldsm(t: cute.Tensor) -> None:
+    """Permute and shuffle within 4 threads to change the layout from
+    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
+    a | b | c | d | e | f | g | h
+    to
+    T0 | T1 | T2 | T3
+    a b | c d | e f | g h
+    This is so that we can use LDSM (instead of LDS.64) to store C registers without bank conflict.
+    """
+
+    assert t.element_type.width == 32
+    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"
+
+    quad_idx = cute.arch.lane_idx() % 4
+    # left_map = [0, 2, 1, 3]
+    # right_map = [1, 3, 0, 2]
+    # indexing isn't supported so we have to do arithmetic
+    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
+    right_idx = left_idx ^ 0b01
+
+    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
+    width = 4
+    mask = cute.arch.WARP_SIZE - width
+    clamp = cute.arch.WARP_SIZE - 1
+    mask_and_clamp = mask << 8 | clamp
+
+    # This is just the inverse of permute_Cregs_b32_for_stsm
+    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
+        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
+        for r in cutlass.range(2, unroll_full=True):
+            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
+            # a e | b f | c g | d h -> a e | f b | c g | h d
+            left0 = left if quad_idx % 2 == 0 else right
+            right0 = right if quad_idx % 2 == 0 else left
+            # a e | f b | c g | h d -> a b | f d | c e | h g
+            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
+            # a b | f d | c e | h g -> a b | c d | f e | h g
+            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
+            # a b | c d | f e | h g -> a b | c d | e f | g h
+            t[i * 4 + r * 2 + 0] = left0 if quad_idx < 2 else right0
+            t[i * 4 + r * 2 + 1] = right0 if quad_idx < 2 else left0
+
+
+@cute.jit
+def concat_layout(*layouts: cute.Layout) -> cute.Layout:
+    return cute.make_layout(
+        tuple(l.shape for l in layouts),
+        stride=tuple(l.stride for l in layouts),
+    )
+
+
+def convert_layout_acc_mn(acc_layout: cute.Layout) -> cute.Layout:
+    """
+    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
+    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
+    """
+    acc_layout_col_major = cute.make_layout(acc_layout.shape)
+    acc_layout_mn = cute.make_layout(
+        (
+            (acc_layout_col_major.shape[0][1], acc_layout_col_major.shape[1]),  # MMA_M
+            (
+                acc_layout_col_major.shape[0][0],
+                *acc_layout_col_major.shape[0][2:],
+                acc_layout_col_major.shape[2],
+            ),  # MMA_N
+            *acc_layout_col_major.shape[3:],
+        ),
+        stride=(
+            (acc_layout_col_major.stride[0][1], acc_layout_col_major.stride[1]),  # MMA_M
+            (
+                acc_layout_col_major.stride[0][0],
+                *acc_layout_col_major.stride[0][2:],
+                acc_layout_col_major.stride[2],
+            ),  # MMA_N
+            *acc_layout_col_major.stride[3:],
+        ),
+    )
+    return cute.composition(acc_layout, acc_layout_mn)
+
+
+def make_acc_tensor_mn_view(acc: cute.Tensor) -> cute.Tensor:
+    return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout))
+
+
+@cute.jit
+def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
+    # For back to back gemm, convert layout of acc0 to gemm 1 accept layout.
+    # For Sm80, as the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+    # For Sm90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N))
+    # TODO: Sm90 FP8
+    if const_expr(cute.rank(acc_layout.shape[0]) == 3):  # Sm90
+        l = cute.logical_divide(
+            acc_layout, ((None, None, 2), None, None)
+        )  # ((2, 2, (2, N / 16)), MMA_M, MMA_N)
+        rA_mma_view = cute.make_layout(
+            (
+                (l.shape[0][0], l.shape[0][1], l.shape[0][2][0]),
+                l.shape[1],
+                (l.shape[0][2][1], l.shape[2]),
+            ),
+            stride=(
+                (l.stride[0][0], l.stride[0][1], l.stride[0][2][0]),
+                l.stride[1],
+                (l.stride[0][2][1], l.stride[2]),
+            ),
+        )
+    else:  # Sm80
+        # (4, MMA_M, MMA_N) -> (4, MMA_M, (2, MMA_N / 2))
+        l = cute.logical_divide(acc_layout, (None, None, 2))
+        rA_mma_view = cute.make_layout(
+            (
+                (l.shape[0], l.shape[2][0]),
+                l.shape[1],
+                l.shape[2][1],
+            ),
+            stride=(
+                (l.stride[0], l.stride[2][0]),
+                l.stride[1],
+                l.stride[2][1],
+            ),
+        )
+    return rA_mma_view
+
+
+def convert_layout_zero_stride(
+    input: cute.Tensor | cute.Layout, ref_layout: cute.Layout
+) -> cute.Layout:
+    layout = input.layout if const_expr(isinstance(input, cute.Tensor)) else input
+    # Group the modes with non-zero stride in the ref_layout together,
+    # and the modes with zero stride together
+    layout_flat = cute.flatten(layout)
+    ref_layout_flat = cute.flatten(ref_layout)
+    nonzero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride != 0]
+    zero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride == 0]
+    # There's an edge case when all modes are zero stride
+    new_shape = (
+        tuple(layout_flat[i].shape for i in nonzero_modes) if len(nonzero_modes) > 0 else (1,),
+        tuple(layout_flat[i].shape for i in zero_modes),
+    )
+    new_stride = (
+        tuple(layout_flat[i].stride for i in nonzero_modes) if len(nonzero_modes) > 0 else (0,),
+        tuple(layout_flat[i].stride for i in zero_modes),
+    )
+    out_layout = cute.make_layout(new_shape, stride=new_stride)
+    if const_expr(isinstance(input, cute.Tensor)):
+        return cute.make_tensor(input.iterator, out_layout)
+    else:
+        return out_layout
+
+
+def mma_partition_C_vec(
+    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
+) -> cute.Tensor:
+    assert cute.rank(sVec) == 2
+    assert sVec.stride[0] == 1
+    stage = sVec.shape[1]
+    shape = (
+        (sVec.shape[0], expand_shape, stage)
+        if const_expr(is_colvec)
+        else (expand_shape, sVec.shape[0], stage)
+    )
+    stride = (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
+    sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
+    tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_C(sVec_mma))
+    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
+
+
+def mma_partition_A_vec(
+    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
+) -> cute.Tensor:
+    assert cute.rank(sVec) == 2
+    assert sVec.stride[0] == 1
+    stage = sVec.shape[1]
+    shape = (
+        (sVec.shape[0], expand_shape, stage)
+        if const_expr(is_colvec)
+        else (expand_shape, sVec.shape[0], stage)
+    )
+    stride = (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
+    sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
+    tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
+    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
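
The permute_Cregs_b32_for_stsm/ldsm pair above documents the net data movement within a quad of threads, but the shuffle-based implementation is hard to follow on its own. As a minimal pure-Python sketch of that movement for one group of eight b32 values (hypothetical helper names, not part of the package, and not modeling the shuffle_sync/register mechanics):

    def stsm_permute_reference(quad_regs):
        # Before: thread k of the quad holds the pair (v[2k], v[2k+1]);
        # after: it holds (v[k], v[k+4]), the STSM-friendly layout from the docstring.
        flat = [v for pair in quad_regs for v in pair]     # a b c d e f g h
        return [[flat[k], flat[k + 4]] for k in range(4)]  # [a e] [b f] [c g] [d h]

    def ldsm_permute_reference(quad_regs):
        # Inverse movement, matching permute_Cregs_b32_for_ldsm.
        flat = [v for pair in quad_regs for v in pair]     # a e b f c g d h
        order = flat[0::2] + flat[1::2]                    # a b c d e f g h
        return [[order[2 * k], order[2 * k + 1]] for k in range(4)]

    before = [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"]]
    assert ldsm_permute_reference(stsm_permute_reference(before)) == before

The kernels above realize the same movement in registers via shuffle_sync exchanges within each quad plus the swap of registers 1 and 2 in every 4-register group.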
quack/linear.py CHANGED
@@ -61,10 +61,11 @@ class LinearFunc(torch.autograd.Function):
     # Use classmethod instead of staticmethod to allow inheritance
     @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(cls, ctx, x, weight, fuse_grad_accum=False):
+    def forward(cls, ctx, x, weight, bias=None, fuse_grad_accum=False):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
+        bias: (out_features,) or None
         out: (..., out_features)
         """
         ctx.weight_dtype = weight.dtype
@@ -74,8 +75,9 @@ class LinearFunc(torch.autograd.Function):
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
         # out = F.linear(x, weight)
-        out = cls.matmul_fwd_fn(x, weight.T)
+        out = cls.matmul_fwd_fn(x, weight.T, bias=bias)
         linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
+        ctx.bias_dtype = bias.dtype if bias is not None else None
         return out.reshape(*batch_shape, out.shape[-1])
 
     @classmethod
@@ -87,13 +89,18 @@ class LinearFunc(torch.autograd.Function):
         x, weight, weight_og = ctx.saved_tensors  # weight_og is None if not ctx.fuse_grad_accum
         batch_shape = dout.shape[:-1]
         dout = dout.reshape(-1, dout.shape[-1])
+        dbias = (
+            dout.sum(0, dtype=ctx.bias_dtype)
+            if ctx.bias_dtype is not None and ctx.needs_input_grad[2]
+            else None
+        )
         dx = linear_bwd_compute_input_grad(ctx, dout, weight, cls.matmul_bwd_dx)
         dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
         dweight = linear_bwd_compute_weight_grad(
             ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
         )
         # return extra Nones for other classes that inherit from LinearFunc
-        return dx, dweight, *([None] * 10)
+        return dx, dweight, dbias, *([None] * 10)
 
 
 class LinearUntunedFunc(LinearFunc):
@@ -104,9 +111,9 @@ class LinearUntunedFunc(LinearFunc):
     matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
 
 
-def linear_func(x, weight, fuse_grad_accum=False, tuned=True):
+def linear_func(x, weight, bias=None, fuse_grad_accum=False, tuned=True):
     fn_cls = LinearFunc if tuned else LinearUntunedFunc
-    return fn_cls.apply(x, weight, fuse_grad_accum)
+    return fn_cls.apply(x, weight, bias, fuse_grad_accum)
 
 
 class LinearActFunc(LinearFunc):
@@ -115,10 +122,13 @@ class LinearActFunc(LinearFunc):
     # Use classmethod instead of staticmethod to allow inheritance
     @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(cls, ctx, x, weight, activation, store_preact=True, fuse_grad_accum=False):
+    def forward(
+        cls, ctx, x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False
+    ):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
+        bias: (out_features,) or None
         out: (..., out_features)
         Return both out and post-activation, but only out is differentiable.
         """
@@ -129,11 +139,12 @@ class LinearActFunc(LinearFunc):
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
         out, postact = cls.matmul_fwd_fn(
-            x, weight.T, activation=activation, store_preact=store_preact
+            x, weight.T, bias=bias, activation=activation, store_preact=store_preact
        )
         linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
         if out is not None:
             out = out.reshape(*batch_shape, out.shape[-1])
+        ctx.bias_dtype = bias.dtype if bias is not None else None
         ctx.mark_non_differentiable(postact)
         ctx.set_materialize_grads(False)  # We don't want to materialize grads for postact
         return out, postact.reshape(*batch_shape, postact.shape[-1])
@@ -147,9 +158,11 @@ class LinearActUntunedFunc(LinearActFunc):
     matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
 
 
-def linear_act_func(x, weight, activation, store_preact=True, fuse_grad_accum=False, tuned=True):
+def linear_act_func(
+    x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False, tuned=True
+):
     fn_cls = LinearActFunc if tuned else LinearActUntunedFunc
-    return fn_cls.apply(x, weight, activation, store_preact, fuse_grad_accum)
+    return fn_cls.apply(x, weight, activation, bias, store_preact, fuse_grad_accum)
 
 
 class DActLinearFunc(LinearFunc):
@@ -229,12 +242,7 @@ class Linear(nn.Linear):
         self.fuse_grad_accum = fuse_grad_accum
 
     def forward(self, input: Tensor) -> Tensor:
-        if (
-            self.bias is None
-            and input.is_cuda
-            and self.in_features % 8 == 0
-            and self.out_features % 8 == 0
-        ):
-            return linear_func(input, self.weight, fuse_grad_accum=self.fuse_grad_accum)
+        if input.is_cuda and self.in_features % 8 == 0 and self.out_features % 8 == 0:
+            return linear_func(input, self.weight, self.bias, fuse_grad_accum=self.fuse_grad_accum)
         else:
             return F.linear(input, self.weight, self.bias)
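
The change above threads an optional bias through linear_func and Linear.forward instead of falling back to F.linear whenever a bias is present. A hedged usage sketch of the new signature (the shapes, dtype, and CUDA placement are illustrative assumptions; the divisible-by-8 feature sizes mirror the guard in Linear.forward shown above):

    import torch
    from quack.linear import linear_func

    x = torch.randn(8, 512, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    weight = torch.randn(2048, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    bias = torch.randn(2048, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    out = linear_func(x, weight, bias)   # (8, 512, 2048); bias is passed straight to matmul_fwd_fn
    out.sum().backward()
    # The backward pass now also produces dbias = dout.sum(0) in the bias dtype.
    print(bias.grad.shape)               # torch.Size([2048])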
quack/pipeline.py CHANGED
@@ -4,9 +4,11 @@ from typing import Optional
 from dataclasses import dataclass
 
 import cutlass.cute as cute
-from cutlass.cutlass_dsl import Boolean, Int32, if_generate
-from cutlass.pipeline import CooperativeGroup, PipelineOp, pipeline_init_wait
+from cutlass import Boolean, Int32, const_expr
+from cutlass.cutlass_dsl import if_generate, and_
+from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
 from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
+from cutlass.pipeline import PipelineTmaUmma
 
 
 class PipelineStateWAdvance(PipelineState):
@@ -144,7 +146,160 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
         )
 
-    def producer_commit(self, state: PipelineState):
+    def producer_cpasync_commit(self, state: PipelineState):
+        """
+        We need the mbarrier to track the completion of cp.async
+        """
+        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+
+
+class MbarrierArrayWDropCount(MbarrierArray):
+    def __init__(
+        self,
+        barrier_storage: cute.Pointer,
+        num_stages: int,
+        agent: tuple[PipelineOp, CooperativeGroup],
+        tx_count: int = 0,
+        drop_count: Optional[Int32] = None,
+    ) -> None:
+        self.barrier_storage = barrier_storage
+        self.tx_count = tx_count
+        self.num_stages = num_stages
+        self.op_type, self.cg = agent
+        self.arrive_count = self.cg.size
+        self.drop_count = drop_count
+
+        if self.num_stages <= 0:
+            raise ValueError("Error: Mbarrier stage count must be greater than 0.")
+        if self.arrive_count <= 0:
+            raise ValueError("Error: Mbarrier arrive count must be greater than 0.")
+        if self.op_type is PipelineOp.TmaLoad and self.tx_count < 0:
+            raise ValueError("Error: Mbarrier tx count must not be less than 0 for TMA ops.")
+
+        if const_expr(drop_count is not None):
+            self.arrive_count = self.arrive_count - drop_count
+
+        # Store mbarrier base pointer
+        self.mbarrier_base = self.barrier_storage
+
+        # Mbarrier initialization in constructor
+        self.mbarrier_init()
+
+    def __extract_mlir_values__(self):
+        return [self.barrier_storage, self.drop_count]
+
+    def __new_from_mlir_values__(self, values):
+        return MbarrierArrayWDropCount(
+            values[0], self.num_stages, (self.op_type, self.cg), self.tx_count, values[1]
+        )
+
+
+@dataclass(frozen=True)
+class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
+    """
+    PipelineTmaCpAsync is used for CpAsync + TMA producers and UMMA consumers
+    (e.g. Blackwell mainloops)
+    """
+
+    @staticmethod
+    def create(
+        *,
+        num_stages: int,
+        producer_group: CooperativeGroup,
+        consumer_group: CooperativeGroup,
+        tx_count: int,
+        barrier_storage: cute.Pointer = None,
+        cta_layout_vmnk: Optional[cute.Layout] = None,
+        producer_drop_count: Optional[Int32] = None,
+    ):
+        """
+        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
+        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer
+        :param num_stages: Number of buffer stages for this pipeline
+        :type num_stages: Int32
+        :param producer_group: `CooperativeGroup` for the producer agent
+        :type producer_group: CooperativeGroup
+        :param consumer_group: `CooperativeGroup` for the consumer agent
+        :type consumer_group: CooperativeGroup
+        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
+        :type tx_count: int
+        :param cta_layout_vmnk: Layout of the cluster shape
+        :type cta_layout_vmnk: cute.Layout | None
+        """
+        if not isinstance(barrier_storage, cute.Pointer):
+            raise ValueError(
+                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
+            )
+
+        producer_type = PipelineOp.TmaLoad
+        consumer_type = PipelineOp.TCGen05Mma
+
+        producer = (producer_type, producer_group)
+        consumer = (consumer_type, consumer_group)
+
+        sync_object_full = MbarrierArrayWDropCount(
+            barrier_storage.align(min_align=8),
+            num_stages,
+            producer,
+            tx_count,
+            drop_count=producer_drop_count,
+        )
+        sync_object_empty = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
+        )
+
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+            # No mcast mask if not using clusters
+            producer_mask = None
+            # All threadblocks are leaders if not using clusters
+            is_leader_cta = True
+        else:
+            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk)
+            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)
+
+        cta_group = (
+            cute.nvgpu.tcgen05.CtaGroup.ONE
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
+            else cute.nvgpu.tcgen05.CtaGroup.TWO
+        )
+
+        consumer_mask = producer_mask
+
+        pipeline_init_wait(cta_layout_vmnk)
+
+        return PipelineTmaCpAsyncUmma(
+            sync_object_full,
+            sync_object_empty,
+            num_stages,
+            producer_mask,
+            consumer_mask,
+            is_leader_cta,
+            cta_group,
+        )
+
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        is_tma_warp: Optional[Boolean] = True,
+    ):
+        """
+        TMA producer commit conditionally waits on buffer empty and sets the
+        transaction barrier for leader threadblocks.
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase),
+        )
+        # This is the difference between this and PipelineTmaAsync: we could have multiple
+        # warps calling this, but only 1 warp should do the arrive on the full barrier
+        if_generate(
+            and_(self.is_leader_cta, is_tma_warp),
+            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+        )
+
+    def producer_cpasync_commit(self, state: PipelineState):
         """
         We need the mbarrier to track the completion of cp.async
         """