quack-kernels 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/linear.py CHANGED
@@ -61,10 +61,11 @@ class LinearFunc(torch.autograd.Function):
     # Use classmethod instead of staticmethod to allow inheritance
     @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(cls, ctx, x, weight, fuse_grad_accum=False):
+    def forward(cls, ctx, x, weight, bias=None, fuse_grad_accum=False):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
+        bias: (out_features,) or None
         out: (..., out_features)
         """
         ctx.weight_dtype = weight.dtype
@@ -74,8 +75,9 @@ class LinearFunc(torch.autograd.Function):
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
         # out = F.linear(x, weight)
-        out = cls.matmul_fwd_fn(x, weight.T)
+        out = cls.matmul_fwd_fn(x, weight.T, bias=bias)
         linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
+        ctx.bias_dtype = bias.dtype if bias is not None else None
         return out.reshape(*batch_shape, out.shape[-1])

     @classmethod
@@ -87,13 +89,18 @@ class LinearFunc(torch.autograd.Function):
         x, weight, weight_og = ctx.saved_tensors  # weight_og is None if not ctx.fuse_grad_accum
         batch_shape = dout.shape[:-1]
         dout = dout.reshape(-1, dout.shape[-1])
+        dbias = (
+            dout.sum(0, dtype=ctx.bias_dtype)
+            if ctx.bias_dtype is not None and ctx.needs_input_grad[2]
+            else None
+        )
         dx = linear_bwd_compute_input_grad(ctx, dout, weight, cls.matmul_bwd_dx)
         dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
         dweight = linear_bwd_compute_weight_grad(
             ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
         )
         # return extra Nones for other classes that inherit from LinearFunc
-        return dx, dweight, *([None] * 10)
+        return dx, dweight, dbias, *([None] * 10)


 class LinearUntunedFunc(LinearFunc):
@@ -104,9 +111,9 @@ class LinearUntunedFunc(LinearFunc):
     matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


-def linear_func(x, weight, fuse_grad_accum=False, tuned=True):
+def linear_func(x, weight, bias=None, fuse_grad_accum=False, tuned=True):
     fn_cls = LinearFunc if tuned else LinearUntunedFunc
-    return fn_cls.apply(x, weight, fuse_grad_accum)
+    return fn_cls.apply(x, weight, bias, fuse_grad_accum)


 class LinearActFunc(LinearFunc):
@@ -115,10 +122,13 @@ class LinearActFunc(LinearFunc):
     # Use classmethod instead of staticmethod to allow inheritance
     @classmethod
     @custom_fwd(device_type="cuda")
-    def forward(cls, ctx, x, weight, activation, store_preact=True, fuse_grad_accum=False):
+    def forward(
+        cls, ctx, x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False
+    ):
         """
         x: (..., in_features)
         weight: (out_features, in_features)
+        bias: (out_features,) or None
         out: (..., out_features)
         Return both out and post-activation, but only out is differentiable.
         """
@@ -129,11 +139,12 @@ class LinearActFunc(LinearFunc):
         batch_shape = x.shape[:-1]
         x = x.reshape(-1, x.shape[-1])
         out, postact = cls.matmul_fwd_fn(
-            x, weight.T, activation=activation, store_preact=store_preact
+            x, weight.T, bias=bias, activation=activation, store_preact=store_preact
         )
         linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2])
         if out is not None:
             out = out.reshape(*batch_shape, out.shape[-1])
+        ctx.bias_dtype = bias.dtype if bias is not None else None
         ctx.mark_non_differentiable(postact)
         ctx.set_materialize_grads(False)  # We don't want to materialize grads for postact
         return out, postact.reshape(*batch_shape, postact.shape[-1])
@@ -147,9 +158,11 @@ class LinearActUntunedFunc(LinearActFunc):
     matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)


-def linear_act_func(x, weight, activation, store_preact=True, fuse_grad_accum=False, tuned=True):
+def linear_act_func(
+    x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False, tuned=True
+):
     fn_cls = LinearActFunc if tuned else LinearActUntunedFunc
-    return fn_cls.apply(x, weight, activation, store_preact, fuse_grad_accum)
+    return fn_cls.apply(x, weight, activation, bias, store_preact, fuse_grad_accum)


 class DActLinearFunc(LinearFunc):
@@ -229,12 +242,7 @@ class Linear(nn.Linear):
         self.fuse_grad_accum = fuse_grad_accum

     def forward(self, input: Tensor) -> Tensor:
-        if (
-            self.bias is None
-            and input.is_cuda
-            and self.in_features % 8 == 0
-            and self.out_features % 8 == 0
-        ):
-            return linear_func(input, self.weight, fuse_grad_accum=self.fuse_grad_accum)
+        if input.is_cuda and self.in_features % 8 == 0 and self.out_features % 8 == 0:
+            return linear_func(input, self.weight, self.bias, fuse_grad_accum=self.fuse_grad_accum)
         else:
             return F.linear(input, self.weight, self.bias)
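The linear.py changes thread an optional bias through the fused path: forward passes bias to the underlying GEMM, and backward reduces dout over the batch dimension (in the bias dtype) to produce dbias. A minimal usage sketch, assuming a CUDA device, feature dimensions divisible by 8, and the quack.linear import path implied by the file list above:

import torch
from quack.linear import linear_func  # module path assumed from this diff

# Hypothetical shapes; in/out features are multiples of 8 so the fused kernel path is taken.
x = torch.randn(4, 512, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
weight = torch.randn(2048, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
bias = torch.randn(2048, device="cuda", dtype=torch.bfloat16, requires_grad=True)

out = linear_func(x, weight, bias)  # bias is now passed into the forward GEMM
out.sum().backward()                # backward also returns dbias = dout.sum(0) in bias.dtype
print(out.shape, bias.grad.shape)   # torch.Size([4, 512, 2048]) torch.Size([2048])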
quack/pipeline.py CHANGED
@@ -4,9 +4,11 @@ from typing import Optional
 from dataclasses import dataclass

 import cutlass.cute as cute
-from cutlass.cutlass_dsl import Boolean, Int32, if_generate
-from cutlass.pipeline import CooperativeGroup, PipelineOp, pipeline_init_wait
+from cutlass import Boolean, Int32, const_expr
+from cutlass.cutlass_dsl import if_generate, and_
+from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
 from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
+from cutlass.pipeline import PipelineTmaUmma


 class PipelineStateWAdvance(PipelineState):
@@ -144,7 +146,160 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
         )

-    def producer_commit(self, state: PipelineState):
+    def producer_cpasync_commit(self, state: PipelineState):
+        """
+        We need the mbarrier to track the completion of cp.async
+        """
+        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+
+
+class MbarrierArrayWDropCount(MbarrierArray):
+    def __init__(
+        self,
+        barrier_storage: cute.Pointer,
+        num_stages: int,
+        agent: tuple[PipelineOp, CooperativeGroup],
+        tx_count: int = 0,
+        drop_count: Optional[Int32] = None,
+    ) -> None:
+        self.barrier_storage = barrier_storage
+        self.tx_count = tx_count
+        self.num_stages = num_stages
+        self.op_type, self.cg = agent
+        self.arrive_count = self.cg.size
+        self.drop_count = drop_count
+
+        if self.num_stages <= 0:
+            raise ValueError("Error: Mbarrier stage count must be greater than 0.")
+        if self.arrive_count <= 0:
+            raise ValueError("Error: Mbarrier arrive count must be greater than 0.")
+        if self.op_type is PipelineOp.TmaLoad and self.tx_count < 0:
+            raise ValueError("Error: Mbarrier tx count must not be less than 0 for TMA ops.")
+
+        if const_expr(drop_count is not None):
+            self.arrive_count = self.arrive_count - drop_count
+
+        # Store mbarrier base pointer
+        self.mbarrier_base = self.barrier_storage
+
+        # Mbarrier initialization in constructor
+        self.mbarrier_init()
+
+    def __extract_mlir_values__(self):
+        return [self.barrier_storage, self.drop_count]
+
+    def __new_from_mlir_values__(self, values):
+        return MbarrierArrayWDropCount(
+            values[0], self.num_stages, (self.op_type, self.cg), self.tx_count, values[1]
+        )
+
+
+@dataclass(frozen=True)
+class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
+    """
+    PipelineTmaCpAsync is used for CpAsync + TMA producers and UMMA consumers
+    (e.g. Blackwell mainloops)
+    """
+
+    @staticmethod
+    def create(
+        *,
+        num_stages: int,
+        producer_group: CooperativeGroup,
+        consumer_group: CooperativeGroup,
+        tx_count: int,
+        barrier_storage: cute.Pointer = None,
+        cta_layout_vmnk: Optional[cute.Layout] = None,
+        producer_drop_count: Optional[Int32] = None,
+    ):
+        """
+        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
+        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer
+        :param num_stages: Number of buffer stages for this pipeline
+        :type num_stages: Int32
+        :param producer_group: `CooperativeGroup` for the producer agent
+        :type producer_group: CooperativeGroup
+        :param consumer_group: `CooperativeGroup` for the consumer agent
+        :type consumer_group: CooperativeGroup
+        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
+        :type tx_count: int
+        :param cta_layout_vmnk: Layout of the cluster shape
+        :type cta_layout_vmnk: cute.Layout | None
+        """
+        if not isinstance(barrier_storage, cute.Pointer):
+            raise ValueError(
+                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
+            )
+
+        producer_type = PipelineOp.TmaLoad
+        consumer_type = PipelineOp.TCGen05Mma
+
+        producer = (producer_type, producer_group)
+        consumer = (consumer_type, consumer_group)
+
+        sync_object_full = MbarrierArrayWDropCount(
+            barrier_storage.align(min_align=8),
+            num_stages,
+            producer,
+            tx_count,
+            drop_count=producer_drop_count,
+        )
+        sync_object_empty = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
+        )
+
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+            # No mcast mask if not using clusters
+            producer_mask = None
+            # All threadblocks are leaders if not using clusters
+            is_leader_cta = True
+        else:
+            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk)
+            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)

+        cta_group = (
+            cute.nvgpu.tcgen05.CtaGroup.ONE
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
+            else cute.nvgpu.tcgen05.CtaGroup.TWO
+        )
+
+        consumer_mask = producer_mask
+
+        pipeline_init_wait(cta_layout_vmnk)
+
+        return PipelineTmaCpAsyncUmma(
+            sync_object_full,
+            sync_object_empty,
+            num_stages,
+            producer_mask,
+            consumer_mask,
+            is_leader_cta,
+            cta_group,
+        )
+
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        is_tma_warp: Optional[Boolean] = True,
+    ):
+        """
+        TMA producer commit conditionally waits on buffer empty and sets the
+        transaction barrier for leader threadblocks.
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase),
+        )
+        # This is the difference between this and PipelineTmaAsync: we could have multiple
+        # warps calling this, but only 1 warp should do the arrive on the full barrier
+        if_generate(
+            and_(self.is_leader_cta, is_tma_warp),
+            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+        )
+
+    def producer_cpasync_commit(self, state: PipelineState):
         """
         We need the mbarrier to track the completion of cp.async
         """
quack/reduce.py CHANGED
@@ -6,29 +6,11 @@ from typing import Callable, Optional

 import cutlass
 import cutlass.cute as cute
-from cutlass import Float32
+from cutlass import Int32, Int64, Float32, Boolean, const_expr

 import quack.utils as utils


-@cute.jit
-def warp_reduce(
-    val: cute.TensorSSA | cute.Numeric,
-    op: Callable,
-    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
-) -> cute.TensorSSA | cute.Numeric:
-    if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
-        res = cute.make_fragment(val.shape, val.dtype)
-        res.store(val)
-        for i in cutlass.range_constexpr(cute.size(val.shape)):
-            res[i] = warp_reduce(res[i], op, width)
-        return res.load()
-    else:
-        for i in cutlass.range_constexpr(int(math.log2(width))):
-            val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
-        return val
-
-

 @cute.jit
 def block_reduce(
@@ -43,7 +25,7 @@ def block_reduce(
     block_reduce_val = init_val
     if lane_idx < warps_per_row:
         block_reduce_val = reduction_buffer[row_idx, lane_idx]
-    return warp_reduce(block_reduce_val, op)
+    return cute.arch.warp_reduction(block_reduce_val, op)


 @cute.jit
@@ -53,7 +35,7 @@ def cluster_reduce(
     reduction_buffer: cute.Tensor,
     mbar_ptr: cute.Pointer,
     init_val: cute.Numeric = 0.0,
-    phase: Optional[cutlass.Int32] = None,
+    phase: Optional[Int32] = None,
 ) -> cute.Numeric:
     """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
     cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
@@ -81,7 +63,7 @@
         idx = lane_idx + i * cute.arch.WARP_SIZE
         if idx < cute.size(reduction_buffer, mode=[1]):
             block_reduce_val = op(block_reduce_val, reduction_buffer[row_idx, idx])
-    return warp_reduce(block_reduce_val, op)
+    return cute.arch.warp_reduction(block_reduce_val, op)


 @cute.jit
@@ -90,11 +72,11 @@ def block_or_cluster_reduce(
     op: Callable,
     reduction_buffer: cute.Tensor,
     mbar_ptr: Optional[cute.Pointer],
-    phase: Optional[cutlass.Int32] = None,
+    phase: Optional[Int32] = None,
     init_val: cute.Numeric = 0.0,
 ) -> cute.Numeric:
     """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
-    if cutlass.const_expr(mbar_ptr is None):
+    if const_expr(mbar_ptr is None):
         return block_reduce(val, op, reduction_buffer, init_val=init_val)
     else:
         return cluster_reduce(val, op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val)
@@ -107,34 +89,34 @@ def row_reduce(
     threads_per_row: cutlass.Constexpr[int],
     reduction_buffer: Optional[cute.Tensor] = None,
     mbar_ptr: Optional[cute.Pointer] = None,
-    phase: Optional[cutlass.Int32] = None,
+    phase: Optional[Int32] = None,
     init_val: cute.Numeric = 0.0,
     hook_fn: Optional[Callable] = None,
 ) -> cute.Numeric:
     """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
-    if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
+    if const_expr(isinstance(x, cute.TensorSSA)):
         val = x.reduce(op, init_val=init_val, reduction_profile=0)
     else:
         val = x
     warp_op = {
         cute.ReductionOp.ADD: operator.add,
-        cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
+        cute.ReductionOp.MAX: cute.arch.fmax if const_expr(x.dtype == Float32) else max,
         cute.ReductionOp.MIN: min,
         cute.ReductionOp.MUL: operator.mul,
     }[op]
-    val = warp_reduce(
+    val = cute.arch.warp_reduction(
         val,
         warp_op,
-        width=min(threads_per_row, cute.arch.WARP_SIZE),
+        threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
     )
-    if cutlass.const_expr(hook_fn is not None):
+    if const_expr(hook_fn is not None):
         hook_fn()
-    if cutlass.const_expr(reduction_buffer is not None):
+    if const_expr(reduction_buffer is not None):
         warps_per_row, cluster_n = reduction_buffer.shape[1]
         assert cluster_n == 1 or mbar_ptr is not None, (
             "mbar_ptr must be provided for cluster reduction"
         )
-        if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
+        if const_expr(warps_per_row > 1 or cluster_n > 1):
             val = block_or_cluster_reduce(
                 val, warp_op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val
             )
@@ -148,37 +130,37 @@ def online_softmax_reduce(
     reduction_buffer: Optional[cute.Tensor] = None,
     mbar_ptr: Optional[cute.Pointer] = None,
     hook_fn: Optional[Callable] = None,
-    phase: Optional[cutlass.Int32] = None,
+    phase: Optional[Int32] = None,
     return_exp_x: bool = False,
 ) -> [Float32, Float32, Optional[cute.TensorSSA]]:
     assert x.dtype == Float32, "x must be of type Float32"
     """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n), 2)"""
-    max_x = warp_reduce(
+    max_x = cute.arch.warp_reduction(
         x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
         cute.arch.fmax,
-        width=min(threads_per_row, cute.arch.WARP_SIZE),
+        threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
     )
     log2_e = math.log2(math.e)
     exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
-    sum_exp_x = warp_reduce(
+    sum_exp_x = cute.arch.warp_reduction(
         exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
         operator.add,
-        width=min(threads_per_row, cute.arch.WARP_SIZE),
+        threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
     )
-    if cutlass.const_expr(hook_fn is not None):
+    if const_expr(hook_fn is not None):
         hook_fn()
-    if cutlass.const_expr(reduction_buffer is not None):
+    if const_expr(reduction_buffer is not None):
         rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
         assert cluster_n == 1 or mbar_ptr is not None, (
            "mbar_ptr must be provided for cluster reduction"
        )
-        if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
-            assert reduction_buffer.element_type == cutlass.Int64, (
+        if const_expr(warps_per_row > 1 or cluster_n > 1):
+            assert reduction_buffer.element_type == Int64, (
                 "reduction_buffer must be of type cute.Int64"
             )
             lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
             row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
-            if cutlass.const_expr(mbar_ptr is None):
+            if const_expr(mbar_ptr is None):
                 if lane_idx == 0:
                     reduction_buffer[row_idx, col_idx] = utils.f32x2_to_i64(max_x, sum_exp_x)
                 cute.arch.barrier()
@@ -188,10 +170,10 @@
                     max_x_single_warp, sum_exp_x = utils.i64_to_f32x2(
                         reduction_buffer[row_idx, lane_idx]
                     )
-                max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
+                max_x_final = cute.arch.warp_reduction(max_x_single_warp, cute.arch.fmax)
                 sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
-                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
-                if cutlass.const_expr(return_exp_x):
+                sum_exp_x = cute.arch.warp_reduction(sum_exp_x, operator.add)
+                if const_expr(return_exp_x):
                     exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
                 max_x = max_x_final
             else:
@@ -227,14 +209,71 @@ def online_softmax_reduce(
                 max_x_final = max_x_single_warp.load().reduce(
                     cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0
                 )
-                max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
+                max_x_final = cute.arch.warp_reduction(max_x_final, cute.arch.fmax)
                 sum_exp_x = 0.0
                 for i in cutlass.range_constexpr(num_iter):
                     sum_exp_x += sum_exp_x_single_warp[i] * cute.math.exp(
                         max_x_single_warp[i] - max_x_final, fastmath=True
                     )
-                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
-                if cutlass.const_expr(return_exp_x):
+                sum_exp_x = cute.arch.warp_reduction(sum_exp_x, operator.add)
+                if const_expr(return_exp_x):
                     exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
                 max_x = max_x_final
-    return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
+    return max_x, sum_exp_x, (exp_x if const_expr(return_exp_x) else None)
+
+
+@cute.jit
+def sum_swap_shuffle(
+    X: cute.Tensor, elem_per_lane: int = 1, subwarp_size: int = 1, warp_size: int = 32
+) -> cute.Tensor:
+    """
+    For warp reduction, we use Swap Shuffle
+    The normal way to reduction among threads:
+    use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads.
+    After each step of reduction, a half of threads won't work in the following steps.
+    That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case).
+    To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors,
+    we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads.
+    After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step.
+    We can recursively do this until the problem size is 1.
+    """
+    assert (
+        subwarp_size >= 1
+        and subwarp_size <= 32
+        and subwarp_size == 1 << int(math.log2(subwarp_size))
+    )
+    assert (
+        warp_size <= 32
+        and warp_size % subwarp_size == 0
+        and warp_size == 1 << int(math.log2(warp_size))
+    )
+    lane_idx = cute.arch.lane_idx() // subwarp_size
+    X = cute.logical_divide(X, cute.make_layout(elem_per_lane))  # (elem_per_lane, M)
+    numvec = cute.size(X, mode=[1])
+    assert numvec <= 32 // subwarp_size
+    # If X has more values than warp_size // subwarp_size, we first do a normal warp reduction
+    # to sum up values held by lanes further than size(X) away
+    for i in cutlass.range(
+        int(math.log2(numvec)), int(math.log2(warp_size // subwarp_size)), unroll_full=True
+    ):
+        for v in cutlass.range(cute.size(X), unroll_full=True):
+            shfl_val = cute.arch.shuffle_sync_bfly(X[v], offset=(1 << i) * subwarp_size)
+            X[v] = X[v] + shfl_val
+    for logm in cutlass.range_constexpr(int(math.log2(cute.size(X, mode=[1]))) - 1, -1, -1):
+        m = 1 << logm
+        for r in cutlass.range(m, unroll_full=True):
+            frg_A = X[None, r]
+            frg_B = X[None, r + m]
+            # First half of threads swap fragments from the first half of data to the second
+            should_swap = not Boolean(lane_idx & m)
+            for v in cutlass.range(cute.size(frg_A), unroll_full=True):
+                # Step 1: swap
+                lower, upper = frg_A[v], frg_B[v]
+                frg_A[v] = upper if should_swap else lower
+                frg_B[v] = lower if should_swap else upper
+                # Step 2: shuffle
                # each half of threads get a half of data from the other half of threads
+                shfl_val = cute.arch.shuffle_sync_bfly(frg_A[v], offset=m * subwarp_size)
+                # Step 3: reduction
+                frg_A[v] = frg_B[v] + shfl_val
+    return X[None, 0]
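The sum_swap_shuffle docstring describes the swap-shuffle trick: rather than idling half of the lanes at each butterfly step, every lane keeps reducing a shrinking slice of the data, so the shuffle and add instructions stay fully utilized. The following plain-Python sketch (an illustration of the idea for the elem_per_lane=1, subwarp_size=1 case, not the CuTe DSL code above) simulates the lane-level data movement and checks that lane l ends up holding the sum of column l of the distributed NxN matrix:

import math
import random

N = 8  # number of lanes == per-lane vector length
A = [[random.randint(0, 9) for _ in range(N)] for _ in range(N)]
X = [row[:] for row in A]  # X[lane] is that lane's private fragment (one matrix row)

for logm in range(int(math.log2(N)) - 1, -1, -1):
    m = 1 << logm
    for r in range(m):
        # Step 1: lanes whose bit `m` is 0 swap their r and r+m slots
        for lane in range(N):
            if not (lane & m):
                X[lane][r], X[lane][r + m] = X[lane][r + m], X[lane][r]
        # Step 2: butterfly shuffle of slot r with lane ^ m (all lanes in lockstep)
        shfl = [X[lane ^ m][r] for lane in range(N)]
        # Step 3: reduce into slot r
        for lane in range(N):
            X[lane][r] = X[lane][r + m] + shfl[lane]

column_sums = [sum(A[row][col] for row in range(N)) for col in range(N)]
assert [X[lane][0] for lane in range(N)] == column_sums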
quack/reduction_base.py CHANGED
@@ -4,55 +4,44 @@ from typing import Type, Tuple, Optional

 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Int64, Float32, const_expr
+
+import quack.copy_utils as copy_utils


 class ReductionBase:
-    def __init__(
-        self, dtype: Type[cutlass.Numeric], N: int, stage: int, reduction_dtype=cutlass.Float32
-    ):
+    def __init__(self, dtype: Type[cutlass.Numeric], N: int, stage: int, reduction_dtype=Float32):
         self.dtype = dtype
         self.N = N
         self.stage = stage
         self.reduction_dtype = reduction_dtype

-    def _calculate_threads_per_row(self):
+    def _threads_per_row(self):
         raise NotImplementedError()

+    def _num_threads(self):
+        return 128 if self.N <= 16384 else 256
+
     def _set_cluster_n(self):
         self.cluster_n = 1

-    def _get_num_threads(self):
-        return 128 if self.N <= 16384 else 256
-
-    def _get_tv_layout(self, num_copy_bits=128):
-        vecsize = num_copy_bits // self.dtype.width
+    def _get_tiled_copy(self, vecsize: int = 1):
         assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
-        num_threads = self._get_num_threads()
+        threads_per_row = self._threads_per_row()
+        num_threads = self._num_threads()
         assert num_threads % cute.arch.WARP_SIZE == 0
-
-        threads_per_row = self._calculate_threads_per_row()
         num_blocks_N = cute.ceil_div(self.N // vecsize, threads_per_row * self.cluster_n)
-        cols_per_block = num_threads // threads_per_row
-        tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
-        tv_layout = cute.make_layout(
-            ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
-            stride=(
-                (vecsize * cols_per_block, 1),
-                (cols_per_block, cols_per_block * vecsize * threads_per_row),
-            ),
-        )
-        return tiler_mn, tv_layout
-
-    def _smem_size_in_bytes(self, tiler_mn, num_warps):
-        return (
-            cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn))
-            + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
-            + self.stage * (cutlass.Int64.width // 8)
-        )
+        tiler_mn = (num_threads // threads_per_row, vecsize * num_blocks_N * threads_per_row)
+        tiled_copy = copy_utils.tiled_copy_2d(self.dtype, threads_per_row, num_threads, vecsize)
+        return tiled_copy, tiler_mn, threads_per_row

     def _get_reduction_buffer_layout(self, tv_layout: cute.Layout, cluster_n: int):
         num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
-        warps_per_row = max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
+        warps_per_row = (
+            num_warps
+            if cute.rank(tv_layout.shape[0]) == 1
+            else max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
+        )
         return cute.make_ordered_layout(
             (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
             order=(1, 0, 2),
@@ -64,11 +53,11 @@
         reduction_buffer = smem.allocate_tensor(
             self.reduction_dtype,
             self._get_reduction_buffer_layout(tv_layout, self.cluster_n),
-            byte_alignment=4,
+            byte_alignment=8,
         )
-        if cutlass.const_expr(self.cluster_n > 1):
+        if const_expr(self.cluster_n > 1):
             mbar_ptr = smem.allocate_array(
-                cutlass.Int64, num_elems=self.stage if not is_persistent else self.stage * 2
+                Int64, num_elems=self.stage if not is_persistent else self.stage * 2
             )
         else:
             mbar_ptr = None
@@ -77,15 +66,15 @@
     @cute.jit
     def _initialize_cluster(
         self,
-        tidx: cutlass.Int32,
+        tidx: Int32,
         mbar_ptr: cute.Pointer,
        num_warps: int,
         is_persistent: bool = False,
     ):
-        if cutlass.const_expr(self.cluster_n > 1):
+        if const_expr(self.cluster_n > 1):
            if tidx < self.stage:  # Initialize full barrier
                cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
-            if cutlass.const_expr(is_persistent):  # Initialize empty barrier
+            if const_expr(is_persistent):  # Initialize empty barrier
                cute.arch.mbarrier_init(
                    mbar_ptr + self.stage + tidx, num_warps * self.cluster_n
                )
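For reference, the tile shape that _get_tiled_copy derives follows directly from the arithmetic shown above. A worked example in plain Python, using hypothetical values (fp16 input, 128-bit vectorized copies as in the old _get_tv_layout default, threads_per_row=32 standing in for a subclass's _threads_per_row, no cluster):

import math

N = 4096                       # row length (elements)
dtype_width = 16               # bits per element (fp16/bf16)
vecsize = 128 // dtype_width   # 8 elements per 128-bit copy
num_threads = 128 if N <= 16384 else 256   # mirrors _num_threads()
threads_per_row = 32           # hypothetical subclass choice (_threads_per_row)
cluster_n = 1                  # _set_cluster_n() default

num_blocks_N = math.ceil((N // vecsize) / (threads_per_row * cluster_n))  # 16
tiler_mn = (num_threads // threads_per_row, vecsize * num_blocks_N * threads_per_row)
print(tiler_mn)  # (4, 4096): 4 rows per CTA, each row-thread handling 16 vectors of 8 elements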