PyPI - quack-kernels - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl - Mend

quack-kernels 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

quack/__init__.py +11 -1
quack/activation.py +72 -64
quack/broadcast_utils.py +1 -1
quack/copy_utils.py +143 -20
quack/cute_dsl_ptxas.py +151 -0
quack/fast_math.py +29 -76
quack/gemm_act.py +296 -8
quack/gemm_dact.py +520 -4
quack/gemm_default_epi.py +4 -4
quack/gemm_interface.py +363 -0
quack/gemm_sm100.py +62 -88
quack/gemm_sm90.py +68 -114
quack/gemm_symmetric.py +2 -6
quack/layout_utils.py +10 -4
quack/linear.py +37 -0
quack/pipeline.py +87 -99
quack/reduce.py +2 -2
quack/rmsnorm.py +1 -3
quack/sm90_utils.py +34 -2
quack/sort/bitonic_sort.py +4 -4
quack/tile_scheduler.py +310 -256
quack/topk.py +4 -4
quack/utils.py +76 -40
{quack_kernels-0.2.4.dist-info → quack_kernels-0.2.6.dist-info}/METADATA +2 -2
quack_kernels-0.2.6.dist-info/RECORD +45 -0
{quack_kernels-0.2.4.dist-info → quack_kernels-0.2.6.dist-info}/WHEEL +1 -1
quack_kernels-0.2.4.dist-info/RECORD +0 -44
{quack_kernels-0.2.4.dist-info → quack_kernels-0.2.6.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.2.4.dist-info → quack_kernels-0.2.6.dist-info}/top_level.txt +0 -0

quack/pipeline.py CHANGED Viewed

@@ -5,14 +5,16 @@ from dataclasses import dataclass
 import cutlass.cute as cute
 from cutlass import Boolean, Int32, const_expr
-from cutlass.cutlass_dsl import if_generate, and_
-from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
-from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
+from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
+from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp
+from cutlass.pipeline import PipelineTmaAsync, PipelineState, PipelineUserType
 from cutlass.pipeline import PipelineTmaUmma
+from cutlass.pipeline import Agent, agent_sync
 class PipelineStateWAdvance(PipelineState):
-    def advance_iters(self, num_iterations: Int32):
+    @dsl_user_op
+    def advance_iters(self, num_iterations: Int32, *, loc=None, ip=None):
         self._count += Int32(num_iterations)
         new_index = self._index + Int32(num_iterations)
         # How many times did we cross the stages boundary
@@ -56,104 +58,53 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
     """
     @staticmethod
-    def create(
-        *,
-        num_stages: int,
-        producer_group: CooperativeGroup,
-        consumer_group: CooperativeGroup,
-        tx_count: int,
-        barrier_storage: cute.Pointer = None,
-        cta_layout_vmnk: Optional[cute.Layout] = None,
-        tidx: Optional[Int32] = None,
-    ):
-        """
-        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
-        :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages: Int32
-        :param producer_group: CooperativeGroup for the producer agent
-        :type producer_group: CooperativeGroup
-        :param consumer_group: CooperativeGroup for the consumer agent
-        :type consumer_group: CooperativeGroup
-        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
-        :type tx_count: int
-        :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout | None
-        :param tidx: thread index to consumer async threads
-        :type tidx: Int32 | None
-        """
-        if not isinstance(barrier_storage, cute.Pointer):
-            raise ValueError(
-                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
-            )
-        producer_type = PipelineOp.TmaLoad
-        consumer_type = PipelineOp.AsyncThread
-        producer = (producer_type, producer_group)
-        consumer = (consumer_type, consumer_group)
-        sync_object_full = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8), num_stages, producer, tx_count
-        )
-        sync_object_empty = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
-        )
-        if tidx is None:
-            tidx, _, _ = cute.arch.thread_idx()
-        if cta_layout_vmnk is None:
-            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
-        (
-            dst_rank,
-            is_signalling_thread,
-        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
-            dst_rank = None
-        else:
-            dst_rank = dst_rank
-        producer_mask = None
-        pipeline_init_wait(cta_layout_vmnk)
-        return PipelineTmaCpAsync(
-            sync_object_full,
-            sync_object_empty,
-            num_stages,
-            producer_mask,
-            dst_rank,
-            is_signalling_thread,
-        )
+    def create(*args, **kwargs):
+        obj = PipelineTmaAsync.create(*args, **kwargs)
+        # Can't assign to __class__ directly since the dataclass is frozen
+        # obj.__class__ = PipelineTmaCpAsync
+        object.__setattr__(obj, "__class__", PipelineTmaCpAsync)
+        return obj
+    @dsl_user_op
     def producer_acquire(
         self,
         state: PipelineState,
         try_acquire_token: Optional[Boolean] = None,
         is_tma_warp: Optional[Boolean] = True,
+        *,
+        loc=None,
+        ip=None,
     ):
         """
         TMA producer commit conditionally waits on buffer empty and sets the transaction barrier.
         """
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(state.index, state.phase),
+            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             is_tma_warp,
-            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+            lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
-    def producer_cpasync_commit(self, state: PipelineState):
+    @dsl_user_op
+    def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )
 class MbarrierArrayWDropCount(MbarrierArray):
+    @dsl_user_op
     def __init__(
         self,
         barrier_storage: cute.Pointer,
@@ -161,6 +112,9 @@ class MbarrierArrayWDropCount(MbarrierArray):
         agent: tuple[PipelineOp, CooperativeGroup],
         tx_count: int = 0,
         drop_count: Optional[Int32] = None,
+        *,
+        loc=None,
+        ip=None,
     ) -> None:
         self.barrier_storage = barrier_storage
         self.tx_count = tx_count
@@ -183,7 +137,7 @@ class MbarrierArrayWDropCount(MbarrierArray):
         self.mbarrier_base = self.barrier_storage
         # Mbarrier initialization in constructor
-        self.mbarrier_init()
+        self.mbarrier_init(loc=loc, ip=ip)
     def __extract_mlir_values__(self):
         return [self.barrier_storage, self.drop_count]
@@ -201,6 +155,7 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
     (e.g. Blackwell mainloops)
     """
+    @dsl_user_op
     @staticmethod
     def create(
         *,
@@ -210,25 +165,34 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         tx_count: int,
         barrier_storage: cute.Pointer = None,
         cta_layout_vmnk: Optional[cute.Layout] = None,
+        mcast_mode_mn: tuple[int, int] = (1, 1),
+        defer_sync: bool = False,
         producer_drop_count: Optional[Int32] = None,
+        loc=None,
+        ip=None,
     ):
-        """
-        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
+        """Creates and initializes a new PipelineTmaUmma instance.
         :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages: Int32
-        :param producer_group: `CooperativeGroup` for the producer agent
+        :type num_stages: int
+        :param producer_group: CooperativeGroup for the producer agent
         :type producer_group: CooperativeGroup
-        :param consumer_group: `CooperativeGroup` for the consumer agent
+        :param consumer_group: CooperativeGroup for the consumer agent
         :type consumer_group: CooperativeGroup
         :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
         :type tx_count: int
+        :param barrier_storage: Pointer to the shared memory address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer, optional
         :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout | None
+        :type cta_layout_vmnk: cute.Layout, optional
+        :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
+        :type mcast_mode_mn: tuple[int, int], optional
+        :raises ValueError: If barrier_storage is not a cute.Pointer instance
+        :return: A new PipelineTmaUmma instance configured with the provided parameters
+        :rtype: PipelineTmaUmma
         """
         if not isinstance(barrier_storage, cute.Pointer):
-            raise ValueError(
+            raise TypeError(
                 f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
             )
@@ -244,29 +208,42 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
             producer,
             tx_count,
             drop_count=producer_drop_count,
+            loc=loc,
+            ip=ip,
         )
-        sync_object_empty = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
+        sync_object_empty = PipelineTmaUmma._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages,
+            num_stages,
+            consumer,
+            loc=loc,
+            ip=ip,
         )
-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
             # No mcast mask if not using clusters
             producer_mask = None
             # All threadblocks are leaders if not using clusters
             is_leader_cta = True
         else:
-            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk)
-            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)
+            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(
+                cta_layout_vmnk, mcast_mode_mn, loc=loc, ip=ip
+            )
+            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk, loc=loc, ip=ip)
         cta_group = (
             cute.nvgpu.tcgen05.CtaGroup.ONE
-            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0], loc=loc, ip=ip) == 1
             else cute.nvgpu.tcgen05.CtaGroup.TWO
         )
         consumer_mask = producer_mask
-        pipeline_init_wait(cta_layout_vmnk)
+        if not defer_sync:
+            cute.arch.mbarrier_init_fence()
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
+                agent_sync(Agent.ThreadBlock)
+            else:
+                agent_sync(Agent.ThreadBlockCluster, is_relaxed=True)
         return PipelineTmaCpAsyncUmma(
             sync_object_full,
@@ -278,11 +255,15 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
             cta_group,
         )
+    @dsl_user_op
     def producer_acquire(
         self,
         state: PipelineState,
         try_acquire_token: Optional[Boolean] = None,
         is_tma_warp: Optional[Boolean] = True,
+        *,
+        loc=None,
+        ip=None,
     ):
         """
         TMA producer commit conditionally waits on buffer empty and sets the
@@ -290,17 +271,24 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         """
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(state.index, state.phase),
+            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             and_(self.is_leader_cta, is_tma_warp),
-            lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+            lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
-    def producer_cpasync_commit(self, state: PipelineState):
+    @dsl_user_op
+    def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )

quack/reduce.py CHANGED Viewed

@@ -196,9 +196,9 @@ def online_softmax_reduce(
                     )
                 cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
                 num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
-                max_x_single_warp = cute.make_fragment(num_iter, Float32)
+                max_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
                 max_x_single_warp.fill(-Float32.inf)
-                sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
+                sum_exp_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
                 sum_exp_x_single_warp.fill(0.0)
                 for i in cutlass.range_constexpr(num_iter):
                     idx = lane_idx + i * cute.arch.WARP_SIZE

quack/rmsnorm.py CHANGED Viewed

@@ -686,9 +686,7 @@ class RMSNormBackward(ReductionBase):
             if const_expr(self.cluster_n > 1):
                 # Need this fence since the STAS from the producer is using the async proxy.
-                cute.arch.fence_proxy(
-                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-                )
+                cute.arch.fence_view_async_shared()
                 # It's faster to have 1 lane per warp to signal the mbar, rather than all lanes
                 # Requires adjusting the thread_count when initializing the mbar
                 cute.arch.sync_warp()

quack/sm90_utils.py CHANGED Viewed

@@ -27,10 +27,11 @@ def make_smem_layout(
         sm90_utils_og.get_smem_layout_atom(layout, dtype, major_mode_size),
         dtype,
     )
+    order = (1, 0, 2) if const_expr(layout.is_m_major_c()) else (0, 1, 2)
     smem_layout_staged = cute.tile_to_shape(
         smem_layout_atom,
         cute.append(shape, stage) if const_expr(stage is not None) else shape,
-        order=(1, 0, 2) if layout.is_m_major_c() else (0, 1, 2),
+        order=order if const_expr(stage is not None) else order[:2],
     )
     return smem_layout_staged
@@ -101,7 +102,7 @@ def gemm_zero_init(
             tiled_mma, shape[::-1], tCrB, tCrA, B_idx, A_idx, wg_wait, swap_AB=False
         )
     else:
-        acc = cute.make_fragment(tiled_mma.partition_shape_C(shape), Float32)
+        acc = cute.make_rmem_tensor(tiled_mma.partition_shape_C(shape), Float32)
         rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
         rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
         gemm(tiled_mma, acc, rA, rB, zero_init=True, wg_wait=wg_wait)
@@ -125,3 +126,34 @@ def gemm_w_idx(
         rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
         rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
         gemm(tiled_mma, acc, rA, rB, zero_init=zero_init, wg_wait=wg_wait)
+def partition_fragment_ABC(
+    thr_mma: cute.ThrMma,
+    shape_mnk: cute.Shape,
+    sA: Optional[cute.Tensor],
+    sB: Optional[cute.Tensor],
+    swap_AB: bool = False,
+):
+    is_rs = thr_mma.op.a_src == warpgroup.OperandSource.RMEM
+    if const_expr(not swap_AB):
+        acc = cute.make_rmem_tensor(thr_mma.partition_shape_C(shape_mnk[:2]), Float32)
+        if const_expr(not is_rs):
+            assert sA is not None
+            tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA))
+        else:
+            tCrA = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[0], shape_mnk[2])))
+        assert sB is not None
+        tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB))
+    else:
+        acc = cute.make_rmem_tensor(
+            thr_mma.partition_shape_C((shape_mnk[1], shape_mnk[0])), Float32
+        )
+        if const_expr(not is_rs):
+            assert sB is not None
+            tCrB = thr_mma.make_fragment_A(thr_mma.partition_A(sB))
+        else:  # B in rmem
+            tCrB = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[1], shape_mnk[2])))
+        assert sA is not None
+        tCrA = thr_mma.make_fragment_B(thr_mma.partition_B(sA))
+    return acc, tCrA, tCrB

quack/sort/bitonic_sort.py CHANGED Viewed

@@ -83,7 +83,7 @@ def bitonic_topk_merge(
     else:
         minmax_fn = min if ascending else max
     # Write the top k elements to the first half of the array
-    for i in cutlass.range(k, unfoll_full=True):
+    for i in cutlass.range(k, unroll_full=True):
         arr0[start0 + i] = minmax_fn(arr0[start0 + i], arr1[start1 + k - 1 - i])
     # Now the 1st half is bitonic, we just need to merge it
     bitonic_merge(arr0, k, start0, ascending)
@@ -108,12 +108,12 @@ def bitonic_topk(
     n = cute.size(arr.shape)
     assert k == 1 << int(math.log2(k)), "k must be a power of 2"
     assert n % k == 0, "n must be divisible by k"
-    topk_vals = cute.make_fragment(k, arr.element_type)
+    topk_vals = cute.make_rmem_tensor(k, arr.element_type)
     for v in cutlass.range(k, unroll_full=True):
         topk_vals[v] = arr[v]
     bitonic_sort(topk_vals, ascending=ascending)
     for i in cutlass.range(1, n // k, unroll_full=True):
-        other_vals = cute.make_fragment(k, arr.element_type)
+        other_vals = cute.make_rmem_tensor(k, arr.element_type)
         for v in cutlass.range(k, unroll_full=True):
             other_vals[v] = arr[i * k + v]
         bitonic_sort(other_vals, ascending=ascending)
@@ -122,7 +122,7 @@ def bitonic_topk(
     # TODO: this is not efficient for large k (e.g. >= 16) since threads in the same warps
     # do duplicate work.
     for i in cutlass.range(int(math.log2(warp_width)), unroll_full=True):
-        other_vals = cute.make_fragment(k, arr.element_type)
+        other_vals = cute.make_rmem_tensor(k, arr.element_type)
         for v in cutlass.range(k, unroll_full=True):
             other_vals[v] = cute.arch.shuffle_sync_bfly(topk_vals[v], offset=1 << i)
         bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)

quack-kernels 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

quack-kernels 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl