quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- quack/__init__.py +1 -1
- quack/activation.py +72 -64
- quack/broadcast_utils.py +1 -1
- quack/copy_utils.py +14 -18
- quack/fast_math.py +29 -76
- quack/gemm_act.py +296 -8
- quack/gemm_dact.py +520 -4
- quack/gemm_default_epi.py +4 -4
- quack/gemm_interface.py +363 -0
- quack/gemm_sm100.py +62 -88
- quack/gemm_sm90.py +68 -114
- quack/gemm_symmetric.py +2 -6
- quack/layout_utils.py +2 -4
- quack/linear.py +37 -0
- quack/pipeline.py +59 -89
- quack/reduce.py +2 -2
- quack/rmsnorm.py +1 -3
- quack/sm90_utils.py +5 -3
- quack/sort/bitonic_sort.py +3 -3
- quack/tile_scheduler.py +310 -256
- quack/topk.py +4 -4
- quack/utils.py +76 -40
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/METADATA +2 -2
- quack_kernels-0.2.6.dist-info/RECORD +45 -0
- quack_kernels-0.2.5.dist-info/RECORD +0 -45
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/WHEEL +0 -0
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/top_level.txt +0 -0
quack/pipeline.py
CHANGED

@@ -6,9 +6,10 @@ from dataclasses import dataclass
 import cutlass.cute as cute
 from cutlass import Boolean, Int32, const_expr
 from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
-from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp
-from cutlass.pipeline import
+from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp
+from cutlass.pipeline import PipelineTmaAsync, PipelineState, PipelineUserType
 from cutlass.pipeline import PipelineTmaUmma
+from cutlass.pipeline import Agent, agent_sync


 class PipelineStateWAdvance(PipelineState):
@@ -57,75 +58,12 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
     """

     @staticmethod
-    def create(
-
-
-
-
-        barrier_storage: cute.Pointer = None,
-        cta_layout_vmnk: Optional[cute.Layout] = None,
-        tidx: Optional[Int32] = None,
-    ):
-        """
-        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
-        :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages: Int32
-        :param producer_group: CooperativeGroup for the producer agent
-        :type producer_group: CooperativeGroup
-        :param consumer_group: CooperativeGroup for the consumer agent
-        :type consumer_group: CooperativeGroup
-        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
-        :type tx_count: int
-        :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout | None
-        :param tidx: thread index to consumer async threads
-        :type tidx: Int32 | None
-        """
-        if not isinstance(barrier_storage, cute.Pointer):
-            raise ValueError(
-                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
-            )
-
-        producer_type = PipelineOp.TmaLoad
-        consumer_type = PipelineOp.AsyncThread
-
-        producer = (producer_type, producer_group)
-        consumer = (consumer_type, consumer_group)
-
-        sync_object_full = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8), num_stages, producer, tx_count
-        )
-        sync_object_empty = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
-        )
-        if tidx is None:
-            tidx, _, _ = cute.arch.thread_idx()
-        if cta_layout_vmnk is None:
-            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
-        (
-            dst_rank,
-            is_signalling_thread,
-        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
-            dst_rank = None
-        else:
-            dst_rank = dst_rank
-
-        producer_mask = None
-
-        pipeline_init_wait(cta_layout_vmnk)
-
-        return PipelineTmaCpAsync(
-            sync_object_full,
-            sync_object_empty,
-            num_stages,
-            producer_mask,
-            dst_rank,
-            is_signalling_thread,
-        )
+    def create(*args, **kwargs):
+        obj = PipelineTmaAsync.create(*args, **kwargs)
+        # Can't assign to __class__ directly since the dataclass is frozen
+        # obj.__class__ = PipelineTmaCpAsync
+        object.__setattr__(obj, "__class__", PipelineTmaCpAsync)
+        return obj

     @dsl_user_op
     def producer_acquire(
@@ -143,12 +81,16 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             is_tma_warp,
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )

     @dsl_user_op
@@ -156,7 +98,9 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cute.arch.cp_async_mbarrier_arrive_noinc(
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )


 class MbarrierArrayWDropCount(MbarrierArray):
@@ -211,6 +155,7 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
     (e.g. Blackwell mainloops)
     """

+    @dsl_user_op
     @staticmethod
     def create(
         *,
@@ -220,28 +165,34 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         tx_count: int,
         barrier_storage: cute.Pointer = None,
         cta_layout_vmnk: Optional[cute.Layout] = None,
-        producer_drop_count: Optional[Int32] = None,
         mcast_mode_mn: tuple[int, int] = (1, 1),
+        defer_sync: bool = False,
+        producer_drop_count: Optional[Int32] = None,
+        loc=None,
+        ip=None,
     ):
-        """
-
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
+        """Creates and initializes a new PipelineTmaUmma instance.
+
         :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages:
-        :param producer_group:
+        :type num_stages: int
+        :param producer_group: CooperativeGroup for the producer agent
         :type producer_group: CooperativeGroup
-        :param consumer_group:
+        :param consumer_group: CooperativeGroup for the consumer agent
         :type consumer_group: CooperativeGroup
         :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
         :type tx_count: int
+        :param barrier_storage: Pointer to the shared memory address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer, optional
         :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout
+        :type cta_layout_vmnk: cute.Layout, optional
         :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
         :type mcast_mode_mn: tuple[int, int], optional
+        :raises ValueError: If barrier_storage is not a cute.Pointer instance
+        :return: A new PipelineTmaUmma instance configured with the provided parameters
+        :rtype: PipelineTmaUmma
         """
         if not isinstance(barrier_storage, cute.Pointer):
-            raise
+            raise TypeError(
                 f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
             )

@@ -257,29 +208,42 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
             producer,
             tx_count,
             drop_count=producer_drop_count,
+            loc=loc,
+            ip=ip,
         )
         sync_object_empty = PipelineTmaUmma._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages,
+            barrier_storage.align(min_align=8) + num_stages,
+            num_stages,
+            consumer,
+            loc=loc,
+            ip=ip,
         )

-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
             # No mcast mask if not using clusters
             producer_mask = None
             # All threadblocks are leaders if not using clusters
             is_leader_cta = True
         else:
-            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(
-
+            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(
+                cta_layout_vmnk, mcast_mode_mn, loc=loc, ip=ip
+            )
+            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk, loc=loc, ip=ip)

         cta_group = (
             cute.nvgpu.tcgen05.CtaGroup.ONE
-            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0], loc=loc, ip=ip) == 1
             else cute.nvgpu.tcgen05.CtaGroup.TWO
         )

         consumer_mask = producer_mask

-
+        if not defer_sync:
+            cute.arch.mbarrier_init_fence()
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
+                agent_sync(Agent.ThreadBlock)
+            else:
+                agent_sync(Agent.ThreadBlockCluster, is_relaxed=True)

         return PipelineTmaCpAsyncUmma(
             sync_object_full,
@@ -308,12 +272,16 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             and_(self.is_leader_cta, is_tma_warp),
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )

     @dsl_user_op
@@ -321,4 +289,6 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cute.arch.cp_async_mbarrier_arrive_noinc(
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )
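Note on the rewritten create above: the cutlass pipeline objects are frozen dataclasses, so the new wrapper cannot simply assign obj.__class__ after delegating to PipelineTmaAsync.create; it bypasses the frozen __setattr__ with object.__setattr__ instead, exactly as the inline comment says. A minimal standalone sketch of that reclassing pattern (Base and Derived are illustrative names, not part of the package):

from dataclasses import dataclass, FrozenInstanceError


@dataclass(frozen=True)
class Base:
    num_stages: int

    @staticmethod
    def create(num_stages: int) -> "Base":
        # Stand-in for the parent factory (e.g. PipelineTmaAsync.create).
        return Base(num_stages)


@dataclass(frozen=True)
class Derived(Base):
    @staticmethod
    def create(num_stages: int) -> "Derived":
        obj = Base.create(num_stages)
        try:
            obj.__class__ = Derived  # rejected: frozen dataclass __setattr__ raises
        except FrozenInstanceError:
            # Bypass the frozen check; safe because Derived adds no fields.
            object.__setattr__(obj, "__class__", Derived)
        return obj


pipe = Derived.create(num_stages=4)
assert type(pipe) is Derived and pipe.num_stages == 4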
quack/reduce.py
CHANGED

@@ -196,9 +196,9 @@ def online_softmax_reduce(
         )
         cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
         num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
-        max_x_single_warp = cute.
+        max_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
         max_x_single_warp.fill(-Float32.inf)
-        sum_exp_x_single_warp = cute.
+        sum_exp_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
         sum_exp_x_single_warp.fill(0.0)
         for i in cutlass.range_constexpr(num_iter):
             idx = lane_idx + i * cute.arch.WARP_SIZE
quack/rmsnorm.py
CHANGED

@@ -686,9 +686,7 @@ class RMSNormBackward(ReductionBase):

        if const_expr(self.cluster_n > 1):
            # Need this fence since the STAS from the producer is using the async proxy.
-            cute.arch.
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
+            cute.arch.fence_view_async_shared()
            # It's faster to have 1 lane per warp to signal the mbar, rather than all lanes
            # Requires adjusting the thread_count when initializing the mbar
            cute.arch.sync_warp()
quack/sm90_utils.py
CHANGED

@@ -102,7 +102,7 @@ def gemm_zero_init(
             tiled_mma, shape[::-1], tCrB, tCrA, B_idx, A_idx, wg_wait, swap_AB=False
         )
     else:
-        acc = cute.
+        acc = cute.make_rmem_tensor(tiled_mma.partition_shape_C(shape), Float32)
         rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
         rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
         gemm(tiled_mma, acc, rA, rB, zero_init=True, wg_wait=wg_wait)
@@ -137,7 +137,7 @@ def partition_fragment_ABC(
 ):
     is_rs = thr_mma.op.a_src == warpgroup.OperandSource.RMEM
     if const_expr(not swap_AB):
-        acc = cute.
+        acc = cute.make_rmem_tensor(thr_mma.partition_shape_C(shape_mnk[:2]), Float32)
         if const_expr(not is_rs):
             assert sA is not None
             tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA))
@@ -146,7 +146,9 @@ def partition_fragment_ABC(
             assert sB is not None
             tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB))
     else:
-        acc = cute.
+        acc = cute.make_rmem_tensor(
+            thr_mma.partition_shape_C((shape_mnk[1], shape_mnk[0])), Float32
+        )
         if const_expr(not is_rs):
             assert sB is not None
             tCrB = thr_mma.make_fragment_A(thr_mma.partition_A(sB))
quack/sort/bitonic_sort.py
CHANGED

@@ -108,12 +108,12 @@ def bitonic_topk(
     n = cute.size(arr.shape)
     assert k == 1 << int(math.log2(k)), "k must be a power of 2"
     assert n % k == 0, "n must be divisible by k"
-    topk_vals = cute.
+    topk_vals = cute.make_rmem_tensor(k, arr.element_type)
     for v in cutlass.range(k, unroll_full=True):
         topk_vals[v] = arr[v]
     bitonic_sort(topk_vals, ascending=ascending)
     for i in cutlass.range(1, n // k, unroll_full=True):
-        other_vals = cute.
+        other_vals = cute.make_rmem_tensor(k, arr.element_type)
         for v in cutlass.range(k, unroll_full=True):
             other_vals[v] = arr[i * k + v]
         bitonic_sort(other_vals, ascending=ascending)
@@ -122,7 +122,7 @@ def bitonic_topk(
     # TODO: this is not efficient for large k (e.g. >= 16) since threads in the same warps
     # do duplicate work.
     for i in cutlass.range(int(math.log2(warp_width)), unroll_full=True):
-        other_vals = cute.
+        other_vals = cute.make_rmem_tensor(k, arr.element_type)
         for v in cutlass.range(k, unroll_full=True):
             other_vals[v] = cute.arch.shuffle_sync_bfly(topk_vals[v], offset=1 << i)
         bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)