quack-kernels 0.2.2-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.4.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/top_level.txt +0 -0
quack/tile_scheduler.py CHANGED
@@ -6,7 +6,7 @@ from enum import IntEnum
 
 import cutlass
 import cutlass.cute as cute
-from cutlass import Int32, Boolean, const_expr
+from cutlass import Int32, Float32, Boolean, const_expr
 
 import quack.utils as utils
 from quack.fast_math import FastDivmod
@@ -287,6 +287,7 @@ class TileScheduler:
     ):
         tidx = cute.arch.thread_idx()[0]
         bidx = cute.arch.block_idx()[0]
+        bidz = cute.arch.block_idx()[2]
         params = self.params
         if const_expr(params.is_persistent):
             num_persistent_clusters = cute.arch.grid_dim()[2]
@@ -300,7 +301,7 @@ class TileScheduler:
                 self._scheduler_pipeline.producer_acquire(self._pipeline_state)
                 lane_idx = cute.arch.lane_idx()
                 if lane_idx < cute.size(params.cluster_shape_mn):
-                    # cute.printf("Producer bidx = {}, tidx = {}, after empty wait, idx = {}", bidx, tidx, current_work_linear_idx)
+                    # cute.printf("Producer bidx = {}, bidz = {}, tidx = {}, after empty wait, idx = {}", bidx, bidz, tidx, current_work_linear_idx)
                     if const_expr(cute.size(params.cluster_shape_mn) == 1):
                         self._tile_count[self._pipeline_state.index] = current_work_linear_idx
                         self._scheduler_pipeline.producer_commit(self._pipeline_state)
@@ -318,18 +319,25 @@ class TileScheduler:
                             mbar_ptr=mbar_ptr,
                             peer_cta_rank_in_cluster=peer_cta_rank_in_cluster,
                         )
-                    # cute.printf("Producer bidx = {}, tidx = {}, after full arrive", bidx, tidx)
+                    # cute.printf("Producer bidx = {}, bidz = {}, tidx = {}, after full arrive", bidx, bidz, tidx)
             else:
-                # if tidx % 64 == 0: cute.printf("bidx = {},tidx = {}, before full wait, idx = {}", bidx, tidx, current_work_linear_idx)
+                # if tidx % 32 == 0: cute.printf("bidx = {}, bidz = {}, tidx = {}, before full wait, idx = {}", bidx, bidz, tidx, current_work_linear_idx)
                 self._scheduler_pipeline.consumer_wait(self._pipeline_state)
-                # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after full wait, idx = {}", bidx, tidx, current_work_linear_idx)
+                # if tidx % 32 == 0: cute.printf("bidx = {}, bidz = {}, tidx = {}, after full wait, idx = {}", bidx, bidz, tidx, current_work_linear_idx)
                 current_work_linear_idx = self._tile_count[self._pipeline_state.index]
-                # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after smem read, idx = {}", bidx, tidx, current_work_linear_idx)
+                # if tidx % 32 == 0: cute.printf("bidx = {}, bidz = {}, tidx = {}, after smem read, idx = {}", bidx, bidz, tidx, current_work_linear_idx)
+                # Need this fence since the STAS from the producer is using the async proxy.
+                # Without this, we get race condition / deadlock.
+                if const_expr(cute.size(params.cluster_shape_mn) > 1):
+                    cute.arch.fence_proxy(
+                        cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
+                    )
                 cute.arch.sync_warp()
                 with cute.arch.elect_one():
-                    # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, before empty arrive", bidx, tidx)
+                    # if tidx % 32 == 0: cute.printf("bidx = {}, bidz = {}, tidx = {}, before empty arrive", bidx, bidz, tidx)
                     self._scheduler_pipeline.consumer_release(self._pipeline_state)
-                    # if tidx % 64 == 0: cute.printf("bidx = {}, tidx = {}, after empty arrive", bidx, tidx)
+                    # if tidx == 320: cute.printf("bidx = {}, bidz = {}, tidx = {}, idx = {}, after empty arrive", bidx, bidz, tidx, current_work_linear_idx)
+                    # if tidx == 320: cute.printf("bidx = {}, bidz = {}, tidx = {}, idx = {}, after empty arrive", bidx, bidz, tidx, current_work_linear_idx)
             self._current_work_linear_idx = current_work_linear_idx
             self._pipeline_state.advance()
             self.num_tiles_executed += Int32(advance_count)
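
The substantive change in this hunk is the new fence_proxy call on the consumer path. Per the in-code comment, the producer publishes the next tile index into shared memory with STAS (an async-proxy store), so in clusters larger than one CTA the consumer must fence the async proxy before releasing the pipeline slot; without it the exchange can race or deadlock. A minimal sketch of the same ordering, using only the cute.arch calls that appear above (pipeline, smem_slot, state, and cluster_shape_mn are hypothetical stand-ins for the scheduler's members):

# Consumer side of a tile-index handoff published via the async proxy.
pipeline.consumer_wait(state)                 # wait until the slot is marked full
value = smem_slot[state.index]                # read the index the producer stored
if const_expr(cute.size(cluster_shape_mn) > 1):
    # Producer wrote the slot with STAS (async proxy); fence before the
    # slot is recycled, mirroring the diff above.
    cute.arch.fence_proxy(
        cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
    )
cute.arch.sync_warp()                         # keep the warp converged
with cute.arch.elect_one():
    pipeline.consumer_release(state)          # mark the slot empty for the producer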
@@ -377,7 +385,7 @@ def triangular_idx_to_coord(idx: Int32) -> Tuple[Int32, Int32]:
     Convert a triangular index to 2D coordinates.
     This is used to convert the linear index to 2D coordinates for triangular matrices.
     """
-    row = utils.ceil((cute.math.sqrt(2 * idx + 2.25, fastmath=True) - 0.5)) - 1
+    row = utils.ceil((utils.sqrt(2 * idx + 2.25) - 0.5)) - 1
     col = idx - (row * (row + 1)) // 2
     return row, col
 
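This hunk swaps cute.math.sqrt(..., fastmath=True) for a utils.sqrt helper; the closed form itself is unchanged. It inverts the row-major enumeration of a lower triangle, (0,0), (1,0), (1,1), (2,0), ..., where idx = row*(row+1)/2 + col with 0 <= col <= row. A standalone pure-Python check of the formula (not part of the package):

import math

def triangular_idx_to_coord(idx: int) -> tuple[int, int]:
    # Same closed form as the kernel code: invert idx = row*(row+1)//2 + col.
    row = math.ceil(math.sqrt(2 * idx + 2.25) - 0.5) - 1
    col = idx - (row * (row + 1)) // 2
    return row, col

# Verify against direct enumeration of the lower triangle.
idx = 0
for row in range(64):
    for col in range(row + 1):
        assert triangular_idx_to_coord(idx) == (row, col)
        idx += 1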
@@ -389,7 +397,7 @@ class TriangularTileScheduler(TileScheduler):
     class Params(ParamsBase):
         problem_shape_ncluster_mnl: cute.Shape
         num_clusters_per_problem_divmod: FastDivmod
-        group_size_inv_f32: cutlass.Float32
+        group_size_inv_f32: Float32
         num_groups_regular: Int32
         group_size_divmod: FastDivmod
         group_size_tail_divmod: FastDivmod
@@ -420,7 +428,7 @@ class TriangularTileScheduler(TileScheduler):
         return TriangularTileScheduler.Params(
             problem_shape_ncluster_mnl,
             FastDivmod.create(num_clusters_per_problem),
-            cutlass.Float32(1.0 / group_size),
+            Float32(1.0 / group_size),
             num_groups_regular,
             FastDivmod.create(group_size),
             # Don't divide by 0
@@ -511,8 +519,7 @@ class TriangularTileScheduler(TileScheduler):
         group_size = params.group_size_divmod.divisor
         group_id = (
             utils.ceil(
-                (cute.math.sqrt(2 * cluster_id_in_problem + 2.25, fastmath=True) - 0.5)
-                * params.group_size_inv_f32
+                (utils.sqrt(2 * cluster_id_in_problem + 2.25) - 0.5) * params.group_size_inv_f32
             )
             - 1
         )
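This is the same inverse-triangular expression, applied per group of rows: the divide by group_size is folded into the precomputed reciprocal group_size_inv_f32 (built as Float32(1.0 / group_size) in the previous hunk), so the per-tile path multiplies instead of divides. A direct, hypothetical pure-Python transcription of the expression:

import math

def group_id_of(cluster_id_in_problem: int, group_size: int) -> int:
    group_size_inv = 1.0 / group_size  # precomputed once, like group_size_inv_f32
    return math.ceil((math.sqrt(2 * cluster_id_in_problem + 2.25) - 0.5) * group_size_inv) - 1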
@@ -871,19 +878,19 @@ class VarlenMTileScheduler(TileScheduler):
         return cutlass.utils.WorkTileInfo(tile_coord_mnkl, is_valid)
 
     @cute.jit
-    def fetch_next_work(self, is_scheduler_warp: bool | Boolean, *, loc=None, ip=None):
+    def fetch_next_work(self, is_scheduler_warp: bool | Boolean = False, *, loc=None, ip=None):
         """is_scheduler_warp should only be true for one warp in the whole cluster"""
         if const_expr(self.params.tile_count_semaphore is not None):
             params = self.params
             current_work_linear_idx = self._current_work_linear_idx
             if is_scheduler_warp:
                 if cute.arch.lane_idx() == 0:
-                    # cute.printf("before atomicadd, tidx = {}, idx = {}", cute.arch.thread_idx()[0], current_work_linear_idx)
+                    # cute.printf("before atomicadd, tidx = {}, bidz = {}, idx = {}", cute.arch.thread_idx()[0], cute.arch.block_idx()[2], current_work_linear_idx)
                     num_persistent_clusters = cute.arch.grid_dim()[2]
                     current_work_linear_idx = num_persistent_clusters + utils.atomic_add_i32(
                         1, params.tile_count_semaphore
                     )
-                    # cute.printf("after atomicadd, tidx = {}, idx = {}", cute.arch.thread_idx()[0], current_work_linear_idx)
+                    # cute.printf("after atomicadd, tidx = {}, bidz = {}, idx = {}", cute.arch.thread_idx()[0], cute.arch.block_idx()[2], current_work_linear_idx)
                 # lane 0 already has the right tile_idx, just need to broadcast
                 current_work_linear_idx = cute.arch.shuffle_sync(current_work_linear_idx, 0)
                 self._current_work_linear_idx = current_work_linear_idx
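
fetch_next_work is the dynamic half of the persistent scheduler: the first num_persistent_clusters tile indices are assigned statically, one per cluster, and every later index comes from a single global counter that lane 0 bumps with atomic_add_i32 and then broadcasts across the warp with shuffle_sync. A hypothetical host-side analogue of the counter logic, using only the Python standard library:

import itertools
import threading

class TileCounter:
    """Stand-in for the device-side tile_count_semaphore."""

    def __init__(self) -> None:
        self._next = itertools.count()
        self._lock = threading.Lock()

    def atomic_add(self) -> int:
        # Plays the role of atomic_add_i32(1, tile_count_semaphore).
        with self._lock:
            return next(self._next)

def fetch_next_work(counter: TileCounter, num_persistent_clusters: int) -> int:
    # Indices [0, num_persistent_clusters) are the statically assigned first
    # wave, so dynamically fetched work starts right after them.
    return num_persistent_clusters + counter.atomic_add()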