quack-kernels 0.1.3__tar.gz → 0.1.4__tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (21)
  1. {quack_kernels-0.1.3/quack_kernels.egg-info → quack_kernels-0.1.4}/PKG-INFO +2 -2
  2. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/README.md +5 -1
  3. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/pyproject.toml +1 -1
  4. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/__init__.py +1 -1
  5. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/cross_entropy.py +12 -9
  6. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/reduction_base.py +2 -2
  7. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/rmsnorm.py +13 -12
  8. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/softmax.py +25 -17
  9. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack/utils.py +15 -12
  10. {quack_kernels-0.1.3 → quack_kernels-0.1.4/quack_kernels.egg-info}/PKG-INFO +2 -2
  11. quack_kernels-0.1.4/quack_kernels.egg-info/requires.txt +6 -0
  12. quack_kernels-0.1.3/quack_kernels.egg-info/requires.txt +0 -6
  13. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/LICENSE +0 -0
  14. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack_kernels.egg-info/SOURCES.txt +0 -0
  15. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack_kernels.egg-info/dependency_links.txt +0 -0
  16. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/quack_kernels.egg-info/top_level.txt +0 -0
  17. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/setup.cfg +0 -0
  18. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/setup.py +0 -0
  19. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/tests/test_cross_entropy.py +0 -0
  20. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/tests/test_rmsnorm.py +0 -0
  21. {quack_kernels-0.1.3 → quack_kernels-0.1.4}/tests/test_softmax.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quack-kernels
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE
6
- Requires-Dist: nvidia-cutlass-dsl==4.0.0
6
+ Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0
7
7
  Requires-Dist: torch
8
8
  Provides-Extra: dev
9
9
  Requires-Dist: pre-commit; extra == "dev"
@@ -20,6 +20,10 @@ pip install quack-kernels
20
20
  - 🦆 Softmax forward and backward
21
21
  - 🦆 Cross entropy forward
22
22
 
23
+ Upcoming:
24
+ - 🦆 Cross entropy backward
25
+ - 🦆 RMSNorm backward
26
+ - 🦆 Rotary forward + backward
23
27
 
24
28
  ## Usage
25
29
 
@@ -32,6 +36,6 @@ from quack import rmsnorm, softmax, cross_entropy
32
36
  To set up the development environment:
33
37
 
34
38
  ```bash
35
- pip install -e .[dev]
39
+ pip install -e '.[dev]'
36
40
  pre-commit install
37
41
  ```
@@ -7,7 +7,7 @@ name = "quack-kernels"
7
7
  dynamic = ["version"]
8
8
  requires-python = ">=3.9"
9
9
  dependencies = [
10
- "nvidia-cutlass-dsl==4.0.0",
10
+ "nvidia-cutlass-dsl==4.1.0.dev0",
11
11
  "torch",
12
12
  ]
13
13
 
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.1.4"
2
2
 
3
3
  from quack.rmsnorm import rmsnorm
4
4
  from quack.softmax import softmax
@@ -77,7 +77,7 @@ class CrossEntropy(ReductionBase):
77
77
  self.kernel(mX, mTarget, mLoss, mLSE, tv_layout, tiler_mn).launch(
78
78
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
79
79
  block=[num_threads, 1, 1],
80
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
80
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
81
81
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
82
82
  stream=stream,
83
83
  )
@@ -93,15 +93,16 @@ class CrossEntropy(ReductionBase):
93
93
  tiler_mn: cute.Shape,
94
94
  ):
95
95
  tidx, _, _ = cute.arch.thread_idx()
96
- bidx, cluster_y, _ = cute.arch.block_idx()
96
+ bidx, _, _ = cute.arch.block_idx()
97
+ if cutlass.const_expr(self.cluster_n > 1):
98
+ cluster_y = cute.arch.block_idx()[1]
99
+ else:
100
+ cluster_y = cutlass.const_expr(0)
97
101
 
98
102
  shape: cute.Shape = mX.shape
99
103
  idX = cute.make_identity_tensor(shape)
100
104
  # slice for CTAs
101
- gX, cX = [
102
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
103
- for mT in (mX, idX)
104
- ]
105
+ gX, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, idX)]
105
106
 
106
107
  smem = cutlass.utils.SmemAllocator()
107
108
  sX = smem.allocate_tensor(
@@ -131,7 +132,9 @@ class CrossEntropy(ReductionBase):
131
132
 
132
133
  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
133
134
  tXpX = (
134
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
135
+ utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
136
+ if cutlass.const_expr(not is_even_N)
137
+ else None
135
138
  )
136
139
  if row < shape[0]:
137
140
  cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
@@ -154,7 +157,7 @@ class CrossEntropy(ReductionBase):
154
157
  cute.ReductionOp.MAX,
155
158
  threads_per_row,
156
159
  reduction_buffer[None, None, 0],
157
- mbar_ptr + 0 if self.cluster_n > 1 else None,
160
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
158
161
  init_val=-cutlass.Float32.inf,
159
162
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
160
163
  )
@@ -172,7 +175,7 @@ class CrossEntropy(ReductionBase):
172
175
  cute.ReductionOp.ADD,
173
176
  threads_per_row,
174
177
  reduction_buffer[None, None, 1],
175
- mbar_ptr + 1 if self.cluster_n > 1 else None,
178
+ mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
176
179
  init_val=0.0,
177
180
  )
178
181
  else:
@@ -88,10 +88,10 @@ class ReductionBase:
88
88
  def _initialize_cluster(self, tidx: cutlass.Int32, mbar_ptr: cute.Pointer, num_warps: int):
89
89
  if cutlass.const_expr(self.cluster_n > 1):
90
90
  if tidx < self.stage:
91
- cute.arch.mbarrier_init_arrive_cnt(mbar_ptr + tidx, 1)
91
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
92
92
  cute.arch.mbarrier_init_fence()
93
93
  if tidx < self.stage:
94
- cute.arch.mbarrier_init_tx_bytes(
94
+ cute.arch.mbarrier_arrive_and_expect_tx(
95
95
  mbar_ptr + tidx, num_warps * self.cluster_n * self.reduction_dtype.width // 8
96
96
  )
97
97
  # Cluster arrive after barrier init
@@ -84,7 +84,7 @@ class RMSNorm(ReductionBase):
84
84
  self.kernel(mX, mW, mO, mRstd, eps, tv_layout, tiler_mn, self.reload_from).launch(
85
85
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
86
86
  block=[num_threads, 1, 1],
87
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
87
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
88
88
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
89
89
  stream=stream,
90
90
  )
@@ -103,7 +103,11 @@ class RMSNorm(ReductionBase):
103
103
  delay_w_load: cutlass.Constexpr = False,
104
104
  ):
105
105
  tidx, _, _ = cute.arch.thread_idx()
106
- bidx, cluster_y, _ = cute.arch.block_idx()
106
+ bidx, _, _ = cute.arch.block_idx()
107
+ if cutlass.const_expr(self.cluster_n > 1):
108
+ cluster_y = cute.arch.block_idx()[1]
109
+ else:
110
+ cluster_y = cutlass.const_expr(0)
107
111
 
108
112
  smem = cutlass.utils.SmemAllocator()
109
113
  sX = smem.allocate_tensor(
@@ -114,13 +118,10 @@ class RMSNorm(ReductionBase):
114
118
  shape = mX.shape
115
119
  idX = cute.make_identity_tensor(shape)
116
120
  # slice for CTAs
117
- gX, gO, cX = [
118
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
119
- for mT in (mX, mO, idX)
120
- ]
121
- gW = cute.local_tile(mW, tiler_mn, (0, 0 if self.cluster_n == 1 else cluster_y))
121
+ gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
122
+ gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
122
123
  gRstd = (
123
- cute.local_tile(mRstd, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
124
+ cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
124
125
  if cutlass.const_expr(mRstd is not None)
125
126
  else None
126
127
  )
@@ -167,7 +168,7 @@ class RMSNorm(ReductionBase):
167
168
  cute.arch.cp_async_commit_group()
168
169
 
169
170
  tWpW = utils.predicate_k(thr_copy_W.partition_S(cX), limit=shape[1])
170
- if not delay_w_load:
171
+ if cutlass.const_expr(not delay_w_load):
171
172
  cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
172
173
 
173
174
  cute.arch.cp_async_wait_group(0)
@@ -192,12 +193,12 @@ class RMSNorm(ReductionBase):
192
193
  and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
193
194
  ):
194
195
  tXrRstd[0] = rstd
195
- if delay_w_load:
196
+ if cutlass.const_expr(delay_w_load):
196
197
  cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
197
- if reload_from == "smem":
198
+ if cutlass.const_expr(reload_from == "smem"):
198
199
  cute.autovec_copy(tXsX, tXrX)
199
200
  x = tXrX.load().to(cute.Float32)
200
- elif reload_from == "gmem":
201
+ elif cutlass.const_expr(reload_from == "gmem"):
201
202
  cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
202
203
  x = tXrX.load().to(cute.Float32)
203
204
  x_hat = x * rstd
@@ -75,7 +75,7 @@ class Softmax(ReductionBase):
75
75
  self.kernel(mX, mO, tv_layout, tiler_mn).launch(
76
76
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
77
77
  block=[num_threads, 1, 1],
78
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
78
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
79
79
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
80
80
  stream=stream,
81
81
  )
@@ -89,15 +89,16 @@ class Softmax(ReductionBase):
89
89
  tiler_mn: cute.Shape,
90
90
  ):
91
91
  tidx, _, _ = cute.arch.thread_idx()
92
- bidx, cluster_y, _ = cute.arch.block_idx()
92
+ bidx, _, _ = cute.arch.block_idx()
93
+ if cutlass.const_expr(self.cluster_n > 1):
94
+ cluster_y = cute.arch.block_idx()[1]
95
+ else:
96
+ cluster_y = cutlass.const_expr(0)
93
97
 
94
98
  shape = mX.shape
95
99
  idX = cute.make_identity_tensor(shape)
96
100
  # slice for CTAs
97
- gX, gO, cX = [
98
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
99
- for mT in (mX, mO, idX)
100
- ]
101
+ gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]
101
102
 
102
103
  smem = cutlass.utils.SmemAllocator()
103
104
  sX = smem.allocate_tensor(
@@ -129,7 +130,9 @@ class Softmax(ReductionBase):
129
130
 
130
131
  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
131
132
  tXpX = (
132
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
133
+ utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
134
+ if cutlass.const_expr(not is_even_N)
135
+ else None
133
136
  )
134
137
  if tXcX[0][0] < shape[0]:
135
138
  cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
@@ -148,7 +151,7 @@ class Softmax(ReductionBase):
148
151
  cute.ReductionOp.MAX,
149
152
  threads_per_row,
150
153
  reduction_buffer[None, None, 0],
151
- mbar_ptr + 0 if self.cluster_n > 1 else None,
154
+ mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
152
155
  init_val=-cutlass.Float32.inf,
153
156
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
154
157
  )
@@ -159,7 +162,7 @@ class Softmax(ReductionBase):
159
162
  cute.ReductionOp.ADD,
160
163
  threads_per_row,
161
164
  reduction_buffer[None, None, 1],
162
- mbar_ptr + 1 if self.cluster_n > 1 else None,
165
+ mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
163
166
  init_val=0.0,
164
167
  )
165
168
  else:
@@ -174,7 +177,9 @@ class Softmax(ReductionBase):
174
177
  y = exp_x * (1.0 / denom)
175
178
  tXrO.store(y.to(tXrO.element_type))
176
179
  tOpO = (
177
- utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1]) if not is_even_N else None
180
+ utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
181
+ if cutlass.const_expr(not is_even_N)
182
+ else None
178
183
  )
179
184
  if tXcX[0][0] < shape[0]:
180
185
  cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
@@ -283,7 +288,7 @@ class SoftmaxBackward(ReductionBase):
283
288
  self.kernel(mdY, mY, mdX, tv_layout, tiler_mn).launch(
284
289
  grid=[cute.ceil_div(mdY.shape[0], tiler_mn[0]), self.cluster_n, 1],
285
290
  block=[num_threads, 1, 1],
286
- cluster=[1, self.cluster_n, 1] if self.cluster_n > 1 else None,
291
+ cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
287
292
  smem=self._smem_size_in_bytes(tiler_mn, num_warps),
288
293
  stream=stream,
289
294
  )
@@ -298,14 +303,17 @@ class SoftmaxBackward(ReductionBase):
298
303
  tiler_mn: cute.Shape,
299
304
  ):
300
305
  tidx, _, _ = cute.arch.thread_idx()
301
- bidx, cluster_y, _ = cute.arch.block_idx()
306
+ bidx, _, _ = cute.arch.block_idx()
307
+ if cutlass.const_expr(self.cluster_n > 1):
308
+ cluster_y = cute.arch.block_idx()[1]
309
+ else:
310
+ cluster_y = cutlass.const_expr(0)
302
311
 
303
312
  shape = mdY.shape
304
313
  idX = cute.make_identity_tensor(shape)
305
314
  # slice for CTAs
306
315
  gdY, gY, gdX, cX = [
307
- cute.local_tile(mT, tiler_mn, (bidx, 0 if self.cluster_n == 1 else cluster_y))
308
- for mT in (mdY, mY, mdX, idX)
316
+ cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mdY, mY, mdX, idX)
309
317
  ]
310
318
 
311
319
  smem = cutlass.utils.SmemAllocator()
@@ -344,7 +352,7 @@ class SoftmaxBackward(ReductionBase):
344
352
  is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
345
353
  tdYpdY = (
346
354
  utils.predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
347
- if not is_even_N
355
+ if cutlass.const_expr(not is_even_N)
348
356
  else None
349
357
  )
350
358
 
@@ -366,7 +374,7 @@ class SoftmaxBackward(ReductionBase):
366
374
  cute.ReductionOp.ADD,
367
375
  threads_per_row,
368
376
  reduction_buffer[None, None, 0],
369
- mbar_ptr if self.cluster_n > 1 else None,
377
+ mbar_ptr if cutlass.const_expr(self.cluster_n > 1) else None,
370
378
  init_val=0.0,
371
379
  hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
372
380
  )
@@ -376,7 +384,7 @@ class SoftmaxBackward(ReductionBase):
376
384
  tdXrdX.store(dx.to(tdXrdX.element_type))
377
385
  tdXpdX = (
378
386
  utils.predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
379
- if not is_even_N
387
+ if cutlass.const_expr(not is_even_N)
380
388
  else None
381
389
  )
382
390
  if tXcX[0][0] < shape[0]:
@@ -37,19 +37,20 @@ def min_constexpr(
37
37
  return a if a < b else b
38
38
 
39
39
 
40
+ @cute.jit
40
41
  def warp_reduce(
41
42
  val: cute.TensorSSA | cute.Numeric,
42
43
  op: Callable,
43
44
  width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
44
45
  ) -> cute.TensorSSA | cute.Numeric:
45
- if isinstance(val, cute.TensorSSA):
46
+ if cutlass.const_expr(isinstance(val, cute.TensorSSA)):
46
47
  res = cute.make_fragment(val.shape, val.dtype)
47
48
  res.store(val)
48
- for i in range(cute.size(val.shape)):
49
+ for i in cutlass.range_constexpr(cute.size(val.shape)):
49
50
  res[i] = warp_reduce(res[i], op, width)
50
51
  return res.load()
51
52
  else:
52
- for i in range(int(math.log2(width))):
53
+ for i in cutlass.range_constexpr(int(math.log2(width))):
53
54
  val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
54
55
  return val
55
56
 
@@ -111,15 +112,15 @@ def store_shared_remote(
111
112
  remote_mbar_ptr_i32 = set_block_rank(
112
113
  mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
113
114
  ).ir_value()
114
- if isinstance(val, float):
115
+ if cutlass.const_expr(isinstance(val, float)):
115
116
  val = Float32(val)
116
117
  assert isinstance(val, (Float32, cutlass.Int64)), "val must be Float32 or Int64"
117
- suffix = "f32" if isinstance(val, Float32) else "s64"
118
+ suffix = "f32" if cutlass.const_expr(isinstance(val, Float32)) else "s64"
118
119
  llvm.inline_asm(
119
120
  None,
120
121
  [remote_smem_ptr_i32, val.ir_value(loc=loc, ip=ip), remote_mbar_ptr_i32],
121
122
  f"st.async.shared::cluster.mbarrier::complete_tx::bytes.{suffix} [$0], $1, [$2];",
122
- f"r,{'f' if isinstance(val, Float32) else 'l'},r",
123
+ f"r,{'f' if cutlass.const_expr(isinstance(val, Float32)) else 'l'},r",
123
124
  has_side_effects=True,
124
125
  is_align_stack=False,
125
126
  asm_dialect=llvm.AsmDialect.AD_ATT,
@@ -299,6 +300,7 @@ def online_softmax_reduce(
299
300
  return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
300
301
 
301
302
 
303
+ @cute.jit
302
304
  def exp2f(x: cute.TensorSSA | Float32) -> cute.TensorSSA | Float32:
303
305
  """exp2f calculation for both vector and scalar.
304
306
 
@@ -307,10 +309,10 @@ def exp2f(x: cute.TensorSSA | Float32) -> cute.TensorSSA | Float32:
307
309
  :return: exp2 value
308
310
  :rtype: cute.TensorSSA or Float32
309
311
  """
310
- if isinstance(x, cute.TensorSSA):
312
+ if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
311
313
  res = cute.make_fragment(x.shape, Float32)
312
314
  res.store(x)
313
- for i in range(cute.size(x.shape)):
315
+ for i in cutlass.range_constexpr(cute.size(x.shape)):
314
316
  res[i] = cute.arch.exp2(res[i])
315
317
  return res.load()
316
318
  else:
@@ -347,6 +349,7 @@ def rsqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
347
349
  )
348
350
 
349
351
 
352
+ @cute.jit
350
353
  def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
351
354
  # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
352
355
  tApA = cute.make_fragment(
@@ -356,8 +359,8 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
356
359
  ),
357
360
  cutlass.Boolean,
358
361
  )
359
- for rest_v in range(tApA.shape[0]):
360
- for rest_k in range(tApA.shape[2]):
362
+ for rest_v in cutlass.range_constexpr(tApA.shape[0]):
363
+ for rest_k in cutlass.range_constexpr(tApA.shape[2]):
361
364
  tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
362
365
  return tApA
363
366
 
@@ -373,8 +376,8 @@ def fill_oob(tXsX: cute.Tensor, tXpX: cute.Tensor, fill_value: cute.Numeric) ->
373
376
  """
374
377
  tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), 0, 0])
375
378
  tXrX_fill.fill(fill_value)
376
- for rest_v in range(tXpX.shape[0]):
377
- for rest_k in range(tXpX.shape[2]):
379
+ for rest_v in cutlass.range_constexpr(tXpX.shape[0]):
380
+ for rest_k in cutlass.range_constexpr(tXpX.shape[2]):
378
381
  if not tXpX[rest_v, 0, rest_k]:
379
382
  cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
380
383
 
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quack-kernels
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE
6
- Requires-Dist: nvidia-cutlass-dsl==4.0.0
6
+ Requires-Dist: nvidia-cutlass-dsl==4.1.0.dev0
7
7
  Requires-Dist: torch
8
8
  Provides-Extra: dev
9
9
  Requires-Dist: pre-commit; extra == "dev"
@@ -0,0 +1,6 @@
1
+ nvidia-cutlass-dsl==4.1.0.dev0
2
+ torch
3
+
4
+ [dev]
5
+ pre-commit
6
+ ruff
@@ -1,6 +0,0 @@
1
- nvidia-cutlass-dsl==4.0.0
2
- torch
3
-
4
- [dev]
5
- pre-commit
6
- ruff
File without changes
File without changes
File without changes