quack-kernels 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/softmax.py CHANGED
@@ -1,14 +1,20 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
  import math
- import torch
  from typing import Type
+ from functools import partial
+
+ import torch

  import cuda.bindings.driver as cuda

  import cutlass
  import cutlass.cute as cute
- from cutlass.cute.runtime import from_dlpack
+ from cutlass import Int64, Float32, const_expr

  import quack.utils as utils
+ import quack.copy_utils as copy_utils
+ from quack.compile_utils import make_fake_tensor as fake_tensor
  from quack.reduce import row_reduce, online_softmax_reduce
  from quack.reduction_base import ReductionBase
  from quack.cute_dsl_utils import torch2cute_dtype_map
@@ -21,45 +27,28 @@ class Softmax(ReductionBase):
  dtype,
  N,
  stage=2 if not online_softmax else 1,
- reduction_dtype=cutlass.Float32 if not online_softmax else cutlass.Int64,
+ reduction_dtype=Float32 if not online_softmax else Int64,
  )
  self.online_softmax = online_softmax

- def _calculate_threads_per_row(self):
+ def _threads_per_row(self):
  N = self.N
- return (
- 8
- if N <= 64
- else (
- 16
- if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
- )
- )
+ for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (16384, 128)]:
+ if N <= limit:
+ return threads
+ return 256

  def _set_cluster_n(self):
  N = self.N
- if cutlass.const_expr(self.dtype.width == 16):
- cluster_n = (
- 1
- if N <= 16 * 1024
- else (
- 2
- if N <= 32 * 1024
- else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
- )
- )
- else: # fp32
- cluster_n = (
- 1
- if N <= 32 * 1024
- else (
- 2
- if N <= 64 * 1024
- else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
- )
- )
- self.cluster_n = cluster_n
+ if const_expr(self.dtype.width == 16):
+ thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
+ else:
+ thresholds = [(32 * 1024, 1), (64 * 1024, 2), (128 * 1024, 4), (256 * 1024, 8)]
+ for limit, cluster in thresholds:
+ if N <= limit:
+ self.cluster_n = cluster
+ return
+ self.cluster_n = 16

  @cute.jit
  def __call__(
@@ -69,16 +58,16 @@ class Softmax(ReductionBase):
  stream: cuda.CUstream,
  ):
  assert mX.element_type == self.dtype
- assert mO.element_type == self.dtype
  self._set_cluster_n()
- tiler_mn, tv_layout = self._get_tv_layout()
- num_threads = cute.size(tv_layout, mode=[0])
- num_warps = num_threads // cute.arch.WARP_SIZE
- self.kernel(mX, mO, tv_layout, tiler_mn).launch(
+ largest_dtype_width = const_expr(max(t.element_type.width for t in [mX, mO]))
+ tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(
+ vecsize=128 // largest_dtype_width
+ )
+ num_threads = tiled_copy.size
+ self.kernel(mX, mO, tiler_mn, tiled_copy, threads_per_row).launch(
  grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
- smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
  stream=stream,
  )

@@ -87,23 +76,20 @@ class Softmax(ReductionBase):
  self,
  mX: cute.Tensor,
  mO: cute.Tensor,
- tv_layout: cute.Layout,
  tiler_mn: cute.Shape,
+ tiled_copy: cute.TiledCopy,
+ threads_per_row: cutlass.Constexpr[int],
  ):
+ tv_layout = tiled_copy.layout_tv_tiled
+
  tidx, _, _ = cute.arch.thread_idx()
  bidx, _, _ = cute.arch.block_idx()
- if cutlass.const_expr(self.cluster_n > 1):
- cluster_y = cute.arch.block_idx()[1]
- else:
- cluster_y = cutlass.const_expr(0)
+ cluster_y = const_expr(0) if const_expr(self.cluster_n == 1) else cute.arch.block_idx()[1]

  shape = mX.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
- mX, mO = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
- gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
- cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
+ gX, gO, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, mO, idX)]

  smem = cutlass.utils.SmemAllocator()
  sX = smem.allocate_tensor(
@@ -111,52 +97,45 @@
  )
  reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)

- # declare the atoms which will be used later for memory copy
- copy_atom_load_X = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128
- )
- copy_atom_store_O = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), gO.element_type, num_bits_per_copy=128
- )
-
- thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
+ thr_copy_X = tiled_copy.get_slice(tidx)

  tXgX = thr_copy_X.partition_S(gX)
  tXsX = thr_copy_X.partition_D(sX)
- tXgO = thr_copy_O.partition_D(gO)
+ tXgO = thr_copy_X.partition_D(gO)
  tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
-
- # allocate fragments for gmem->rmem
  tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]

- num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
- self._initialize_cluster(tidx, mbar_ptr, num_warps)
-
- is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+ is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
  tXpX = (
- utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1]) if not is_even_N else None
+ None
+ if is_even_N
+ else copy_utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
  )
+ # Each copy will use the same predicate
+ copy = partial(copy_utils.copy, pred=tXpX)
+
+ num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)
+
  if tXcX[0][0] < shape[0]:
- cute.copy(copy_atom_load_X, tXgX, tXsX, pred=tXpX)
+ copy(tXgX, tXsX, is_async=True)
  cute.arch.cp_async_commit_group()
  cute.arch.cp_async_wait_group(0)
  # Fill OOB values with -inf
- if cutlass.const_expr(not is_even_N):
+ if const_expr(not is_even_N):
  utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)

  cute.autovec_copy(tXsX, tXrX)
  x = tXrX.load().to(cute.Float32)
- threads_per_row = tv_layout.shape[0][0]
- if cutlass.const_expr(not self.online_softmax):
+ if const_expr(not self.online_softmax):
  max_x = row_reduce(
  x,
  cute.ReductionOp.MAX,
  threads_per_row,
  reduction_buffer[None, None, 0],
- mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
- init_val=-cutlass.Float32.inf,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
+ init_val=-Float32.inf,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
  )
  log2_e = math.log2(math.e)
  exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
@@ -165,7 +144,7 @@ class Softmax(ReductionBase):
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, 1],
- mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
+ mbar_ptr + 1 if const_expr(self.cluster_n > 1) else None,
  init_val=0.0,
  )
  else:
@@ -174,18 +153,14 @@ class Softmax(ReductionBase):
  threads_per_row,
  reduction_buffer[None, None, 0],
  mbar_ptr,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
  return_exp_x=True,
  )
- y = exp_x * (1.0 / denom)
+ # y = exp_x * (1.0 / denom)
+ y = exp_x * cute.arch.rcp_approx(denom)
  tXrO.store(y.to(tXrO.element_type))
- tOpO = (
- utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
- if cutlass.const_expr(not is_even_N)
- else None
- )
  if tXcX[0][0] < shape[0]:
- cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
+ copy(tXrO, tXgO)


  @torch.library.custom_op("quack::_softmax_fwd", mutates_args={"out"})
@@ -200,21 +175,21 @@ def _softmax_fwd(x: torch.Tensor, out: torch.Tensor) -> None:
  assert x.is_cuda, "Tensor must be on CUDA device"
  assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
  N = x.size(1)
- dtype = torch2cute_dtype_map[x.dtype]
- convert_from_dlpack = lambda tensor: (
- from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
- mode=0, stride_order=(0, 1)
- )
- )
- x_tensor, out_tensor = [convert_from_dlpack(tensor) for tensor in (x, out)]
- current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
- compile_key = (dtype, N)
+ dtype, out_dtype = [torch2cute_dtype_map[t.dtype] for t in [x, out]]
+ compile_key = (dtype, out_dtype, N)
  if compile_key not in _softmax_fwd.compile_cache:
+ batch_sym = cute.sym_int()
+ div = math.gcd(128 // dtype.width, N)
+ x_cute, out_cute = [fake_tensor(dt, (batch_sym, N), div) for dt in [dtype, out_dtype]]
  softmax_op = Softmax(dtype, N)
  _softmax_fwd.compile_cache[compile_key] = cute.compile(
- softmax_op, x_tensor, out_tensor, current_stream
+ softmax_op,
+ x_cute,
+ out_cute,
+ cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+ options="--enable-tvm-ffi",
  )
- _softmax_fwd.compile_cache[compile_key](x_tensor, out_tensor, current_stream)
+ _softmax_fwd.compile_cache[compile_key](x, out)


  _softmax_fwd.compile_cache = {}
@@ -229,55 +204,30 @@ def softmax_fwd(x: torch.Tensor) -> torch.Tensor:
  class SoftmaxBackward(ReductionBase):
  def __init__(self, dtype: Type[cutlass.Numeric], N: int):
  # 1 stage for computing dot product
- super().__init__(dtype, N, stage=1, reduction_dtype=cutlass.Float32)
+ super().__init__(dtype, N, stage=1, reduction_dtype=Float32)

- def _calculate_threads_per_row(self):
+ def _threads_per_row(self):
  N = self.N
- return (
- 8
- if N <= 64
- else (
- 16
- if N <= 128
- else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 8192 else 256)))
- )
- )
+ for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (8192, 128)]:
+ if N <= limit:
+ return threads
+ return 256

  def _set_cluster_n(self):
  N = self.N
- if cutlass.const_expr(self.dtype.width == 16):
- cluster_n = (
- 1
- if N <= 16 * 1024
- else (
- 2
- if N <= 32 * 1024
- else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
- )
- )
- else: # fp32
- cluster_n = (
- 1
- if N <= 16 * 1024
- else (
- 2
- if N <= 32 * 1024
- else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
- )
- )
- self.cluster_n = cluster_n
-
- def _get_num_threads(self):
+ if const_expr(self.dtype.width == 16):
+ thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
+ else:
+ thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
+ for limit, cluster in thresholds:
+ if N <= limit:
+ self.cluster_n = cluster
+ return
+ self.cluster_n = 16
+
+ def _num_threads(self):
  return 128 if self.N <= 8192 else 256

- def _smem_size_in_bytes(self, tiler_mn, num_warps):
- return (
- # Multiply by 2 since we need space for Y and dY
- cute.size_in_bytes(self.dtype, cute.make_layout(tiler_mn)) * 2
- + self.stage * num_warps * self.cluster_n * (self.reduction_dtype.width // 8)
- + self.stage * (cutlass.Int64.width // 8)
- )
-
  @cute.jit
  def __call__(
  self,
@@ -287,17 +237,16 @@ class SoftmaxBackward(ReductionBase):
  stream: cuda.CUstream,
  ):
  assert mdY.element_type == self.dtype
- assert mY.element_type == self.dtype
- assert mdX.element_type == self.dtype
  self._set_cluster_n()
- tiler_mn, tv_layout = self._get_tv_layout()
- num_threads = cute.size(tv_layout, mode=[0])
- num_warps = num_threads // cute.arch.WARP_SIZE
- self.kernel(mdY, mY, mdX, tv_layout, tiler_mn).launch(
+ largest_dtype_width = const_expr(max(t.element_type.width for t in [mdY, mY, mdX]))
+ tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(
+ vecsize=128 // largest_dtype_width
+ )
+ num_threads = tiled_copy.size
+ self.kernel(mdY, mY, mdX, tiler_mn, tiled_copy, threads_per_row).launch(
  grid=[cute.ceil_div(mdY.shape[0], tiler_mn[0]), self.cluster_n, 1],
  block=[num_threads, 1, 1],
- cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
- smem=self._smem_size_in_bytes(tiler_mn, num_warps),
+ cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
  stream=stream,
  )

@@ -307,24 +256,21 @@ class SoftmaxBackward(ReductionBase):
  mdY: cute.Tensor,
  mY: cute.Tensor,
  mdX: cute.Tensor,
- tv_layout: cute.Layout,
  tiler_mn: cute.Shape,
+ tiled_copy: cute.TiledCopy,
+ threads_per_row: cutlass.Constexpr[int],
  ):
  tidx, _, _ = cute.arch.thread_idx()
  bidx, _, _ = cute.arch.block_idx()
- if cutlass.const_expr(self.cluster_n > 1):
- cluster_y = cute.arch.block_idx()[1]
- else:
- cluster_y = cutlass.const_expr(0)
+ cluster_y = const_expr(0) if const_expr(self.cluster_n == 1) else cute.arch.block_idx()[1]
+ tv_layout = tiled_copy.layout_tv_tiled

  shape = mdY.shape
  idX = cute.make_identity_tensor(shape)
  # slice for CTAs
- mdY, mY, mdX = [
- utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mdY, mY, mdX)
+ gdY, gY, gdX, cX = [
+ cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mdY, mY, mdX, idX)
  ]
- gdY, gY, gdX = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mdY, mY, mdX)]
- cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))

  smem = cutlass.utils.SmemAllocator()
  sdY = smem.allocate_tensor(
@@ -335,42 +281,32 @@ class SoftmaxBackward(ReductionBase):
  )
  reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)

- # declare the atoms which will be used later for memory copy
- copy_atom_load = cute.make_copy_atom(
- cute.nvgpu.cpasync.CopyG2SOp(), mdY.element_type, num_bits_per_copy=128
- )
- copy_atom_store = cute.make_copy_atom(
- cute.nvgpu.CopyUniversalOp(), gdX.element_type, num_bits_per_copy=128
- )
+ thr_copy = tiled_copy.get_slice(tidx)

- thr_copy_load = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn).get_slice(tidx)
- thr_copy_store = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn).get_slice(tidx)
-
- tdYgdY = thr_copy_load.partition_S(gdY)
- tdYsdY = thr_copy_load.partition_D(sdY)
- tYgY = thr_copy_load.partition_S(gY)
- tYsY = thr_copy_load.partition_D(sY)
- tdXgdX = thr_copy_store.partition_D(gdX)
- tXcX = thr_copy_load.partition_S(cX)[(0, None), None, None]
-
- # allocate fragments for gmem->rmem
+ tdYgdY = thr_copy.partition_S(gdY)
+ tdYsdY = thr_copy.partition_D(sdY)
+ tYgY = thr_copy.partition_S(gY)
+ tYsY = thr_copy.partition_D(sY)
+ tdXgdX = thr_copy.partition_D(gdX)
+ tXcX = thr_copy.partition_S(cX)[(0, None), None, None]
  tdYrdY, tYrY, tdXrdX = [cute.make_fragment_like(thr) for thr in (tdYgdY, tYgY, tdXgdX)]

- num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
- self._initialize_cluster(tidx, mbar_ptr, num_warps)
-
- is_even_N = cutlass.const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
- tdYpdY = (
- utils.predicate_k(thr_copy_load.partition_S(cX), limit=shape[1])
- if cutlass.const_expr(not is_even_N)
- else None
+ is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+ tXpX = (
+ None if is_even_N else copy_utils.predicate_k(thr_copy.partition_S(cX), limit=shape[1])
  )
+ # Each copy will use the same predicate
+ copy = partial(copy_utils.copy, pred=tXpX)
+
+ num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
+ self._initialize_cluster(tidx, mbar_ptr, num_warps)

  if tXcX[0][0] < shape[0]:
- cute.copy(copy_atom_load, tdYgdY, tdYsdY, pred=tdYpdY)
- cute.copy(copy_atom_load, tYgY, tYsY, pred=tdYpdY)
+ copy(tdYgdY, tdYsdY, is_async=True)
+ copy(tYgY, tYsY, is_async=True)
  cute.arch.cp_async_commit_group()
  cute.arch.cp_async_wait_group(0)
+ # Don't need fill_oob since cp.async will automatically fills OOB elements with zeros

  cute.autovec_copy(tdYsdY, tdYrdY)
  cute.autovec_copy(tYsY, tYrY)
@@ -378,27 +314,21 @@ class SoftmaxBackward(ReductionBase):
  y = tYrY.load().to(cute.Float32)

  # Compute dot product: dot = Σⱼ dy_j × y_j
- threads_per_row = tv_layout.shape[0][0]
  dot = row_reduce(
  dy * y,
  cute.ReductionOp.ADD,
  threads_per_row,
  reduction_buffer[None, None, 0],
- mbar_ptr if cutlass.const_expr(self.cluster_n > 1) else None,
+ mbar_ptr if const_expr(self.cluster_n > 1) else None,
  init_val=0.0,
- hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
+ hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
  )

  # Compute gradient: dx_i = y_i × (dy_i - dot)
  dx = y * (dy - dot)
  tdXrdX.store(dx.to(tdXrdX.element_type))
- tdXpdX = (
- utils.predicate_k(thr_copy_store.partition_S(cX), limit=shape[1])
- if cutlass.const_expr(not is_even_N)
- else None
- )
  if tXcX[0][0] < shape[0]:
- cute.copy(copy_atom_store, tdXrdX, tdXgdX, pred=tdXpdX)
+ copy(tdXrdX, tdXgdX)


  @torch.library.custom_op("quack::_softmax_backward", mutates_args={"dx"})
@@ -418,22 +348,24 @@ def _softmax_backward(dy: torch.Tensor, y: torch.Tensor, dx: torch.Tensor) -> No
  assert y.dtype == dy.dtype, "dy and y must have same dtype"

  N = dy.size(1)
- dtype = torch2cute_dtype_map[dy.dtype]
- convert_from_dlpack = lambda tensor: (
- from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(
- mode=0, stride_order=(0, 1)
- )
- )
- dy_tensor, y_tensor, dx_tensor = [convert_from_dlpack(tensor) for tensor in (dy, y, dx)]
- current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-
- compile_key = (dtype, N)
+ dtype, y_dtype, dx_dtype = [torch2cute_dtype_map[t.dtype] for t in [dy, y, dx]]
+ compile_key = (dtype, y_dtype, dx_dtype, N)
  if compile_key not in _softmax_backward.compile_cache:
+ batch_sym = cute.sym_int()
+ div = math.gcd(128 // dtype.width, N)
+ dy_cute, y_cute, dx_cute = [
+ fake_tensor(dt, (batch_sym, N), div) for dt in [dtype, y_dtype, dx_dtype]
+ ]
  softmax_backward_op = SoftmaxBackward(dtype, N)
  _softmax_backward.compile_cache[compile_key] = cute.compile(
- softmax_backward_op, dy_tensor, y_tensor, dx_tensor, current_stream
+ softmax_backward_op,
+ dy_cute,
+ y_cute,
+ dx_cute,
+ cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+ options="--enable-tvm-ffi",
  )
- _softmax_backward.compile_cache[compile_key](dy_tensor, y_tensor, dx_tensor, current_stream)
+ _softmax_backward.compile_cache[compile_key](dy, y, dx)


  _softmax_backward.compile_cache = {}
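
For reference, softmax_fwd keeps its signature in this diff; what changed is that the first call per (input dtype, output dtype, N) key now compiles the kernel against fake tensors and a TVM-FFI environment stream instead of converting the live tensors through from_dlpack. A minimal usage sketch, assuming the import path quack.softmax and purely illustrative shapes/dtypes (not taken from the package):

import torch
from quack.softmax import softmax_fwd

x = torch.randn(4096, 8192, device="cuda", dtype=torch.bfloat16)
y = softmax_fwd(x)  # row-wise softmax; first call for this (dtype, out_dtype, N) compiles and caches
# Loose sanity check only: the new epilogue uses cute.arch.rcp_approx, an approximate reciprocal
print((y.float() - torch.softmax(x.float(), dim=-1)).abs().max())
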
quack/sort/bitonic_sort.py CHANGED
@@ -5,6 +5,7 @@ from typing import Optional

  import cutlass
  import cutlass.cute as cute
+ from cutlass import Int32, Float32, const_expr

  import quack.utils as utils
  from quack.sort.utils import compare_and_swap
@@ -14,12 +15,14 @@ from quack.sort.sorting_networks import optimal_sort
  @cute.jit
  def bitonic_merge(
  arr: cute.Tensor,
- n: cutlass.Constexpr[int],
- start: cutlass.Constexpr[int],
+ n: Optional[cutlass.Constexpr[int]] = None,
+ start: cutlass.Constexpr[int] = 0,
  ascending: cutlass.Constexpr[bool] = True,
  ) -> None:
  """Merge a bitonic sequence into a sorted sequence using iterative approach."""
- if cutlass.const_expr(n > 1):
+ if const_expr(n is None):
+ n = cute.size(arr.shape)
+ if const_expr(n > 1):
  num_levels = int(math.log2(n))
  assert n == 2**num_levels, "n must be a power of 2"
  # This one must be range_constexpr otherwise it's very slow for n = 128
@@ -48,11 +51,11 @@ def bitonic_sort(
  start: Starting index (default 0)
  ascending: Sort in ascending order (default True)
  """
- if cutlass.const_expr(n is None):
+ if const_expr(n is None):
  n = cute.size(arr.shape)
  assert n <= 128
- if cutlass.const_expr(n > 1):
- if cutlass.const_expr(n in [2, 4, 8, 16, 32, 64]):
+ if const_expr(n > 1):
+ if const_expr(n in [2, 4, 8, 16, 32, 64]):
  optimal_sort(arr, n, start, ascending)
  else: # Fall back to bitonic sort
  assert n % 2 == 0
@@ -73,9 +76,9 @@ def bitonic_topk_merge(
  start1: cutlass.Constexpr[int] = 0,
  ascending: cutlass.Constexpr[bool] = False,
  ) -> None:
- if cutlass.const_expr(k is None):
+ if const_expr(k is None):
  k = cute.size(arr0.shape)
- if cutlass.const_expr(arr0.element_type == cutlass.Float32):
+ if const_expr(arr0.element_type == Float32):
  minmax_fn = utils.fmin if ascending else cute.arch.fmax
  else:
  minmax_fn = min if ascending else max
@@ -101,7 +104,7 @@ def bitonic_topk(
  k: must be power of 2 and <= 128
  ascending: Sort in ascending order (default False)
  """
- assert arr.element_type in [cutlass.Float32, cutlass.Int32]
+ assert arr.element_type in [Float32, Int32]
  n = cute.size(arr.shape)
  assert k == 1 << int(math.log2(k)), "k must be a power of 2"
  assert n % k == 0, "n must be divisible by k"
@@ -109,8 +112,8 @@
  for v in cutlass.range(k, unroll_full=True):
  topk_vals[v] = arr[v]
  bitonic_sort(topk_vals, ascending=ascending)
- other_vals = cute.make_fragment(k, arr.element_type)
  for i in cutlass.range(1, n // k, unroll_full=True):
+ other_vals = cute.make_fragment(k, arr.element_type)
  for v in cutlass.range(k, unroll_full=True):
  other_vals[v] = arr[i * k + v]
  bitonic_sort(other_vals, ascending=ascending)
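
The bitonic_topk hunk above only moves the other_vals fragment allocation inside the per-chunk loop; the chunked top-k structure is unchanged: sort the first k values, then for every further chunk of k, sort it and merge the two sorted runs while keeping the best k. A rough pure-Python stand-in for that structure (an illustration of the idea only, not the CuTe DSL code):

def chunked_topk(values, k, ascending=False):
    # Reference for the shape of bitonic_topk: per-chunk sort followed by a top-k merge.
    assert len(values) % k == 0
    topk = sorted(values[:k], reverse=not ascending)
    for i in range(1, len(values) // k):
        other = sorted(values[i * k:(i + 1) * k], reverse=not ascending)  # fresh buffer each iteration
        topk = sorted(topk + other, reverse=not ascending)[:k]  # the kernel does this step with a min/max network
    return topk
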
quack/sort/utils.py CHANGED
@@ -1,5 +1,5 @@
- import cutlass
  import cutlass.cute as cute
+ from cutlass import Float32, const_expr

  import quack.utils as utils

@@ -9,12 +9,12 @@ def compare_and_swap(
  arr: cute.Tensor, i: int, j: int, ascending: bool = True, use_selection: bool = False
  ) -> None:
  """Compare and swap elements at indices i and j in ascending or descending order."""
- if cutlass.const_expr(use_selection):
+ if const_expr(use_selection):
  a, b = arr[i], arr[j]
  if (a > b) ^ (not ascending):
  arr[i] = b
  arr[j] = a
- # if cutlass.const_expr(ascending):
+ # if const_expr(ascending):
  # if a > b:
  # arr[i] = b
  # arr[j] = a
@@ -23,9 +23,9 @@
  # arr[i] = b
  # arr[j] = a
  else:
- min_fn = min if cutlass.const_expr(arr.element_type != cutlass.Float32) else utils.fmin
- max_fn = max if cutlass.const_expr(arr.element_type != cutlass.Float32) else cute.arch.fmax
- if cutlass.const_expr(ascending):
+ min_fn = min if const_expr(arr.element_type != Float32) else utils.fmin
+ max_fn = max if const_expr(arr.element_type != Float32) else cute.arch.fmax
+ if const_expr(ascending):
  arr[i], arr[j] = min_fn(arr[i], arr[j]), max_fn(arr[i], arr[j])
  else:
  arr[i], arr[j] = max_fn(arr[i], arr[j]), min_fn(arr[i], arr[j])
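
The compare_and_swap changes here are import cleanups only (const_expr and Float32 now imported directly from cutlass). The min/max formulation it keeps using is equivalent to this plain-Python sketch (illustration only, not the DSL code):

def compare_and_swap_ref(arr, i, j, ascending=True):
    # Branch-free idea: after the call, arr[i] <= arr[j] when ascending (reversed otherwise).
    lo, hi = min(arr[i], arr[j]), max(arr[i], arr[j])
    arr[i], arr[j] = (lo, hi) if ascending else (hi, lo)
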