PyPI - quack-kernels - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

quack-kernels 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

quack/__init__.py +7 -1
quack/cross_entropy.py +197 -166
quack/reduction_base.py +98 -0
quack/rmsnorm.py +211 -181
quack/softmax.py +409 -156
quack/utils.py +197 -39
{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/METADATA +4 -1
quack_kernels-0.1.3.dist-info/RECORD +11 -0
quack_kernels-0.1.2.dist-info/RECORD +0 -10
{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/WHEEL +0 -0
{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/top_level.txt +0 -0

quack/utils.py CHANGED Viewed

@@ -2,13 +2,14 @@
 import operator
 import math
-from typing import Type, Callable, Optional
+from typing import Callable, Optional, Tuple
 import cutlass
 import cutlass.cute as cute
+from cutlass import Float32
 from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import nvvm, llvm
+from cutlass._mlir.dialects import llvm, vector
 from cutlass.cute.runtime import from_dlpack
@@ -39,7 +40,7 @@ def min_constexpr(
 def warp_reduce(
     val: cute.TensorSSA | cute.Numeric,
     op: Callable,
-    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE
+    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
 ) -> cute.TensorSSA | cute.Numeric:
     if isinstance(val, cute.TensorSSA):
         res = cute.make_fragment(val.shape, val.dtype)
@@ -54,9 +55,10 @@ def warp_reduce(
 @cute.jit
-def block_reduce(val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, init_val: cute.Numeric = 0.0) -> cute.Numeric:
-    """reduction_buffer has shape (num_warps / warp_per_row, warps_per_row)
-    """
+def block_reduce(
+    val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, init_val: cute.Numeric = 0.0
+) -> cute.Numeric:
+    """reduction_buffer has shape (num_warps / warp_per_row, warps_per_row)"""
     lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
     warps_per_row = cute.size(reduction_buffer.shape[1])
     row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
@@ -75,9 +77,10 @@ def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cut
 @dsl_user_op
-def set_block_rank(smem_ptr: cute.Pointer, peer_cta_rank_in_cluster: cute.Int32, *, loc=None, ip=None) -> cutlass.Int32:
-    """Map the given smem pointer to the address at another CTA rank in the cluster.
-    """
+def set_block_rank(
+    smem_ptr: cute.Pointer, peer_cta_rank_in_cluster: cute.Int32, *, loc=None, ip=None
+) -> cutlass.Int32:
+    """Map the given smem pointer to the address at another CTA rank in the cluster."""
     smem_ptr_i32 = smem_ptr.toint(loc=loc, ip=ip).ir_value()
     return cutlass.Int32(
         llvm.inline_asm(
@@ -94,16 +97,29 @@ def set_block_rank(smem_ptr: cute.Pointer, peer_cta_rank_in_cluster: cute.Int32,
 @dsl_user_op
 def store_shared_remote(
-    val: float | cute.Float32, smem_ptr: cute.Pointer, mbar_ptr: cute.Pointer,
-    peer_cta_rank_in_cluster: cute.typing.Int, *, loc=None, ip=None
+    val: float | Float32 | cutlass.Int64,
+    smem_ptr: cute.Pointer,
+    mbar_ptr: cute.Pointer,
+    peer_cta_rank_in_cluster: cute.typing.Int,
+    *,
+    loc=None,
+    ip=None,
 ) -> None:
-    remote_smem_ptr_i32 = set_block_rank(smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip).ir_value()
-    remote_mbar_ptr_i32 = set_block_rank(mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip).ir_value()
+    remote_smem_ptr_i32 = set_block_rank(
+        smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
+    ).ir_value()
+    remote_mbar_ptr_i32 = set_block_rank(
+        mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
+    ).ir_value()
+    if isinstance(val, float):
+        val = Float32(val)
+    assert isinstance(val, (Float32, cutlass.Int64)), "val must be Float32 or Int64"
+    suffix = "f32" if isinstance(val, Float32) else "s64"
     llvm.inline_asm(
         None,
-        [remote_smem_ptr_i32, cute.Float32(val).ir_value(loc=loc, ip=ip), remote_mbar_ptr_i32],
-        "st.async.shared::cluster.mbarrier::complete_tx::bytes.f32 [$0], $1, [$2];",
-        "r,f,r",
+        [remote_smem_ptr_i32, val.ir_value(loc=loc, ip=ip), remote_mbar_ptr_i32],
+        f"st.async.shared::cluster.mbarrier::complete_tx::bytes.{suffix} [$0], $1, [$2];",
+        f"r,{'f' if isinstance(val, Float32) else 'l'},r",
         has_side_effects=True,
         is_align_stack=False,
         asm_dialect=llvm.AsmDialect.AD_ATT,
@@ -111,17 +127,24 @@ def store_shared_remote(
 @cute.jit
-def cluster_reduce(val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, mbar_ptr: cute.Pointer, init_val: cute.Numeric = 0.0) -> cute.Numeric:
-    """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))
-    """
+def cluster_reduce(
+    val: cute.Numeric,
+    op: Callable,
+    reduction_buffer: cute.Tensor,
+    mbar_ptr: cute.Pointer,
+    init_val: cute.Numeric = 0.0,
+) -> cute.Numeric:
+    """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
     cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
     lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
     warps_per_row, cluster_n = reduction_buffer.shape[1]
     row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
     if lane_idx < cluster_n:
         store_shared_remote(
-            val, elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
-            mbar_ptr, peer_cta_rank_in_cluster=lane_idx
+            val,
+            elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
+            mbar_ptr,
+            peer_cta_rank_in_cluster=lane_idx,
         )
     cute.arch.mbarrier_wait(mbar_ptr, phase=0)
     block_reduce_val = init_val
@@ -134,9 +157,14 @@ def cluster_reduce(val: cute.Numeric, op: Callable, reduction_buffer: cute.Tenso
 @cute.jit
-def block_or_cluster_reduce(val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, mbar_ptr: Optional[cute.Pointer], init_val: cute.Numeric = 0.0) -> cute.Numeric:
-    """Perform either block or cluster reduction based on whether mbar_ptr is provided.
-    """
+def block_or_cluster_reduce(
+    val: cute.Numeric,
+    op: Callable,
+    reduction_buffer: cute.Tensor,
+    mbar_ptr: Optional[cute.Pointer],
+    init_val: cute.Numeric = 0.0,
+) -> cute.Numeric:
+    """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
     if cutlass.const_expr(mbar_ptr is None):
         return block_reduce(val, op, reduction_buffer, init_val=init_val)
     else:
@@ -153,15 +181,14 @@ def row_reduce(
     init_val: cute.Numeric = 0.0,
     hook_fn: Optional[Callable] = None,
 ) -> cute.Numeric:
-    """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))
-    """
+    """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
     if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
         val = x.reduce(op, init_val=init_val, reduction_profile=0)
     else:
         val = x
     warp_op = {
         cute.ReductionOp.ADD: operator.add,
-        cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == cute.Float32) else max,
+        cute.ReductionOp.MAX: cute.arch.fmax if cutlass.const_expr(x.dtype == Float32) else max,
         cute.ReductionOp.MIN: min,
         cute.ReductionOp.MUL: operator.mul,
     }[op]
@@ -174,7 +201,9 @@ def row_reduce(
         hook_fn()
     if cutlass.const_expr(reduction_buffer is not None):
         warps_per_row, cluster_n = reduction_buffer.shape[1]
-        assert cluster_n == 1 or mbar_ptr is not None, "mbar_ptr must be provided for cluster reduction"
+        assert (
+            cluster_n == 1 or mbar_ptr is not None
+        ), "mbar_ptr must be provided for cluster reduction"
         if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
             val = block_or_cluster_reduce(
                 val, warp_op, reduction_buffer, mbar_ptr, init_val=init_val
@@ -182,17 +211,104 @@ def row_reduce(
     return val
-def exp2f(x: cute.TensorSSA | cutlass.Float32) -> cute.TensorSSA | cutlass.Float32:
+@cute.jit
+def online_softmax_reduce(
+    x: cute.TensorSSA,
+    threads_per_row: cutlass.Constexpr[int],
+    reduction_buffer: Optional[cute.Tensor] = None,
+    mbar_ptr: Optional[cute.Pointer] = None,
+    hook_fn: Optional[Callable] = None,
+    return_exp_x: bool = False,
+) -> [Float32, Float32, Optional[cute.TensorSSA]]:
+    assert x.dtype == Float32, "x must be of type Float32"
+    """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n), 2)"""
+    max_x = warp_reduce(
+        x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
+        cute.arch.fmax,
+        width=min_constexpr(threads_per_row, cute.arch.WARP_SIZE),
+    )
+    log2_e = math.log2(math.e)
+    exp_x = exp2f(x * log2_e - (max_x * log2_e))
+    # exp_x = exp2f((x - max_x) * log2_e)
+    sum_exp_x = warp_reduce(
+        exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
+        operator.add,
+        width=min_constexpr(threads_per_row, cute.arch.WARP_SIZE),
+    )
+    if cutlass.const_expr(hook_fn is not None):
+        hook_fn()
+    if cutlass.const_expr(reduction_buffer is not None):
+        warps_per_row, cluster_n = reduction_buffer.shape[1]
+        assert (
+            cluster_n == 1 or mbar_ptr is not None
+        ), "mbar_ptr must be provided for cluster reduction"
+        if cutlass.const_expr(warps_per_row > 1 or cluster_n > 1):
+            assert (
+                reduction_buffer.element_type == cutlass.Int64
+            ), "reduction_buffer must be of type cute.Int64"
+            lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
+            row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
+            if cutlass.const_expr(mbar_ptr is None):
+                if lane_idx == 0:
+                    reduction_buffer[row_idx, col_idx] = f32x2_to_i64(max_x, sum_exp_x)
+                cute.arch.barrier()
+                max_x_single_warp = -Float32.inf
+                sum_exp_x = 0.0
+                if lane_idx < warps_per_row:
+                    max_x_single_warp, sum_exp_x = i64_to_f32x2(reduction_buffer[row_idx, lane_idx])
+                max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
+                sum_exp_x *= exp2f((max_x_single_warp - max_x_final) * log2_e)
+                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
+                if cutlass.const_expr(return_exp_x):
+                    exp_x *= exp2f((max_x - max_x_final) * log2_e)
+                max_x = max_x_final
+            else:
+                cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
+                if lane_idx < cluster_n:
+                    store_shared_remote(
+                        f32x2_to_i64(max_x, sum_exp_x),
+                        elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
+                        mbar_ptr,
+                        peer_cta_rank_in_cluster=lane_idx,
+                    )
+                cute.arch.mbarrier_wait(mbar_ptr, phase=0)
+                num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
+                max_x_single_warp = cute.make_fragment(num_iter, Float32)
+                max_x_single_warp.fill(-Float32.inf)
+                sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
+                sum_exp_x_single_warp.fill(0.0)
+                for i in cutlass.range_constexpr(num_iter):
+                    idx = lane_idx + i * cute.arch.WARP_SIZE
+                    if idx < cute.size(reduction_buffer, mode=[1]):
+                        max_x_single_warp[i], sum_exp_x_single_warp[i] = i64_to_f32x2(
+                            reduction_buffer[row_idx, idx]
+                        )
+                max_x_final = max_x_single_warp.load().reduce(
+                    cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0
+                )
+                max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
+                sum_exp_x = 0.0
+                for i in cutlass.range_constexpr(num_iter):
+                    sum_exp_x += sum_exp_x_single_warp[i] * exp2f(
+                        (max_x_single_warp[i] - max_x_final) * log2_e
+                    )
+                sum_exp_x = warp_reduce(sum_exp_x, operator.add)
+                if cutlass.const_expr(return_exp_x):
+                    exp_x *= exp2f((max_x - max_x_final) * log2_e)
+                max_x = max_x_final
+    return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
+def exp2f(x: cute.TensorSSA | Float32) -> cute.TensorSSA | Float32:
     """exp2f calculation for both vector and scalar.
     :param x: input value
-    :type x: cute.TensorSSA or cutlass.Float32
+    :type x: cute.TensorSSA or Float32
     :return: exp2 value
-    :rtype: cute.TensorSSA or cutlass.Float32
+    :rtype: cute.TensorSSA or Float32
     """
     if isinstance(x, cute.TensorSSA):
-        res = cute.make_fragment(x.shape, cutlass.Float32)
+        res = cute.make_fragment(x.shape, Float32)
         res.store(x)
         for i in range(cute.size(x.shape)):
             res[i] = cute.arch.exp2(res[i])
@@ -202,11 +318,11 @@ def exp2f(x: cute.TensorSSA | cutlass.Float32) -> cute.TensorSSA | cutlass.Float
 @dsl_user_op
-def log2f(a: float | cutlass.Float32, *, loc=None, ip=None) -> cutlass.Float32:
-    return cutlass.Float32(
+def log2f(a: float | Float32, *, loc=None, ip=None) -> Float32:
+    return Float32(
         llvm.inline_asm(
             T.f32(),
-            [cutlass.Float32(a).ir_value(loc=loc, ip=ip)],
+            [Float32(a).ir_value(loc=loc, ip=ip)],
             "lg2.approx.ftz.f32 $0, $1;",
             "=f,f",
             has_side_effects=False,
@@ -217,11 +333,11 @@ def log2f(a: float | cutlass.Float32, *, loc=None, ip=None) -> cutlass.Float32:
 @dsl_user_op
-def rsqrt(a: float | cute.Float32, *, loc=None, ip=None) -> cute.Float32:
-    return cute.Float32(
+def rsqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
+    return Float32(
         llvm.inline_asm(
             T.f32(),
-            [cute.Float32(a).ir_value(loc=loc, ip=ip)],
+            [Float32(a).ir_value(loc=loc, ip=ip)],
             "rsqrt.approx.ftz.f32 $0, $1;",
             "=f,f",
             has_side_effects=False,
@@ -244,3 +360,45 @@ def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
         for rest_k in range(tApA.shape[2]):
             tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
     return tApA
+@cute.jit
+def fill_oob(tXsX: cute.Tensor, tXpX: cute.Tensor, fill_value: cute.Numeric) -> None:
+    """Fill out-of-bounds values in shared memory tensor.
+    Args:
+        tXsX: Shared memory tensor to fill
+        tXpX: Predicate tensor indicating valid elements
+        fill_value: Value to fill OOB locations with
+    """
+    tXrX_fill = cute.make_fragment_like(tXsX[(None, 0), 0, 0])
+    tXrX_fill.fill(fill_value)
+    for rest_v in range(tXpX.shape[0]):
+        for rest_k in range(tXpX.shape[2]):
+            if not tXpX[rest_v, 0, rest_k]:
+                cute.autovec_copy(tXrX_fill, tXsX[(None, rest_v), None, rest_k])
+@dsl_user_op
+def f32x2_to_i64(a: Float32, b: Float32, *, loc=None, ip=None) -> cutlass.Int64:
+    vec_f32x2 = vector.from_elements(
+        T.vector(2, T.f32()), (a.ir_value(), b.ir_value()), loc=loc, ip=ip
+    )
+    vec_i64x1 = vector.bitcast(T.vector(1, T.i64()), vec_f32x2)
+    res = cutlass.Int64(
+        vector.extract(vec_i64x1, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+    )
+    return res
+@dsl_user_op
+def i64_to_f32x2(c: cutlass.Int64, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
+    vec_i64x1 = vector.from_elements(T.vector(1, T.i64()), (c.ir_value(),), loc=loc, ip=ip)
+    vec_f32x2 = vector.bitcast(T.vector(2, T.f32()), vec_i64x1)
+    res0 = Float32(
+        vector.extract(vec_f32x2, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
+    )
+    res1 = Float32(
+        vector.extract(vec_f32x2, dynamic_position=[], static_position=[1], loc=loc, ip=ip)
+    )
+    return res0, res1

{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,8 +1,11 @@
 Metadata-Version: 2.4
 Name: quack-kernels
-Version: 0.1.2
+Version: 0.1.3
 Requires-Python: >=3.9
 License-File: LICENSE
 Requires-Dist: nvidia-cutlass-dsl==4.0.0
 Requires-Dist: torch
+Provides-Extra: dev
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
 Dynamic: license-file

quack_kernels-0.1.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+quack/__init__.py,sha256=aUR7drzgaqmbzw9H_eoFselMUVQVF3BHc9VOzZg5d-Q,203
+quack/cross_entropy.py,sha256=_Xlyifd_YS8LaYxYlZEsuBfsi8zTH4At3i9DDggGCf8,9319
+quack/reduction_base.py,sha256=nrRsXwTpLVQkPp2Gr_FgHRPnifqkMHRodve5ciHzx58,3667
+quack/rmsnorm.py,sha256=YqGTTKHHXYzw3xnnjBRfaN9TDlhG8D_fSI9CHKAU40A,10548
+quack/softmax.py,sha256=mWaUfaY6PBtO1ioYxXxS-yodQmcBNGasWVMUg9G066Y,15938
+quack/utils.py,sha256=1-HMcFTEvGdAtqC3ucQGZ3DLa_PoJQsqwYlKd9bcXO8,15347
+quack_kernels-0.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+quack_kernels-0.1.3.dist-info/METADATA,sha256=DDuEKHLjFx9dFTQV5YtXsnKVFZVoueO7NwhcwOtpw6g,284
+quack_kernels-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+quack_kernels-0.1.3.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+quack_kernels-0.1.3.dist-info/RECORD,,

quack_kernels-0.1.2.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-quack/__init__.py,sha256=Nf01m1CGrOjSkqGJom6P65hSLkckljRMhlkSoqqlO9k,137
-quack/cross_entropy.py,sha256=gdo8sR9KT5TsrShbgAmy-bwRZLu0gTs_ykXBF2RMbFI,8900
-quack/rmsnorm.py,sha256=JhwJSAPDDpB_hV90xU9ymiLU-zu4WScrSHc5JX2JarY,10470
-quack/softmax.py,sha256=C8e8ZNaF5ePJ1NlrWZN1goCcvsx1C60FWlRyuFCcYoM,7737
-quack/utils.py,sha256=PRdu-P7azA_PeHUNdtoy1zyxZwg_QyVrSiVwE1iXaWo,8961
-quack_kernels-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-quack_kernels-0.1.2.dist-info/METADATA,sha256=3WjugLu1IhLlgsg2qUcLBZq1HI4-BIyyJIuQc5Hk-rU,186
-quack_kernels-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-quack_kernels-0.1.2.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
-quack_kernels-0.1.2.dist-info/RECORD,,

{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{quack_kernels-0.1.2.dist-info → quack_kernels-0.1.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

quack-kernels 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

quack-kernels 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl