quack-kernels 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/__init__.py CHANGED
@@ -1,16 +1,9 @@
- __version__ = "0.2.2"
-
- import cutlass.cute as cute
+ __version__ = "0.2.3"

  from quack.rmsnorm import rmsnorm
  from quack.softmax import softmax
  from quack.cross_entropy import cross_entropy

- import quack.cute_dsl_utils
-
- # Patch cute.compile to optionally dump SASS
- cute.compile = quack.cute_dsl_utils.cute_compile_patched
-
  __all__ = [
      "rmsnorm",
      "softmax",
quack/activation.py CHANGED
@@ -3,37 +3,86 @@
  import math
  from typing import Tuple

- import cutlass
  import cutlass.cute as cute
- from cutlass import Float32
- from cutlass.cutlass_dsl import dsl_user_op
+ from cutlass import Float32, Boolean, const_expr
+ from cutlass.cutlass_dsl import T, dsl_user_op
+ from cutlass._mlir.dialects import llvm
+
+ import quack.utils as utils
+
+
+ F32_or_F32x2 = Float32 | Tuple[Float32, Float32]


  @dsl_user_op
- def sigmoid(x: Float32, *, loc=None, ip=None) -> Float32:
-     return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
+ def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
+     return Float32(
+         llvm.inline_asm(
+             T.f32(),
+             [Float32(a).ir_value(loc=loc, ip=ip)],
+             "tanh.approx.f32 $0, $1;",
+             "=f,f",
+             has_side_effects=False,
+             is_align_stack=False,
+             asm_dialect=llvm.AsmDialect.AD_ATT,
+         )
+     )


  @dsl_user_op
- def relu(x: Float32, *, loc=None, ip=None) -> Float32:
-     return cute.arch.fmax(x, Float32(0.0))
+ def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
+     if const_expr(not isinstance(x, tuple)):
+         # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
+         return 0.5 + 0.5 * tanh(0.5 * x)
+     else:
+         x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
+         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
+         return utils.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))


- @cute.jit
  @dsl_user_op
- def drelu(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
-     x_pos = cutlass.Boolean(x > 0)
-     return dout if x_pos else Float32(0.0), cute.arch.fmax(x, Float32(0.0))
+ def dsigmoid_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
+     # return dout * out * (1.0 - out)
+     return dout * (out - out * out)


  @dsl_user_op
- def relu_sq(x: Float32, *, loc=None, ip=None) -> Float32:
-     return cute.arch.fmax(x, Float32(0.0)) * x
+ def relu(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
+     if const_expr(not isinstance(x, tuple)):
+         return cute.arch.fmax(x, Float32(0.0))
+     else:
+         return cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0))


+ @dsl_user_op
  @cute.jit
+ def drelu(
+     x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
+     if const_expr(not isinstance(x, tuple)):
+         x_pos = Boolean(x > 0)
+         return dout if x_pos else Float32(0.0), cute.arch.fmax(x, Float32(0.0))
+     else:
+         x0_pos = Boolean(x[0] > 0)
+         x1_pos = Boolean(x[1] > 0)
+         dx = (dout[0] if x0_pos else Float32(0.0), dout[1] if x1_pos else Float32(0.0))
+         return dx, relu(x)
+
+
+ @dsl_user_op
+ def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
+     if const_expr(not isinstance(x, tuple)):
+         return cute.arch.fmax(x, Float32(0.0)) * x
+     else:
+         relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
+         return utils.mul_packed_f32x2(relu_x, x)
+
+
  @dsl_user_op
- def drelu_sq(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
+ @cute.jit
+ def drelu_sq(
+     x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
      """
      ReLU squared backward pass: computes gradient w.r.t. x and recomputes forward
      Given: relu_sq_out = max(x, 0) * x, and dout = grad w.r.t. relu_sq_out
@@ -41,29 +90,49 @@ def drelu_sq(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[Float32,
      - dx = dout * 2 * x if x > 0, else 0
      - relu_sq_out = max(x, 0) * x
      """
-     x_pos = cutlass.Boolean(x > 0)
-     relu_sq_out = cute.arch.fmax(x, Float32(0.0)) * x
-     # Derivative: d/dx[max(x,0) * x] = 2*x if x > 0, else 0
-     dx = (2.0 * dout * x) if x_pos else Float32(0.0)
-     return dx, relu_sq_out
+     if const_expr(not isinstance(x, tuple)):
+         relu_x = relu(x)
+         relu_sq_out = relu_x * x
+         # Derivative: d/dx[max(x,0) * x] = 2*x if x > 0, else 0
+         dx = 2.0 * (dout * relu_x)
+         return dx, relu_sq_out
+     else:
+         relu_x = relu(x)
+         relu_sq_out = utils.mul_packed_f32x2(relu_x, x)
+         dx = utils.mul_packed_f32x2((2.0, 2.0), utils.mul_packed_f32x2(dout, relu_x))
+         return dx, relu_sq_out


  @dsl_user_op
- def gelu_tanh_approx(x: Float32, *, loc=None, ip=None) -> Float32:
+ def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
      """
      gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
              = 0.5 * x * (1 + tanh(x * (0.797885 + 0.0356774 * x * x)))
      """
      sqrt_2_over_pi = math.sqrt(2 / math.pi)  # ~0.797885
      sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # ~0.0356774
-     return 0.5 * (
-         x
-         * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
-     )
+     if const_expr(not isinstance(x, tuple)):
+         return 0.5 * (
+             x
+             # Currently cute.math.tanh(x, fastmath=True) generates very slow code
+             # * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
+             * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
+         )
+     else:
+         x_sq = utils.mul_packed_f32x2(x, x)
+         x_sq_scaled = utils.fma_packed_f32x2(
+             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
+         )
+         z = utils.mul_packed_f32x2(x, x_sq_scaled)
+         tanh_z = (tanh(z[0]), tanh(z[1]))
+         x_tanh_z = utils.fma_packed_f32x2(tanh_z, x, x)
+         return utils.mul_packed_f32x2((0.5, 0.5), x_tanh_z)


  @dsl_user_op
- def dgelu_tanh_approx(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
+ def dgelu_tanh_approx(
+     x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
      """
      GELU tanh approximation backward pass: computes gradient w.r.t. x and recomputes forward
      Given: gelu_out = 0.5 * x * (1 + tanh(x * (c1 + c2 * x^2))), and dout = grad w.r.t. gelu_out
@@ -78,43 +147,123 @@ def dgelu_tanh_approx(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[
      sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # c2 ~0.0356774
      sqrt_2_over_pi_coeff_3 = 3.0 * sqrt_2_over_pi_coeff  # c3 ~0.01070322

-     # Compute z = x * (c1 + c2 * x^2)
-     x_sq = x * x
-     tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
-     half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
-     gelu_out = x * half_tanh_z_plus_one
+     if const_expr(not isinstance(x, tuple)):
+         # Compute z = x * (c1 + c2 * x^2)
+         x_sq = x * x
+         # tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
+         tanh_z = tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq))
+         half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
+         gelu_out = x * half_tanh_z_plus_one
+
+         # Compute gradient
+         # sech^2(z) = 1 - tanh^2(z)
+         sech2_z = 1 - tanh_z * tanh_z
+         # dz/dx = c1 + 3 * c2 * x^2
+         dz_dx = sqrt_2_over_pi + sqrt_2_over_pi_coeff_3 * x_sq
+         # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
+         dgelu = half_tanh_z_plus_one + x * (0.5 * (sech2_z * dz_dx))
+
+         dx = dout * dgelu
+         return dx, gelu_out
+     else:
+         # Compute z = x * (c1 + c2 * x^2)
+         x_sq = utils.mul_packed_f32x2(x, x)
+         x_sq_scaled = utils.fma_packed_f32x2(
+             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
+         )
+         z = utils.mul_packed_f32x2(x, x_sq_scaled)
+         tanh_z = (tanh(z[0]), tanh(z[1]))
+         half_tanh_z_plus_one = utils.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
+         gelu_out = utils.mul_packed_f32x2(x, half_tanh_z_plus_one)
+
+         # Compute gradient
+         # sech^2(z) = 1 - tanh^2(z)
+         sech2_z = utils.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
+         # dz/dx = c1 + 3 * c2 * x^2
+         dz_dx = utils.fma_packed_f32x2(
+             x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
+         )
+         # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
+         sech2_dz_dx = utils.mul_packed_f32x2(sech2_z, dz_dx)
+         x_sech2_dz_dx = utils.mul_packed_f32x2(x, sech2_dz_dx)
+         dgelu = utils.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
+
+         dx = utils.mul_packed_f32x2(dout, dgelu)
+         return dx, gelu_out

-     # Compute gradient
-     # sech^2(z) = 1 - tanh^2(z)
-     sech2_z = 1 - tanh_z * tanh_z
-     # dz/dx = c1 + 3 * c2 * x^2
-     dz_dx = sqrt_2_over_pi + sqrt_2_over_pi_coeff_3 * x_sq
-     # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
-     dgelu = half_tanh_z_plus_one + x * (0.5 * (sech2_z * dz_dx))

-     dx = dout * dgelu
-     return dx, gelu_out
+ @dsl_user_op
+ @cute.jit
+ def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
+     if const_expr(not isinstance(x, tuple)):
+         use_linear = Boolean(x > 20.0)
+         return (
+             cute.math.log(Float32(cute.math.exp(x, fastmath=True)) + 1.0, fastmath=True)
+             if not use_linear
+             else x
+         )
+     else:
+         log2_e = math.log2(math.e)
+         x_log2e = utils.mul_packed_f32x2(x, (log2_e, log2_e))
+         x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
+         x_exp_p1 = utils.add_packed_f32x2(x_exp, (1.0, 1.0))
+         log_x_exp_p1 = (
+             cute.math.log2(x_exp_p1[0], fastmath=True),
+             cute.math.log2(x_exp_p1[1], fastmath=True),
+         )
+         ln2 = math.log(2.0)
+         softplus_x = utils.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
+         use_linear_0 = Boolean(x[0] > 20.0)
+         use_linear_1 = Boolean(x[1] > 20.0)
+         return (
+             softplus_x[0] if not use_linear_0 else x[0],
+             softplus_x[1] if not use_linear_1 else x[1],
+         )


  @dsl_user_op
- def silu(x: Float32, *, loc=None, ip=None) -> Float32:
+ @cute.jit
+ def dsoftplus_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
+     use_linear = Boolean(out > 20.0)
+     # dx = dout * (1.0 - cute.math.exp(-out, fastmath=True)) if not use_linear else dout
+     dx = dout - dout * cute.math.exp(-out, fastmath=True)
+     return dx if not use_linear else dout
+
+
+ @dsl_user_op
+ def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) -> F32_or_F32x2:
      """
      silu(x) = x * sigmoid(x) = x * (1 + tanh(x / 2)) / 2 = (0.5 * x) * tanh(0.5 * x) + (0.5 * x)
      This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.
      """
-     x_half = 0.5 * x
-     return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
+     if const_expr(not isinstance(x, tuple)):
+         x_half = 0.5 * x if const_expr(not already_halved) else x
+         # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
+         return x_half * tanh(x_half) + x_half
+     else:
+         x_half = utils.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
+         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
+         return utils.fma_packed_f32x2(x_half, tanh_x_half, x_half)


  @dsl_user_op
- def swiglu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
-     return silu(x) * y
+ def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
+     if const_expr(not isinstance(x, tuple)):
+         return silu(x) * y
+     else:
+         return utils.mul_packed_f32x2(silu(x), y)


  @dsl_user_op
  def dswiglu(
-     x: Float32, y: Float32, dout: Float32, *, loc=None, ip=None
- ) -> Tuple[Float32, Float32, Float32]:
+     x: F32_or_F32x2,
+     y: F32_or_F32x2,
+     dout: F32_or_F32x2,
+     *,
+     already_halved: bool = False,
+     loc=None,
+     ip=None,
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
      """
      SwiGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
      Given: swiglu_out = silu(x) * y, and dout = grad w.r.t. swiglu_out
@@ -125,42 +274,77 @@ def dswiglu(
      This has been optimized to use fewer instructions (i.e. we expand things out
      to use FFMA instead of FADD and FMUL).
      """
-     # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
-     # FMUL, MUFU.TANH, then FFMA
-     sigmoid_x = sigmoid(x)
-     silu_x = x * sigmoid_x  # FMUL
-     silu_x_dout = silu_x * dout  # FMUL
-     # d_silu(x) * dout
-     # = sigmoid_x * (1 + x * (1 - sigmoid_x)) * dout
-     # = (sigmoid_x + sigmoid_x * x * (1 - sigmoid_x)) * dout
-     # = (sigmoid_x + silu_x * (1 - sigmoid_x)) * dout
-     # = (sigmoid_x + silu_x - silu_x * sigmoid_x) * dout
-     # = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
-     d_silu_x_dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x_dout  # FFMA, FFMA
-     dx = d_silu_x_dout * y  # FMUL
-     dy = silu_x_dout
-     swiglu_out = silu_x * y  # FMUL
-     # Overall it's 1 MUFU.TANH, 5 FMUL, 3 FFMA
-     return dx, dy, swiglu_out
+     if const_expr(not isinstance(x, tuple)):
+         # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
+         # FMUL, MUFU.TANH, then FFMA
+         if const_expr(not already_halved):
+             sigmoid_x = sigmoid(x)
+             silu_x = x * sigmoid_x  # FMUL
+         else:
+             tanh_x = tanh(x)  # MUFU.TANH
+             sigmoid_x = 0.5 * tanh_x + 0.5  # FFMA
+             silu_x = x * tanh_x + x  # FFMA
+         silu_x_dout = silu_x * dout  # FMUL
+         # d_silu(x) * dout
+         # = sigmoid_x * (1 + x * (1 - sigmoid_x)) * dout
+         # = (sigmoid_x + sigmoid_x * x * (1 - sigmoid_x)) * dout
+         # = (sigmoid_x + silu_x * (1 - sigmoid_x)) * dout
+         # = (sigmoid_x + silu_x - silu_x * sigmoid_x) * dout
+         # = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
+         d_silu_x_dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x_dout  # FFMA, FFMA
+         dx = d_silu_x_dout * y  # FMUL
+         dy = silu_x_dout
+         swiglu_out = silu_x * y  # FMUL
+         # Overall it's 1 MUFU.TANH, 5 FMUL, 3 FFMA
+         return dx, dy, swiglu_out
+     else:
+         # Compute sigmoid(x) and silu(x)
+         if const_expr(not already_halved):
+             sigmoid_x = sigmoid(x)
+             silu_x = utils.mul_packed_f32x2(x, sigmoid_x)
+         else:
+             tanh_x = (tanh(x[0]), tanh(x[1]))
+             sigmoid_x = utils.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
+             silu_x = utils.fma_packed_f32x2(x, tanh_x, x)
+         silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+         # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
+         sigmoid_x_minus_silu_x_sigmoid_x = utils.fma_packed_f32x2(
+             sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
+         )
+         d_silu_x_dout = utils.fma_packed_f32x2(sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout)
+         dx = utils.mul_packed_f32x2(d_silu_x_dout, y)
+         dy = silu_x_dout
+         swiglu_out = utils.mul_packed_f32x2(silu_x, y)
+         return dx, dy, swiglu_out


  @dsl_user_op
- def swiglu_oai(x: Float32, y: Float32, alpha: float = 1.702, *, loc=None, ip=None) -> Float32:
+ def swiglu_oai(
+     x: F32_or_F32x2, y: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
+ ) -> F32_or_F32x2:
      """The swiglu variant used in gpt-oss, which has a scaling factor on x and bias of 1 to y.
      https://github.com/openai/gpt-oss/blob/7be9334950053a888e24887a57dac797a17d6e00/gpt_oss/torch/model.py#L249
      x * sigmoid(alpha * x) * (y + 1)
      Compile down to FMUL, FMUL, TANH, FFMA, FFMA
      """
      # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
-     x_half = 0.5 * x
-     silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
-     return silu_x * y + silu_x
+     if const_expr(not isinstance(x, tuple)):
+         x_half = 0.5 * x
+         # silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
+         silu_x = x_half * tanh(alpha * x_half) + x_half
+         return silu_x * y + silu_x
+     else:
+         x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
+         alpha_x_half = utils.mul_packed_f32x2((alpha, alpha), x_half)
+         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
+         silu_x = utils.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
+         return utils.fma_packed_f32x2(silu_x, y, silu_x)


  @dsl_user_op
  def dswiglu_oai(
-     x: Float32, y: Float32, dout: Float32, alpha: float = 1.702, *, loc=None, ip=None
- ) -> Tuple[Float32, Float32, Float32]:
+     x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
      """
      Swiglu OAI backward pass: computes gradients w.r.t. x and y
      Given: swiglu_oai_out = x * sigmoid(alpha * x) * (y + 1), and dout = grad w.r.t. swiglu_oai_out
@@ -169,35 +353,60 @@ def dswiglu_oai(
      Derivative of x * sigmoid(alpha * x) w.r.t. x:
      d/dx[x * sigmoid(alpha * x)] = sigmoid(alpha * x) + alpha * x * sigmoid(alpha * x) * (1 - sigmoid(alpha * x))
      """
-     # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
-     alpha_x_half = (0.5 * alpha) * x  # FMUL
-     # MUFU.TANH, then FFMA
-     sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
-     silu_x = x * sigmoid_alpha_x  # FMUL
-     silu_x_dout = silu_x * dout  # FMUL
-     # FFMA, FFMA, FMUL
-     d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
-     dx = d_silu_x_dout * y + d_silu_x_dout  # FFMA, instead of multiply by y + 1
-     dy = silu_x_dout
-     swiglu_out = silu_x * y + silu_x  # FFMA, instead of multiply by y + 1
-     # Overall it's 1 MUFU.TANH, 4 FMUL, 5 FFMA
-     return dx, dy, swiglu_out
+     if const_expr(not isinstance(x, tuple)):
+         # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
+         alpha_x_half = (0.5 * alpha) * x  # FMUL
+         # MUFU.TANH, then FFMA
+         # sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
+         sigmoid_alpha_x = 0.5 + 0.5 * tanh(alpha_x_half)
+         silu_x = x * sigmoid_alpha_x  # FMUL
+         silu_x_dout = silu_x * dout  # FMUL
+         # FFMA, FFMA, FMUL
+         d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
+         dx = d_silu_x_dout * y + d_silu_x_dout  # FFMA, instead of multiply by y + 1
+         dy = silu_x_dout
+         swiglu_out = silu_x * y + silu_x  # FFMA, instead of multiply by y + 1
+         # Overall it's 1 MUFU.TANH, 4 FMUL, 5 FFMA
+         return dx, dy, swiglu_out
+     else:
+         # Compute sigmoid(alpha * x)
+         alpha_x_half = utils.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
+         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
+         sigmoid_alpha_x = utils.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
+         silu_x = utils.mul_packed_f32x2(x, sigmoid_alpha_x)
+         silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+         # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
+         silu_x_minus_product = utils.fma_packed_f32x2(
+             silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
+         )
+         sigmoid_plus_alpha_diff = utils.fma_packed_f32x2(
+             (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
+         )
+         d_silu_x_dout = utils.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
+         dx = utils.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
+         dy = silu_x_dout
+         swiglu_out = utils.fma_packed_f32x2(silu_x, y, silu_x)
+         return dx, dy, swiglu_out


  @dsl_user_op
- def glu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
+ def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
      """GLU: Gated Linear Unit
      glu(x, y) = sigmoid(x) * y
      Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
      """
-     sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
-     return sigmoid_x * y  # FMUL
+     if const_expr(not isinstance(x, tuple)):
+         sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
+         return sigmoid_x * y  # FMUL
+     else:
+         sigmoid_x = sigmoid(x)
+         return utils.mul_packed_f32x2(sigmoid_x, y)


  @dsl_user_op
  def dglu(
-     x: Float32, y: Float32, dout: Float32, *, loc=None, ip=None
- ) -> Tuple[Float32, Float32, Float32]:
+     x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
      """
      GLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
      Given: glu_out = sigmoid(x) * y, and dout = grad w.r.t. glu_out
@@ -206,33 +415,47 @@ def dglu(
      - dy = dout * sigmoid(x)
      - glu_out = sigmoid(x) * y
      """
-     # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
-     sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
-     sigmoid_x_dout = sigmoid_x * dout  # FMUL
-     glu_out = sigmoid_x * y  # FMUL
-     # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
-     # = y * (1 - sigmoid(x)) * sigmoid_x_dout
-     # = (y - y * sigmoid(x)) * sigmoid_x_dout
-     # = (y - glu_out) * sigmoid_x_dout
-     dx = (y - glu_out) * sigmoid_x_dout  # FADD, FMUL
-     dy = sigmoid_x_dout
-     # Total: 1 MUFU.TANH, 4 FMUL, 1 FADD, 1 FFMA
-     return dx, dy, glu_out
+     if const_expr(not isinstance(x, tuple)):
+         # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
+         sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
+         sigmoid_x_dout = sigmoid_x * dout  # FMUL
+         glu_out = sigmoid_x * y  # FMUL
+         # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
+         # = y * (1 - sigmoid(x)) * sigmoid_x_dout
+         # = (y - y * sigmoid(x)) * sigmoid_x_dout
+         # = (y - glu_out) * sigmoid_x_dout
+         dx = (y - glu_out) * sigmoid_x_dout  # FADD, FMUL
+         dy = sigmoid_x_dout
+         # Total: 1 MUFU.TANH, 4 FMUL, 1 FADD, 1 FFMA
+         return dx, dy, glu_out
+     else:
+         sigmoid_x = sigmoid(x)
+         sigmoid_x_dout = utils.mul_packed_f32x2(sigmoid_x, dout)
+         glu_out = utils.mul_packed_f32x2(sigmoid_x, y)
+         # dx = (y - glu_out) * sigmoid_x_dout
+         y_minus_glu_out = utils.sub_packed_f32x2(y, glu_out)
+         dx = utils.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
+         dy = sigmoid_x_dout
+         return dx, dy, glu_out


  @dsl_user_op
- def reglu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
+ def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
      """ReGLU: ReLU Gated Linear Unit
      reglu(x, y) = relu(x) * y = max(x, 0) * y
      """
-     return cute.arch.fmax(x, Float32(0.0)) * y
+     if const_expr(not isinstance(x, tuple)):
+         return cute.arch.fmax(x, Float32(0.0)) * y
+     else:
+         relu_x = relu(x)
+         return utils.mul_packed_f32x2(relu_x, y)


- @cute.jit
  @dsl_user_op
+ @cute.jit
  def dreglu(
-     x: Float32, y: Float32, dout: Float32, *, loc=None, ip=None
- ) -> Tuple[Float32, Float32, Float32]:
+     x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
      """
      ReGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
      Given: reglu_out = relu(x) * y, and dout = grad w.r.t. reglu_out
@@ -241,27 +464,40 @@ def dreglu(
      - dy = dout * relu(x)
      - reglu_out = relu(x) * y
      """
-     x_pos = cutlass.Boolean(x > 0)
-     relu_x = cute.arch.fmax(x, Float32(0.0))
-     dx = (dout * y) if x_pos else Float32(0.0)
-     dy = dout * relu_x
-     reglu_out = relu_x * y
-     return dx, dy, reglu_out
+     if const_expr(not isinstance(x, tuple)):
+         x_pos = Boolean(x > 0)
+         relu_x = cute.arch.fmax(x, Float32(0.0))
+         dx = (dout * y) if x_pos else Float32(0.0)
+         dy = dout * relu_x
+         reglu_out = relu_x * y
+         return dx, dy, reglu_out
+     else:
+         x0_pos = Boolean(x[0] > 0)
+         x1_pos = Boolean(x[1] > 0)
+         relu_x = relu(x)
+         dout_y = utils.mul_packed_f32x2(dout, y)
+         dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
+         dy = utils.mul_packed_f32x2(dout, relu_x)
+         reglu_out = utils.mul_packed_f32x2(relu_x, y)
+         return dx, dy, reglu_out


  @dsl_user_op
- def geglu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
+ def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
      """GeGLU: GELU Gated Linear Unit
      geglu(x, y) = gelu(x) * y
      Uses the tanh approximation of GELU
      """
-     return gelu_tanh_approx(x) * y
+     if const_expr(not isinstance(x, tuple)):
+         return gelu_tanh_approx(x) * y
+     else:
+         return utils.mul_packed_f32x2(gelu_tanh_approx(x), y)


  @dsl_user_op
  def dgeglu(
-     x: Float32, y: Float32, dout: Float32, *, loc=None, ip=None
- ) -> Tuple[Float32, Float32, Float32]:
+     x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
+ ) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
      """
      GeGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
      Given: geglu_out = gelu(x) * y, and dout = grad w.r.t. geglu_out
@@ -270,10 +506,19 @@ def dgeglu(
      - dy = dout * gelu(x)
      - geglu_out = gelu(x) * y
      """
-     # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
-     dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
-     # Compute gradients for geglu
-     dx = dgelu_x_dout * y
-     dy = gelu_x * dout
-     geglu_out = gelu_x * y
-     return dx, dy, geglu_out
+     if const_expr(not isinstance(x, tuple)):
+         # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
+         dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
+         # Compute gradients for geglu
+         dx = dgelu_x_dout * y
+         dy = gelu_x * dout
+         geglu_out = gelu_x * y
+         return dx, dy, geglu_out
+     else:
+         # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
+         dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
+         # Compute gradients for geglu
+         dx = utils.mul_packed_f32x2(dgelu_x_dout, y)
+         dy = utils.mul_packed_f32x2(gelu_x, dout)
+         geglu_out = utils.mul_packed_f32x2(gelu_x, y)
+         return dx, dy, geglu_out
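
The new activation code above repeatedly relies on the identities sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x) and silu(x) = (0.5 * x) * tanh(0.5 * x) + (0.5 * x), and on the factored gradient term used in dswiglu. A minimal plain-Python sketch of that math, using only the standard library (ref_sigmoid and ref_silu are hypothetical reference helpers, and math.tanh stands in for the tanh.approx.f32 instruction emitted by the new tanh op):

import math


def ref_sigmoid(x: float) -> float:
    # Reference logistic sigmoid, 1 / (1 + e^-x).
    return 1.0 / (1.0 + math.exp(-x))


def ref_silu(x: float) -> float:
    # Reference SiLU, x * sigmoid(x).
    return x * ref_sigmoid(x)


for x in (-3.0, -0.5, 0.0, 0.7, 4.2):
    # sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x), the form used by sigmoid() above.
    assert math.isclose(0.5 + 0.5 * math.tanh(0.5 * x), ref_sigmoid(x), rel_tol=1e-12)

    # silu(x) = (0.5 * x) * tanh(0.5 * x) + (0.5 * x): one FMUL, one MUFU.TANH, one FFMA.
    x_half = 0.5 * x
    assert math.isclose(x_half * math.tanh(x_half) + x_half, ref_silu(x), rel_tol=1e-12)

    # dswiglu's factored term, d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout
    # + silu_x * dout, checked against a central finite difference of silu.
    dout, eps = 1.3, 1e-5
    s, sl = ref_sigmoid(x), ref_silu(x)
    factored = (s - sl * s) * dout + sl * dout
    finite_diff = (ref_silu(x + eps) - ref_silu(x - eps)) / (2 * eps) * dout
    assert math.isclose(factored, finite_diff, rel_tol=1e-4, abs_tol=1e-6)

print("tanh-based sigmoid/silu identities and dswiglu gradient factoring check out")

The factored form matters because each multiply-then-add collapses to a single FFMA, which is the instruction-count accounting spelled out in the comments of dswiglu and dglu.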