PyPI - quack-kernels - Versions diffs - 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl - Mend

quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

quack/__init__.py +1 -1
quack/activation.py +72 -64
quack/broadcast_utils.py +1 -1
quack/copy_utils.py +14 -18
quack/fast_math.py +29 -76
quack/gemm_act.py +296 -8
quack/gemm_dact.py +520 -4
quack/gemm_default_epi.py +4 -4
quack/gemm_interface.py +363 -0
quack/gemm_sm100.py +62 -88
quack/gemm_sm90.py +68 -114
quack/gemm_symmetric.py +2 -6
quack/layout_utils.py +2 -4
quack/linear.py +37 -0
quack/pipeline.py +59 -89
quack/reduce.py +2 -2
quack/rmsnorm.py +1 -3
quack/sm90_utils.py +5 -3
quack/sort/bitonic_sort.py +3 -3
quack/tile_scheduler.py +310 -256
quack/topk.py +4 -4
quack/utils.py +76 -40
{quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/METADATA +2 -2
quack_kernels-0.2.6.dist-info/RECORD +45 -0
quack_kernels-0.2.5.dist-info/RECORD +0 -45
{quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/WHEEL +0 -0
{quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/licenses/LICENSE +0 -0
{quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/top_level.txt +0 -0

quack/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.2.5"
+__version__ = "0.2.6"
 import os

quack/activation.py CHANGED Viewed

@@ -2,18 +2,24 @@
 import math
 from typing import Tuple
+from functools import partial
 import cutlass.cute as cute
 from cutlass import Float32, Boolean, const_expr
 from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm
-import quack.utils as utils
+from cutlass._mlir.dialects import llvm, nvvm
 F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
+sub_packed_f32x2 = partial(
+    cute.arch.calc_packed_f32x2_op,
+    src_c=None,
+    calc_func=nvvm.sub_packed_f32x2,
+)
 @dsl_user_op
 def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
     return Float32(
@@ -35,9 +41,9 @@ def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
         return 0.5 + 0.5 * tanh(0.5 * x)
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-        return utils.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
+        return cute.arch.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
 @dsl_user_op
@@ -75,7 +81,7 @@ def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         return cute.arch.fmax(x, Float32(0.0)) * x
     else:
         relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
-        return utils.mul_packed_f32x2(relu_x, x)
+        return cute.arch.mul_packed_f32x2(relu_x, x)
 @dsl_user_op
@@ -98,8 +104,8 @@ def drelu_sq(
         return dx, relu_sq_out
     else:
         relu_x = relu(x)
-        relu_sq_out = utils.mul_packed_f32x2(relu_x, x)
-        dx = utils.mul_packed_f32x2((2.0, 2.0), utils.mul_packed_f32x2(dout, relu_x))
+        relu_sq_out = cute.arch.mul_packed_f32x2(relu_x, x)
+        dx = cute.arch.mul_packed_f32x2((2.0, 2.0), cute.arch.mul_packed_f32x2(dout, relu_x))
         return dx, relu_sq_out
@@ -119,14 +125,14 @@ def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
             * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
         )
     else:
-        x_sq = utils.mul_packed_f32x2(x, x)
-        x_sq_scaled = utils.fma_packed_f32x2(
+        x_sq = cute.arch.mul_packed_f32x2(x, x)
+        x_sq_scaled = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
-        z = utils.mul_packed_f32x2(x, x_sq_scaled)
+        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
-        x_tanh_z = utils.fma_packed_f32x2(tanh_z, x, x)
-        return utils.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
+        x_tanh_z = cute.arch.fma_packed_f32x2(tanh_z, x, x)
+        return cute.arch.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
 @dsl_user_op
@@ -167,28 +173,28 @@ def dgelu_tanh_approx(
         return dx, gelu_out
     else:
         # Compute z = x * (c1 + c2 * x^2)
-        x_sq = utils.mul_packed_f32x2(x, x)
-        x_sq_scaled = utils.fma_packed_f32x2(
+        x_sq = cute.arch.mul_packed_f32x2(x, x)
+        x_sq_scaled = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
-        z = utils.mul_packed_f32x2(x, x_sq_scaled)
+        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
-        half_tanh_z_plus_one = utils.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
-        gelu_out = utils.mul_packed_f32x2(x, half_tanh_z_plus_one)
+        half_tanh_z_plus_one = cute.arch.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
+        gelu_out = cute.arch.mul_packed_f32x2(x, half_tanh_z_plus_one)
         # Compute gradient
         # sech^2(z) = 1 - tanh^2(z)
-        sech2_z = utils.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
+        sech2_z = cute.arch.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
         # dz/dx = c1 + 3 * c2 * x^2
-        dz_dx = utils.fma_packed_f32x2(
+        dz_dx = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
         # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
-        sech2_dz_dx = utils.mul_packed_f32x2(sech2_z, dz_dx)
-        x_sech2_dz_dx = utils.mul_packed_f32x2(x, sech2_dz_dx)
-        dgelu = utils.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
+        sech2_dz_dx = cute.arch.mul_packed_f32x2(sech2_z, dz_dx)
+        x_sech2_dz_dx = cute.arch.mul_packed_f32x2(x, sech2_dz_dx)
+        dgelu = cute.arch.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
-        dx = utils.mul_packed_f32x2(dout, dgelu)
+        dx = cute.arch.mul_packed_f32x2(dout, dgelu)
         return dx, gelu_out
@@ -204,15 +210,15 @@ def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         )
     else:
         log2_e = math.log2(math.e)
-        x_log2e = utils.mul_packed_f32x2(x, (log2_e, log2_e))
+        x_log2e = cute.arch.mul_packed_f32x2(x, (log2_e, log2_e))
         x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
-        x_exp_p1 = utils.add_packed_f32x2(x_exp, (1.0, 1.0))
+        x_exp_p1 = cute.arch.add_packed_f32x2(x_exp, (1.0, 1.0))
         log_x_exp_p1 = (
             cute.math.log2(x_exp_p1[0], fastmath=True),
             cute.math.log2(x_exp_p1[1], fastmath=True),
         )
         ln2 = math.log(2.0)
-        softplus_x = utils.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
+        softplus_x = cute.arch.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
         use_linear_0 = Boolean(x[0] > 20.0)
         use_linear_1 = Boolean(x[1] > 20.0)
         return (
@@ -241,9 +247,9 @@ def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) ->
         # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
         return x_half * tanh(x_half) + x_half
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-        return utils.fma_packed_f32x2(x_half, tanh_x_half, x_half)
+        return cute.arch.fma_packed_f32x2(x_half, tanh_x_half, x_half)
 @dsl_user_op
@@ -251,7 +257,7 @@ def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32
     if const_expr(not isinstance(x, tuple)):
         return silu(x) * y
     else:
-        return utils.mul_packed_f32x2(silu(x), y)
+        return cute.arch.mul_packed_f32x2(silu(x), y)
 @dsl_user_op
@@ -301,20 +307,22 @@ def dswiglu(
         # Compute sigmoid(x) and silu(x)
         if const_expr(not already_halved):
             sigmoid_x = sigmoid(x)
-            silu_x = utils.mul_packed_f32x2(x, sigmoid_x)
+            silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_x)
         else:
             tanh_x = (tanh(x[0]), tanh(x[1]))
-            sigmoid_x = utils.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
-            silu_x = utils.fma_packed_f32x2(x, tanh_x, x)
-        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+            sigmoid_x = cute.arch.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
+            silu_x = cute.arch.fma_packed_f32x2(x, tanh_x, x)
+        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
         # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
-        sigmoid_x_minus_silu_x_sigmoid_x = utils.fma_packed_f32x2(
+        sigmoid_x_minus_silu_x_sigmoid_x = cute.arch.fma_packed_f32x2(
             sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
         )
-        d_silu_x_dout = utils.fma_packed_f32x2(sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout)
-        dx = utils.mul_packed_f32x2(d_silu_x_dout, y)
+        d_silu_x_dout = cute.arch.fma_packed_f32x2(
+            sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout
+        )
+        dx = cute.arch.mul_packed_f32x2(d_silu_x_dout, y)
         dy = silu_x_dout
-        swiglu_out = utils.mul_packed_f32x2(silu_x, y)
+        swiglu_out = cute.arch.mul_packed_f32x2(silu_x, y)
         return dx, dy, swiglu_out
@@ -334,11 +342,11 @@ def swiglu_oai(
         silu_x = x_half * tanh(alpha * x_half) + x_half
         return silu_x * y + silu_x
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
-        alpha_x_half = utils.mul_packed_f32x2((alpha, alpha), x_half)
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
+        alpha_x_half = cute.arch.mul_packed_f32x2((alpha, alpha), x_half)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-        silu_x = utils.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
-        return utils.fma_packed_f32x2(silu_x, y, silu_x)
+        silu_x = cute.arch.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
+        return cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
 @dsl_user_op
@@ -370,22 +378,22 @@ def dswiglu_oai(
         return dx, dy, swiglu_out
     else:
         # Compute sigmoid(alpha * x)
-        alpha_x_half = utils.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
+        alpha_x_half = cute.arch.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-        sigmoid_alpha_x = utils.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
-        silu_x = utils.mul_packed_f32x2(x, sigmoid_alpha_x)
-        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+        sigmoid_alpha_x = cute.arch.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
+        silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_alpha_x)
+        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
         # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
-        silu_x_minus_product = utils.fma_packed_f32x2(
+        silu_x_minus_product = cute.arch.fma_packed_f32x2(
             silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
         )
-        sigmoid_plus_alpha_diff = utils.fma_packed_f32x2(
+        sigmoid_plus_alpha_diff = cute.arch.fma_packed_f32x2(
             (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
         )
-        d_silu_x_dout = utils.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
-        dx = utils.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
+        d_silu_x_dout = cute.arch.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
+        dx = cute.arch.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
         dy = silu_x_dout
-        swiglu_out = utils.fma_packed_f32x2(silu_x, y, silu_x)
+        swiglu_out = cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
         return dx, dy, swiglu_out
@@ -400,7 +408,7 @@ def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         return sigmoid_x * y  # FMUL
     else:
         sigmoid_x = sigmoid(x)
-        return utils.mul_packed_f32x2(sigmoid_x, y)
+        return cute.arch.mul_packed_f32x2(sigmoid_x, y)
 @dsl_user_op
@@ -430,11 +438,11 @@ def dglu(
         return dx, dy, glu_out
     else:
         sigmoid_x = sigmoid(x)
-        sigmoid_x_dout = utils.mul_packed_f32x2(sigmoid_x, dout)
-        glu_out = utils.mul_packed_f32x2(sigmoid_x, y)
+        sigmoid_x_dout = cute.arch.mul_packed_f32x2(sigmoid_x, dout)
+        glu_out = cute.arch.mul_packed_f32x2(sigmoid_x, y)
         # dx = (y - glu_out) * sigmoid_x_dout
-        y_minus_glu_out = utils.sub_packed_f32x2(y, glu_out)
-        dx = utils.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
+        y_minus_glu_out = sub_packed_f32x2(y, glu_out)
+        dx = cute.arch.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
         dy = sigmoid_x_dout
         return dx, dy, glu_out
@@ -448,7 +456,7 @@ def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
         return cute.arch.fmax(x, Float32(0.0)) * y
     else:
         relu_x = relu(x)
-        return utils.mul_packed_f32x2(relu_x, y)
+        return cute.arch.mul_packed_f32x2(relu_x, y)
 @dsl_user_op
@@ -475,10 +483,10 @@ def dreglu(
         x0_pos = Boolean(x[0] > 0)
         x1_pos = Boolean(x[1] > 0)
         relu_x = relu(x)
-        dout_y = utils.mul_packed_f32x2(dout, y)
+        dout_y = cute.arch.mul_packed_f32x2(dout, y)
         dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
-        dy = utils.mul_packed_f32x2(dout, relu_x)
-        reglu_out = utils.mul_packed_f32x2(relu_x, y)
+        dy = cute.arch.mul_packed_f32x2(dout, relu_x)
+        reglu_out = cute.arch.mul_packed_f32x2(relu_x, y)
         return dx, dy, reglu_out
@@ -491,7 +499,7 @@ def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
     if const_expr(not isinstance(x, tuple)):
         return gelu_tanh_approx(x) * y
     else:
-        return utils.mul_packed_f32x2(gelu_tanh_approx(x), y)
+        return cute.arch.mul_packed_f32x2(gelu_tanh_approx(x), y)
 @dsl_user_op
@@ -518,7 +526,7 @@ def dgeglu(
         # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
         dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
         # Compute gradients for geglu
-        dx = utils.mul_packed_f32x2(dgelu_x_dout, y)
-        dy = utils.mul_packed_f32x2(gelu_x, dout)
-        geglu_out = utils.mul_packed_f32x2(gelu_x, y)
+        dx = cute.arch.mul_packed_f32x2(dgelu_x_dout, y)
+        dy = cute.arch.mul_packed_f32x2(gelu_x, dout)
+        geglu_out = cute.arch.mul_packed_f32x2(gelu_x, y)
         return dx, dy, geglu_out

quack/broadcast_utils.py CHANGED Viewed

@@ -11,7 +11,7 @@ from quack.layout_utils import make_acc_tensor_mn_view
 @cute.jit
 def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
     if const_expr(tCrC.element_type != Float32):  # Convert to f32
-        tCrC_f32 = cute.make_fragment(tCrC.shape, Float32)
+        tCrC_f32 = cute.make_rmem_tensor(tCrC.shape, Float32)
         tCrC_f32.store(tCrC.load().to(Float32))
     else:
         tCrC_f32 = tCrC

quack/copy_utils.py CHANGED Viewed

@@ -7,7 +7,7 @@ import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, Boolean, const_expr
-from cutlass.cute.nvgpu import cpasync, warpgroup
+from cutlass.cute.nvgpu import cpasync, warp, warpgroup
 from cutlass.cutlass_dsl import dsl_user_op
 import cutlass.pipeline
@@ -52,7 +52,7 @@ def load_s2r_retile(
 ) -> cute.Tensor:
     # Will also accept dst_shape being a tensor, in which case we write into that tensor
     if const_expr(not isinstance(dst_shape, cute.Tensor)):
-        dst = cute.make_fragment(dst_shape, src.element_type, loc=loc, ip=ip)
+        dst = cute.make_rmem_tensor(dst_shape, src.element_type, loc=loc, ip=ip)
     else:
         dst = dst_shape
     cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
@@ -117,7 +117,7 @@ def tiled_copy_2d(
 @cute.jit
 def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
     # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
-    tApA = cute.make_fragment(
+    tApA = cute.make_rmem_tensor(
         cute.make_layout(
             (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
             stride=(cute.size(tAcA, mode=[2]), 0, 1),
@@ -242,9 +242,7 @@ def sm90_get_smem_load_op(
         raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
     is_m_major = layout_c.is_m_major_c()
     if elem_ty_c.width == 16:
-        return cute.make_copy_atom(
-            cute.nvgpu.warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip
-        )
+        return cute.make_copy_atom(warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip)
     else:
         return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
@@ -260,7 +258,7 @@ def get_smem_store_atom(
         )
     else:
         return cute.make_copy_atom(
-            cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
+            warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
             element_type,
         )
@@ -276,7 +274,7 @@ def get_smem_load_atom(
         )
     else:
         return cute.make_copy_atom(
-            cute.nvgpu.warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
+            warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
             element_type,
         )
@@ -368,8 +366,6 @@ def get_smem_load_A(
         tSR_sA = thr_copy.partition_S(sA)
     else:
         tSR_sA = partition_S_position_independent(thr_copy, sA)
-    copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
-    thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
     tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])
     def copy_fn(src_idx: Int32, **new_kwargs):
@@ -464,10 +460,10 @@ def gather_m_get_copy_fn(
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
-    m_idx = cute.make_fragment(rows_per_thread, Int32)
+    m_idx = cute.make_rmem_tensor(rows_per_thread, Int32)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         row_idx = tAcA[0, m, 0][0]
         if tApA_m[m]:
@@ -480,7 +476,7 @@ def gather_m_get_copy_fn(
     def copy_fn(src_idx, dst_idx, pred: bool = False):
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
@@ -538,7 +534,7 @@ def gather_k_get_copy_fn(
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
     threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
@@ -554,12 +550,12 @@ def gather_k_get_copy_fn(
         # Prefetch mAIdx early, even before smem is free
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         gAIdx_cur = gAIdx[None, src_idx]
-        k_idx = cute.make_fragment(cols_per_thread, Int32)
+        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             if const_expr(not pred):
@@ -576,13 +572,13 @@ def gather_k_get_copy_fn(
     ) -> Tuple[cute.Tensor, cute.Tensor]:
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
         sAIdx_cur = sAIdx[None, dst_idx]
-        k_idx = cute.make_fragment(cols_per_thread, Int32)
+        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             k_idx[k] = sAIdx_cur[col_idx]

quack/fast_math.py CHANGED Viewed

@@ -1,80 +1,33 @@
 # Copyright (c) 2025, Tri Dao.
-from typing import Tuple
-from dataclasses import dataclass
 import cutlass
 import cutlass.cute as cute
-from cutlass import Int32, Uint32
-from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm
-from quack.cute_dsl_utils import ParamsBase
-@cute.jit
-def clz(x: Int32) -> Int32:
-    # for i in cutlass.range_constexpr(32):
-    #     if (1 << (31 - i)) & x:
-    #         return Int32(i)
-    # return Int32(32)
-    # Early exit is not supported yet
-    res = Int32(32)
-    done = False
-    for i in cutlass.range(32):
-        if ((1 << (31 - i)) & x) and not done:
-            res = Int32(i)
-            done = True
-    return res
-def find_log2(x: Int32) -> Int32:
-    a: Int32 = Int32(31 - clz(x))
-    return a + ((x & (x - 1)) != 0)  # Round up, add 1 if not a power of 2.
-@dsl_user_op
-def umulhi(a: Int32, b: Int32, *, loc=None, ip=None) -> Uint32:
-    return Uint32(
-        llvm.inline_asm(
-            T.i32(),
-            [Int32(a).ir_value(loc=loc, ip=ip), Int32(b).ir_value(loc=loc, ip=ip)],
-            "mul.hi.u32 $0, $1, $2;",
-            "=r,r,r",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
-@dataclass
-class FastDivmod(ParamsBase):
-    divisor: Int32
-    multiplier: Uint32
-    shift_right: Uint32
-    # called by host
-    @staticmethod
-    def create(divisor: Int32) -> "FastDivmod":
-        """Construct the FastDivmod object, in host code.
-        This precomputes some values based on the divisor and is computationally expensive.
-        """
-        p = Uint32(31 + find_log2(divisor))
-        divisor_u32 = Uint32(divisor)
-        multiplier = Uint32(((cutlass.Uint64(1) << p) + divisor_u32 - 1) // divisor_u32)
-        shift_right = Uint32(p - 32)
-        return FastDivmod(divisor, multiplier, shift_right)
-    @cute.jit
-    def div(self, dividend: Int32) -> Int32:
-        return (
-            Int32(umulhi(dividend, self.multiplier) >> self.shift_right)
-            if self.divisor != 1
-            else dividend
-        )
-    def divmod(self, dividend: Int32) -> Tuple[Int32, Int32]:
-        quotient = self.div(dividend)
-        remainder = dividend - quotient * self.divisor
-        return quotient, remainder
+from cutlass.base_dsl.typing import Integer
+from cutlass.cutlass_dsl import dsl_user_op
+class FastDivmod(cute.FastDivmodDivisor):
+    """We store the divisor along with the FastDivmodDivisor."""
+    @dsl_user_op
+    def __init__(
+        self,
+        divisor: Integer,
+        is_power_of_2: bool = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        super().__init__(divisor, is_power_of_2=is_power_of_2, loc=loc, ip=ip)
+        self.divisor = divisor
+    def __extract_mlir_values__(self):
+        """Extract MLIR values for Host->Device transfer."""
+        return [self._divisor] + cutlass.extract_mlir_values(self.divisor)
+    def __new_from_mlir_values__(self, values):
+        """Reconstruct FastDivmodDivisor from MLIR values."""
+        new_obj = object.__new__(FastDivmod)
+        new_obj._divisor = values[0]
+        new_obj.divisor = cutlass.new_from_mlir_values(self.divisor, values[1:])
+        return new_obj

quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl