quack-kernels 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {quack_kernels-0.2.4/quack_kernels.egg-info → quack_kernels-0.2.6}/PKG-INFO +2 -2
  2. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/pyproject.toml +3 -2
  3. quack_kernels-0.2.6/quack/__init__.py +21 -0
  4. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/activation.py +72 -64
  5. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/broadcast_utils.py +1 -1
  6. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/copy_utils.py +143 -20
  7. quack_kernels-0.2.6/quack/cute_dsl_ptxas.py +151 -0
  8. quack_kernels-0.2.6/quack/fast_math.py +33 -0
  9. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_act.py +296 -8
  10. quack_kernels-0.2.6/quack/gemm_dact.py +731 -0
  11. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_default_epi.py +4 -4
  12. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_interface.py +363 -0
  13. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_sm100.py +62 -88
  14. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_sm90.py +68 -114
  15. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_symmetric.py +2 -6
  16. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/layout_utils.py +10 -4
  17. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/linear.py +37 -0
  18. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/pipeline.py +87 -99
  19. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/reduce.py +2 -2
  20. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/rmsnorm.py +1 -3
  21. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sm90_utils.py +34 -2
  22. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sort/bitonic_sort.py +4 -4
  23. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/tile_scheduler.py +310 -256
  24. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/topk.py +4 -4
  25. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/utils.py +76 -40
  26. {quack_kernels-0.2.4 → quack_kernels-0.2.6/quack_kernels.egg-info}/PKG-INFO +2 -2
  27. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack_kernels.egg-info/SOURCES.txt +1 -0
  28. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack_kernels.egg-info/requires.txt +1 -1
  29. quack_kernels-0.2.6/quack_kernels.egg-info/top_level.txt +1 -0
  30. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_linear.py +93 -0
  31. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_linear_varlen_m.py +163 -0
  32. quack_kernels-0.2.4/quack/__init__.py +0 -11
  33. quack_kernels-0.2.4/quack/fast_math.py +0 -80
  34. quack_kernels-0.2.4/quack/gemm_dact.py +0 -215
  35. quack_kernels-0.2.4/quack_kernels.egg-info/top_level.txt +0 -5
  36. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/LICENSE +0 -0
  37. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/README.md +0 -0
  38. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/autotuner.py +0 -0
  39. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/compile_utils.py +0 -0
  40. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/cross_entropy.py +0 -0
  41. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/cute_dsl_utils.py +0 -0
  42. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm.py +0 -0
  43. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_config.py +0 -0
  44. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/gemm_wrapper_utils.py +0 -0
  45. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/linear_cross_entropy.py +0 -0
  46. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/mlp.py +0 -0
  47. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/reduction_base.py +0 -0
  48. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sm100_utils.py +0 -0
  49. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/softmax.py +0 -0
  50. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sort/generate_sorting_networks.py +0 -0
  51. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sort/sorting_networks.py +0 -0
  52. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/sort/utils.py +0 -0
  53. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/tensormap_manager.py +0 -0
  54. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack/varlen_utils.py +0 -0
  55. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/quack_kernels.egg-info/dependency_links.txt +0 -0
  56. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/setup.cfg +0 -0
  57. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_cross_entropy.py +0 -0
  58. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_layernorm.py +0 -0
  59. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_linear_cross_entropy.py +0 -0
  60. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_linear_varlen_k.py +0 -0
  61. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_rmsnorm.py +0 -0
  62. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_softmax.py +0 -0
  63. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_symmetric_gemm.py +0 -0
  64. {quack_kernels-0.2.4 → quack_kernels-0.2.6}/tests/test_topk.py +0 -0
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.2.4
+ Version: 0.2.6
  Requires-Python: >=3.10
  License-File: LICENSE
- Requires-Dist: nvidia-cutlass-dsl<4.4.0,>=4.3.4
+ Requires-Dist: nvidia-cutlass-dsl>=4.4.0.dev1
  Requires-Dist: torch
  Requires-Dist: apache-tvm-ffi<0.2,>=0.1.6
  Requires-Dist: torch-c-dlpack-ext
@@ -7,7 +7,7 @@ name = "quack-kernels"
  dynamic = ["version"]
  requires-python = ">=3.10"
  dependencies = [
-     "nvidia-cutlass-dsl>=4.3.4,<4.4.0",
+     "nvidia-cutlass-dsl>=4.4.0.dev1",
      "torch",
      "apache-tvm-ffi>=0.1.6,<0.2",
      "torch-c-dlpack-ext",
@@ -20,7 +20,8 @@ dev = [
  ]
  
  [tool.setuptools.packages.find]
- exclude = ["tests", "benchmarks"]
+ where = ["."]
+ include = ["quack*"]
  
  [tool.setuptools.dynamic]
  version = {attr = "quack.__version__"}
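
Note on the loosened dependency pin: a PEP 440 specifier that itself names a dev release admits pre-release versions, so ">=4.4.0.dev1" should let pip resolve the 4.4.0 dev builds for this requirement without a global --pre flag. A quick check with the packaging library (assumed installed):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # A specifier that mentions a dev release implicitly allows pre-releases.
    spec = SpecifierSet(">=4.4.0.dev1")
    assert Version("4.4.0.dev1") in spec
    assert Version("4.4.0") in spec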
@@ -0,0 +1,21 @@
+ __version__ = "0.2.6"
+ 
+ import os
+ 
+ from quack.rmsnorm import rmsnorm
+ from quack.softmax import softmax
+ from quack.cross_entropy import cross_entropy
+ 
+ 
+ if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
+     import quack.cute_dsl_ptxas  # noqa: F401
+ 
+     # Patch to dump ptx and then use system ptxas to compile to cubin
+     quack.cute_dsl_ptxas.patch()
+ 
+ 
+ __all__ = [
+     "rmsnorm",
+     "softmax",
+     "cross_entropy",
+ ]
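
A minimal usage sketch for the new import-time hook; the ptxas path below is illustrative, not from the package:

    import os

    # Must be set before importing quack, since the patch is applied at import time.
    os.environ["CUTE_DSL_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

    import quack  # applies quack.cute_dsl_ptxas.patch() because the variable is set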
@@ -2,18 +2,24 @@
  
  import math
  from typing import Tuple
+ from functools import partial
  
  import cutlass.cute as cute
  from cutlass import Float32, Boolean, const_expr
  from cutlass.cutlass_dsl import T, dsl_user_op
- from cutlass._mlir.dialects import llvm
- 
- import quack.utils as utils
+ from cutlass._mlir.dialects import llvm, nvvm
  
  
  F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
  
  
+ sub_packed_f32x2 = partial(
+     cute.arch.calc_packed_f32x2_op,
+     src_c=None,
+     calc_func=nvvm.sub_packed_f32x2,
+ )
+ 
+ 
  @dsl_user_op
  def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
      return Float32(
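
This release routes all two-lane math through the cute.arch packed-f32x2 helpers, with sub_packed_f32x2 assembled from nvvm above. A minimal sketch of the scalar-vs-packed dispatch pattern the activations below all follow; scale_by_half is a hypothetical example, not part of the diff:

    @dsl_user_op
    def scale_by_half(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
        if const_expr(not isinstance(x, tuple)):
            return 0.5 * x  # scalar path: a single FMUL
        else:
            # packed path: one f32x2 instruction covering both lanes
            return cute.arch.mul_packed_f32x2((0.5, 0.5), x)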
@@ -35,9 +41,9 @@ def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
          # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
          return 0.5 + 0.5 * tanh(0.5 * x)
      else:
-         x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
+         x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
          tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-         return utils.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
+         return cute.arch.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
  
  
  @dsl_user_op
  @dsl_user_op
@@ -75,7 +81,7 @@ def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
75
81
  return cute.arch.fmax(x, Float32(0.0)) * x
76
82
  else:
77
83
  relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
78
- return utils.mul_packed_f32x2(relu_x, x)
84
+ return cute.arch.mul_packed_f32x2(relu_x, x)
79
85
 
80
86
 
81
87
  @dsl_user_op
@@ -98,8 +104,8 @@ def drelu_sq(
          return dx, relu_sq_out
      else:
          relu_x = relu(x)
-         relu_sq_out = utils.mul_packed_f32x2(relu_x, x)
-         dx = utils.mul_packed_f32x2((2.0, 2.0), utils.mul_packed_f32x2(dout, relu_x))
+         relu_sq_out = cute.arch.mul_packed_f32x2(relu_x, x)
+         dx = cute.arch.mul_packed_f32x2((2.0, 2.0), cute.arch.mul_packed_f32x2(dout, relu_x))
          return dx, relu_sq_out
  
  
@@ -119,14 +125,14 @@ def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
              * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
          )
      else:
-         x_sq = utils.mul_packed_f32x2(x, x)
-         x_sq_scaled = utils.fma_packed_f32x2(
+         x_sq = cute.arch.mul_packed_f32x2(x, x)
+         x_sq_scaled = cute.arch.fma_packed_f32x2(
              x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
          )
-         z = utils.mul_packed_f32x2(x, x_sq_scaled)
+         z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
          tanh_z = (tanh(z[0]), tanh(z[1]))
-         x_tanh_z = utils.fma_packed_f32x2(tanh_z, x, x)
-         return utils.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
+         x_tanh_z = cute.arch.fma_packed_f32x2(tanh_z, x, x)
+         return cute.arch.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
  
  
  @dsl_user_op
@@ -167,28 +173,28 @@ def dgelu_tanh_approx(
          return dx, gelu_out
      else:
          # Compute z = x * (c1 + c2 * x^2)
-         x_sq = utils.mul_packed_f32x2(x, x)
-         x_sq_scaled = utils.fma_packed_f32x2(
+         x_sq = cute.arch.mul_packed_f32x2(x, x)
+         x_sq_scaled = cute.arch.fma_packed_f32x2(
              x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
          )
-         z = utils.mul_packed_f32x2(x, x_sq_scaled)
+         z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
          tanh_z = (tanh(z[0]), tanh(z[1]))
-         half_tanh_z_plus_one = utils.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
-         gelu_out = utils.mul_packed_f32x2(x, half_tanh_z_plus_one)
+         half_tanh_z_plus_one = cute.arch.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
+         gelu_out = cute.arch.mul_packed_f32x2(x, half_tanh_z_plus_one)
  
          # Compute gradient
          # sech^2(z) = 1 - tanh^2(z)
-         sech2_z = utils.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
+         sech2_z = cute.arch.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
          # dz/dx = c1 + 3 * c2 * x^2
-         dz_dx = utils.fma_packed_f32x2(
+         dz_dx = cute.arch.fma_packed_f32x2(
              x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
          )
          # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
-         sech2_dz_dx = utils.mul_packed_f32x2(sech2_z, dz_dx)
-         x_sech2_dz_dx = utils.mul_packed_f32x2(x, sech2_dz_dx)
-         dgelu = utils.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
+         sech2_dz_dx = cute.arch.mul_packed_f32x2(sech2_z, dz_dx)
+         x_sech2_dz_dx = cute.arch.mul_packed_f32x2(x, sech2_dz_dx)
+         dgelu = cute.arch.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
  
-         dx = utils.mul_packed_f32x2(dout, dgelu)
+         dx = cute.arch.mul_packed_f32x2(dout, dgelu)
          return dx, gelu_out
  
  
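
Restating the gradient from the comments above as one formula, with c1 = sqrt(2/pi) and c2 = 0.044715 * c1:

    \mathrm{gelu}(x) \approx \tfrac{1}{2}\, x\, (1 + \tanh z), \qquad z = x\,(c_1 + c_2 x^2)
    \frac{d}{dx}\,\mathrm{gelu}(x) = \tfrac{1}{2}(1 + \tanh z) + \tfrac{1}{2}\, x\, \mathrm{sech}^2(z)\,(c_1 + 3 c_2 x^2), \qquad \mathrm{sech}^2(z) = 1 - \tanh^2 z

which is exactly half_tanh_z_plus_one plus 0.5 * x * sech2_z * dz_dx in the code.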
@@ -204,15 +210,15 @@ def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
          )
      else:
          log2_e = math.log2(math.e)
-         x_log2e = utils.mul_packed_f32x2(x, (log2_e, log2_e))
+         x_log2e = cute.arch.mul_packed_f32x2(x, (log2_e, log2_e))
          x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
-         x_exp_p1 = utils.add_packed_f32x2(x_exp, (1.0, 1.0))
+         x_exp_p1 = cute.arch.add_packed_f32x2(x_exp, (1.0, 1.0))
          log_x_exp_p1 = (
              cute.math.log2(x_exp_p1[0], fastmath=True),
              cute.math.log2(x_exp_p1[1], fastmath=True),
          )
          ln2 = math.log(2.0)
-         softplus_x = utils.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
+         softplus_x = cute.arch.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
          use_linear_0 = Boolean(x[0] > 20.0)
          use_linear_1 = Boolean(x[1] > 20.0)
          return (
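
The base-2 reformulation used above is the identity

    \mathrm{softplus}(x) = \ln(1 + e^{x}) = \ln 2 \cdot \log_2\!\left(1 + 2^{\,x \log_2 e}\right)

with the use_linear branches falling back to softplus(x) ≈ x for x > 20, where e^x dominates the added 1.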
@@ -241,9 +247,9 @@ def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) ->
          # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
          return x_half * tanh(x_half) + x_half
      else:
-         x_half = utils.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
+         x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
          tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-         return utils.fma_packed_f32x2(x_half, tanh_x_half, x_half)
+         return cute.arch.fma_packed_f32x2(x_half, tanh_x_half, x_half)
  
  
  @dsl_user_op
@@ -251,7 +257,7 @@ def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32
      if const_expr(not isinstance(x, tuple)):
          return silu(x) * y
      else:
-         return utils.mul_packed_f32x2(silu(x), y)
+         return cute.arch.mul_packed_f32x2(silu(x), y)
  
  
  @dsl_user_op
@@ -301,20 +307,22 @@ def dswiglu(
          # Compute sigmoid(x) and silu(x)
          if const_expr(not already_halved):
              sigmoid_x = sigmoid(x)
-             silu_x = utils.mul_packed_f32x2(x, sigmoid_x)
+             silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_x)
          else:
              tanh_x = (tanh(x[0]), tanh(x[1]))
-             sigmoid_x = utils.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
-             silu_x = utils.fma_packed_f32x2(x, tanh_x, x)
-         silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+             sigmoid_x = cute.arch.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
+             silu_x = cute.arch.fma_packed_f32x2(x, tanh_x, x)
+         silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
          # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
-         sigmoid_x_minus_silu_x_sigmoid_x = utils.fma_packed_f32x2(
+         sigmoid_x_minus_silu_x_sigmoid_x = cute.arch.fma_packed_f32x2(
              sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
          )
-         d_silu_x_dout = utils.fma_packed_f32x2(sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout)
-         dx = utils.mul_packed_f32x2(d_silu_x_dout, y)
+         d_silu_x_dout = cute.arch.fma_packed_f32x2(
+             sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout
+         )
+         dx = cute.arch.mul_packed_f32x2(d_silu_x_dout, y)
          dy = silu_x_dout
-         swiglu_out = utils.mul_packed_f32x2(silu_x, y)
+         swiglu_out = cute.arch.mul_packed_f32x2(silu_x, y)
          return dx, dy, swiglu_out
  
  
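
The backward pass leans on x * sigmoid(x) = silu(x) to avoid recomputing the sigmoid:

    \mathrm{silu}'(x) = \sigma(x) + x\,\sigma(x)\,(1 - \sigma(x)) = \sigma(x) - \mathrm{silu}(x)\,\sigma(x) + \mathrm{silu}(x)

which matches the comment d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout; then dx = silu'(x) * dout * y and dy = silu(x) * dout.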
@@ -334,11 +342,11 @@ def swiglu_oai(
          silu_x = x_half * tanh(alpha * x_half) + x_half
          return silu_x * y + silu_x
      else:
-         x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
-         alpha_x_half = utils.mul_packed_f32x2((alpha, alpha), x_half)
+         x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
+         alpha_x_half = cute.arch.mul_packed_f32x2((alpha, alpha), x_half)
          tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-         silu_x = utils.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
-         return utils.fma_packed_f32x2(silu_x, y, silu_x)
+         silu_x = cute.arch.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
+         return cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
  
  
  @dsl_user_op
@@ -370,22 +378,22 @@ def dswiglu_oai(
          return dx, dy, swiglu_out
      else:
          # Compute sigmoid(alpha * x)
-         alpha_x_half = utils.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
+         alpha_x_half = cute.arch.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
          tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-         sigmoid_alpha_x = utils.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
-         silu_x = utils.mul_packed_f32x2(x, sigmoid_alpha_x)
-         silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
+         sigmoid_alpha_x = cute.arch.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
+         silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_alpha_x)
+         silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
          # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
-         silu_x_minus_product = utils.fma_packed_f32x2(
+         silu_x_minus_product = cute.arch.fma_packed_f32x2(
              silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
          )
-         sigmoid_plus_alpha_diff = utils.fma_packed_f32x2(
+         sigmoid_plus_alpha_diff = cute.arch.fma_packed_f32x2(
              (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
          )
-         d_silu_x_dout = utils.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
-         dx = utils.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
+         d_silu_x_dout = cute.arch.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
+         dx = cute.arch.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
          dy = silu_x_dout
-         swiglu_out = utils.fma_packed_f32x2(silu_x, y, silu_x)
+         swiglu_out = cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
          return dx, dy, swiglu_out
  
  
@@ -400,7 +408,7 @@ def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
          return sigmoid_x * y  # FMUL
      else:
          sigmoid_x = sigmoid(x)
-         return utils.mul_packed_f32x2(sigmoid_x, y)
+         return cute.arch.mul_packed_f32x2(sigmoid_x, y)
  
  
  @dsl_user_op
@@ -430,11 +438,11 @@ def dglu(
          return dx, dy, glu_out
      else:
          sigmoid_x = sigmoid(x)
-         sigmoid_x_dout = utils.mul_packed_f32x2(sigmoid_x, dout)
-         glu_out = utils.mul_packed_f32x2(sigmoid_x, y)
+         sigmoid_x_dout = cute.arch.mul_packed_f32x2(sigmoid_x, dout)
+         glu_out = cute.arch.mul_packed_f32x2(sigmoid_x, y)
          # dx = (y - glu_out) * sigmoid_x_dout
-         y_minus_glu_out = utils.sub_packed_f32x2(y, glu_out)
-         dx = utils.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
+         y_minus_glu_out = sub_packed_f32x2(y, glu_out)
+         dx = cute.arch.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
          dy = sigmoid_x_dout
          return dx, dy, glu_out
  
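
The dx rewrite in the comment follows from sigma'(x) = sigma(x)(1 - sigma(x)):

    \frac{\partial}{\partial x}\left[\sigma(x)\,y\right] = \sigma(x)\,(1 - \sigma(x))\,y = \left(y - \sigma(x)\,y\right)\sigma(x) = (y - \mathrm{glu\_out})\,\sigma(x)

so dx reuses glu_out and sigmoid_x_dout, needing only the module-level sub_packed_f32x2 and one more packed multiply.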
@@ -448,7 +456,7 @@ def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
          return cute.arch.fmax(x, Float32(0.0)) * y
      else:
          relu_x = relu(x)
-         return utils.mul_packed_f32x2(relu_x, y)
+         return cute.arch.mul_packed_f32x2(relu_x, y)
  
  
  @dsl_user_op
@@ -475,10 +483,10 @@ def dreglu(
          x0_pos = Boolean(x[0] > 0)
          x1_pos = Boolean(x[1] > 0)
          relu_x = relu(x)
-         dout_y = utils.mul_packed_f32x2(dout, y)
+         dout_y = cute.arch.mul_packed_f32x2(dout, y)
          dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
-         dy = utils.mul_packed_f32x2(dout, relu_x)
-         reglu_out = utils.mul_packed_f32x2(relu_x, y)
+         dy = cute.arch.mul_packed_f32x2(dout, relu_x)
+         reglu_out = cute.arch.mul_packed_f32x2(relu_x, y)
          return dx, dy, reglu_out
  
@@ -491,7 +499,7 @@ def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
      if const_expr(not isinstance(x, tuple)):
          return gelu_tanh_approx(x) * y
      else:
-         return utils.mul_packed_f32x2(gelu_tanh_approx(x), y)
+         return cute.arch.mul_packed_f32x2(gelu_tanh_approx(x), y)
  
  
  @dsl_user_op
@@ -518,7 +526,7 @@ def dgeglu(
          # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
          dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
          # Compute gradients for geglu
-         dx = utils.mul_packed_f32x2(dgelu_x_dout, y)
-         dy = utils.mul_packed_f32x2(gelu_x, dout)
-         geglu_out = utils.mul_packed_f32x2(gelu_x, y)
+         dx = cute.arch.mul_packed_f32x2(dgelu_x_dout, y)
+         dy = cute.arch.mul_packed_f32x2(gelu_x, dout)
+         geglu_out = cute.arch.mul_packed_f32x2(gelu_x, y)
          return dx, dy, geglu_out
@@ -11,7 +11,7 @@ from quack.layout_utils import make_acc_tensor_mn_view
  @cute.jit
  def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
      if const_expr(tCrC.element_type != Float32):  # Convert to f32
-         tCrC_f32 = cute.make_fragment(tCrC.shape, Float32)
+         tCrC_f32 = cute.make_rmem_tensor(tCrC.shape, Float32)
          tCrC_f32.store(tCrC.load().to(Float32))
      else:
          tCrC_f32 = tCrC
@@ -7,18 +7,19 @@ import cutlass
  import cutlass.cute as cute
  
  from cutlass import Int32, Boolean, const_expr
- from cutlass.cute.nvgpu import cpasync
+ from cutlass.cute.nvgpu import cpasync, warp, warpgroup
  from cutlass.cutlass_dsl import dsl_user_op
  import cutlass.pipeline
  
  
  @dsl_user_op
  def cvt_copy(
-     atom: cute.CopyAtom,
+     tiled_copy: cute.TiledCopy,
      src: cute.Tensor,
      dst: cute.Tensor,
      *,
      pred: Optional[cute.Tensor] = None,
+     retile: bool = False,
      loc=None,
      ip=None,
      **kwargs,
@@ -28,7 +29,9 @@ def cvt_copy(
          src_cvt = cute.make_fragment_like(src, dst.element_type)
          src_cvt.store(src.load().to(dst.element_type))
          src = src_cvt
-     cute.copy(atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+     if const_expr(retile):
+         src = tiled_copy.retile(src)
+     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
  
  
  @dsl_user_op
@@ -49,7 +52,7 @@ def load_s2r_retile(
  ) -> cute.Tensor:
      # Will also accept dst_shape being a tensor, in which case we write into that tensor
      if const_expr(not isinstance(dst_shape, cute.Tensor)):
-         dst = cute.make_fragment(dst_shape, src.element_type, loc=loc, ip=ip)
+         dst = cute.make_rmem_tensor(dst_shape, src.element_type, loc=loc, ip=ip)
      else:
          dst = dst_shape
      cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
@@ -114,7 +117,7 @@ def tiled_copy_2d(
  @cute.jit
  def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
      # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
-     tApA = cute.make_fragment(
+     tApA = cute.make_rmem_tensor(
          cute.make_layout(
              (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
              stride=(cute.size(tAcA, mode=[2]), 0, 1),
@@ -239,9 +242,7 @@ def sm90_get_smem_load_op(
          raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
      is_m_major = layout_c.is_m_major_c()
      if elem_ty_c.width == 16:
-         return cute.make_copy_atom(
-             cute.nvgpu.warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip
-         )
+         return cute.make_copy_atom(warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip)
      else:
          return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
  
@@ -257,11 +258,127 @@ def get_smem_store_atom(
          )
      else:
          return cute.make_copy_atom(
-             cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
+             warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
              element_type,
          )
  
  
+ def get_smem_load_atom(
+     arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
+ ) -> cute.CopyAtom:
+     if const_expr(arch < 90 or element_type.width != 16):
+         return cute.make_copy_atom(
+             cute.nvgpu.CopyUniversalOp(),
+             element_type,
+             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
+         )
+     else:
+         return cute.make_copy_atom(
+             warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
+             element_type,
+         )
+ 
+ 
+ def get_smem_store_C(
+     tiled_mma: cute.TiledMma,
+     sC: cute.Tensor,
+     tidx: Int32,
+     arch: int,
+     transpose: bool = False,
+     position_independent=False,
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
+     dtype = sC.element_type
+     copy_atom = get_smem_store_atom(arch, dtype, transpose)
+     tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
+     thr_copy = tiled_copy.get_slice(tidx)
+     if const_expr(not position_independent):
+         tRS_sC = thr_copy.partition_D(sC)
+     else:
+         tRS_sC = partition_D_position_independent(thr_copy, sC)
+ 
+     def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
+         cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], retile=True, **new_kwargs)
+ 
+     return copy_fn, thr_copy, tRS_sC
+ 
+ 
+ def get_smem_load_C(
+     tiled_mma: cute.TiledMma,
+     sC: cute.Tensor,
+     tidx: Int32,
+     arch: int,
+     transpose: bool = False,
+     position_independent=False,
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
+     dtype = sC.element_type
+     copy_atom = get_smem_load_atom(arch, dtype, transpose)
+     tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
+     thr_copy = tiled_copy.get_slice(tidx)
+     if const_expr(not position_independent):
+         tSR_sC = thr_copy.partition_S(sC)
+     else:
+         tSR_sC = partition_S_position_independent(thr_copy, sC)
+     copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
+     thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
+     tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape
+ 
+     def copy_fn(src_idx: Int32, **new_kwargs):
+         return load_s2r_retile(
+             tiled_copy, tSR_sC[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
+         )
+ 
+     return copy_fn, thr_copy, tSR_sC
+ 
+ 
+ def get_smem_store_A(
+     tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
+     dtype = sA.element_type
+     transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
+     copy_atom = get_smem_store_atom(arch, dtype, transpose)
+     tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
+     thr_copy = tiled_copy.get_slice(tidx)
+     if const_expr(not position_independent):
+         tRS_sA = thr_copy.partition_D(sA)
+     else:
+         tRS_sA = partition_D_position_independent(thr_copy, sA)
+ 
+     def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
+         cvt_copy(tiled_copy, src, tRS_sA[None, None, None, dst_idx], retile=True, **new_kwargs)
+ 
+     return copy_fn, thr_copy, tRS_sA
+ 
+ 
+ def get_smem_load_A(
+     tiled_mma: cute.TiledMma,
+     sA: cute.Tensor,
+     tidx: Int32,
+     arch: int,
+     with_dst_tensor: bool = False,
+     position_independent=False,
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
+     dtype = sA.element_type
+     transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
+     copy_atom = get_smem_load_atom(arch, dtype, transpose)
+     tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
+     thr_copy = tiled_copy.get_slice(tidx)
+     if const_expr(not position_independent):
+         tSR_sA = thr_copy.partition_S(sA)
+     else:
+         tSR_sA = partition_S_position_independent(thr_copy, sA)
+     tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])
+ 
+     def copy_fn(src_idx: Int32, **new_kwargs):
+         return load_s2r_retile(
+             tiled_copy, tSR_sA[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
+         )
+ 
+     def copy_fn_w_dst_tensor(src_idx: Int32, dst: cute.Tensor, **new_kwargs):
+         return load_s2r_retile(tiled_copy, tSR_sA[None, None, None, src_idx], dst, **new_kwargs)
+ 
+     return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
+ 
+ 
  def tma_get_copy_fn(
      atom: cute.CopyAtom,
      cta_coord: cute.Coord,
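
A hedged sketch of how the new store/load helper pairs might be used from an epilogue; tiled_mma, sC, tidx, acc_frag, and stage are assumed to come from the surrounding kernel, and arch=90 is illustrative:

    # r2s: convert-and-store a register fragment into smem stage stage
    store_fn, thr_copy_r2s, tRS_sC = get_smem_store_C(tiled_mma, sC, tidx, arch=90)
    store_fn(acc_frag, stage)
    # s2r: read the same stage back into a fresh register tensor
    load_fn, thr_copy_s2r, tSR_sC = get_smem_load_C(tiled_mma, sC, tidx, arch=90)
    frag = load_fn(stage)

get_smem_store_A / get_smem_load_A follow the same shape, but derive transpose from the MMA operand's major mode instead of taking it as an argument.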
@@ -269,6 +386,7 @@ def tma_get_copy_fn(
      src_tensor: cute.Tensor,
      dst_tensor: cute.Tensor,
      filter_zeros: bool = False,
+     single_stage: bool = False,
      **kwargs,
  ) -> Callable:
      src_is_smem = const_expr(
@@ -276,13 +394,15 @@ def tma_get_copy_fn(
          and src_tensor.memspace == cute.AddressSpace.smem
      )
      smem_tensor, gmem_tensor = (src_tensor, dst_tensor) if src_is_smem else (dst_tensor, src_tensor)
+     group_rank_smem = const_expr(cute.rank(smem_tensor) - (1 if not single_stage else 0))
+     group_rank_gmem = const_expr(cute.rank(gmem_tensor) - (1 if not single_stage else 0))
      # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
      s, g = cpasync.tma_partition(
          atom,
          cta_coord,
          cta_layout,
-         cute.group_modes(smem_tensor, 0, cute.rank(smem_tensor) - 1),
-         cute.group_modes(gmem_tensor, 0, cute.rank(gmem_tensor) - 1),
+         cute.group_modes(smem_tensor, 0, group_rank_smem),
+         cute.group_modes(gmem_tensor, 0, group_rank_gmem),
      )
      if const_expr(filter_zeros):
          s = cute.filter_zeros(s)
@@ -292,7 +412,10 @@ def tma_get_copy_fn(
      def copy_tma(src_idx, dst_idx, **new_kwargs):
          cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)
  
-     return copy_tma, s, g
+     def copy_tma_single_stage(**new_kwargs):
+         cute.copy(atom, src, dst, **new_kwargs, **kwargs)
+ 
+     return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g
  
  
  def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
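
A hedged sketch of the new single_stage switch; the atom, layouts, and tensors (gA, sA) are assumed from the surrounding code:

    # Multi-stage (default): the smem/gmem tensors keep a trailing stage/tile
    # mode, and copy_fn indexes into it.
    copy_fn, s, g = tma_get_copy_fn(atom, cta_coord, cta_layout, gA, sA)
    copy_fn(src_idx=k_tile, dst_idx=stage)

    # Single-stage: all modes are grouped, so the returned closure copies the
    # whole tensor and takes no indices.
    copy_once, s1, g1 = tma_get_copy_fn(atom, cta_coord, cta_layout, gA, sA, single_stage=True)
    copy_once()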
@@ -337,10 +460,10 @@ def gather_m_get_copy_fn(
      # Read and cache indices for A
      rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
      cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-     tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+     tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
      for m in cutlass.range(rows_per_thread, unroll_full=True):
          tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
-     m_idx = cute.make_fragment(rows_per_thread, Int32)
+     m_idx = cute.make_rmem_tensor(rows_per_thread, Int32)
      for m in cutlass.range(rows_per_thread, unroll_full=True):
          row_idx = tAcA[0, m, 0][0]
          if tApA_m[m]:
@@ -353,7 +476,7 @@ def gather_m_get_copy_fn(
      def copy_fn(src_idx, dst_idx, pred: bool = False):
          tApA_k = None
          if const_expr(pred):
-             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
              limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
              for k in cutlass.range(cols_per_thread, unroll_full=True):
                  tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
@@ -411,7 +534,7 @@ def gather_k_get_copy_fn(
      # Read and cache indices for A
      rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
      cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-     tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+     tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
      for m in cutlass.range(rows_per_thread, unroll_full=True):
          tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
      threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
@@ -427,12 +550,12 @@ def gather_k_get_copy_fn(
          # Prefetch mAIdx early, even before smem is free
          tApA_k = None
          if const_expr(pred):
-             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
              limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
              for k in cutlass.range(cols_per_thread, unroll_full=True):
                  tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
          gAIdx_cur = gAIdx[None, src_idx]
-         k_idx = cute.make_fragment(cols_per_thread, Int32)
+         k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
          for k in cutlass.range(cols_per_thread):
              col_idx = tAcA[0, 0, k][1]
              if const_expr(not pred):
@@ -449,13 +572,13 @@ def gather_k_get_copy_fn(
      ) -> Tuple[cute.Tensor, cute.Tensor]:
          tApA_k = None
          if const_expr(pred):
-             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
              limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
              for k in cutlass.range(cols_per_thread, unroll_full=True):
                  tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
          a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
          sAIdx_cur = sAIdx[None, dst_idx]
-         k_idx = cute.make_fragment(cols_per_thread, Int32)
+         k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
          for k in cutlass.range(cols_per_thread):
              col_idx = tAcA[0, 0, k][1]
              k_idx[k] = sAIdx_cur[col_idx]