quack-kernels 0.2.0-py3-none-any.whl → 0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +1 -1
- quack/activation.py +16 -25
- quack/cross_entropy.py +6 -10
- quack/layernorm.py +1 -1
- quack/reduce.py +6 -7
- quack/rmsnorm.py +57 -23
- quack/softmax.py +1 -1
- quack/tile_scheduler.py +3 -2
- quack/utils.py +0 -63
- {quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/METADATA +2 -2
- {quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/RECORD +14 -14
- {quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/WHEEL +0 -0
- {quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/top_level.txt +0 -0
quack/__init__.py
CHANGED
quack/activation.py
CHANGED
@@ -6,23 +6,12 @@ from typing import Tuple
 import cutlass
 import cutlass.cute as cute
 from cutlass import Float32
-from cutlass.cutlass_dsl import
-from cutlass._mlir.dialects import llvm
+from cutlass.cutlass_dsl import dsl_user_op


 @dsl_user_op
-def
-return
-    llvm.inline_asm(
-        T.f32(),
-        [Float32(a).ir_value(loc=loc, ip=ip)],
-        "tanh.approx.f32 $0, $1;",
-        "=f,f",
-        has_side_effects=False,
-        is_align_stack=False,
-        asm_dialect=llvm.AsmDialect.AD_ATT,
-    )
-)
+def sigmoid(x: Float32, *, loc=None, ip=None) -> Float32:
+    return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)


 @dsl_user_op

@@ -67,7 +56,10 @@ def gelu_tanh_approx(x: Float32, *, loc=None, ip=None) -> Float32:
 """
 sqrt_2_over_pi = math.sqrt(2 / math.pi)  # ~0.797885
 sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # ~0.0356774
-return 0.5 * (
+return 0.5 * (
+    x
+    * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
+)


 @dsl_user_op

@@ -88,7 +80,7 @@ def dgelu_tanh_approx(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[

 # Compute z = x * (c1 + c2 * x^2)
 x_sq = x * x
-tanh_z = tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq))
+tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
 half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
 gelu_out = x * half_tanh_z_plus_one


@@ -111,7 +103,7 @@ def silu(x: Float32, *, loc=None, ip=None) -> Float32:
 This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.
 """
 x_half = 0.5 * x
-return x_half * tanh(x_half) + x_half
+return x_half * cute.math.tanh(x_half, fastmath=True) + x_half


 @dsl_user_op

@@ -134,8 +126,8 @@ def dswiglu(
 to use FFMA instead of FADD and FMUL).
 """
 # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
-
-sigmoid_x =
+# FMUL, MUFU.TANH, then FFMA
+sigmoid_x = sigmoid(x)
 silu_x = x * sigmoid_x  # FMUL
 silu_x_dout = silu_x * dout  # FMUL
 # d_silu(x) * dout

@@ -161,7 +153,7 @@ def swiglu_oai(x: Float32, y: Float32, alpha: float = 1.702, *, loc=None, ip=Non
 """
 # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
 x_half = 0.5 * x
-silu_x = x_half * tanh(alpha * x_half) + x_half
+silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
 return silu_x * y + silu_x


@@ -179,7 +171,8 @@ def dswiglu_oai(
 """
 # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
 alpha_x_half = (0.5 * alpha) * x  # FMUL
-
+# MUFU.TANH, then FFMA
+sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
 silu_x = x * sigmoid_alpha_x  # FMUL
 silu_x_dout = silu_x * dout  # FMUL
 # FFMA, FFMA, FMUL

@@ -197,8 +190,7 @@ def glu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
 glu(x, y) = sigmoid(x) * y
 Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
 """
-
-sigmoid_x = 0.5 + 0.5 * tanh(x_half)  # MUFU.TANH, then FFMA
+sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
 return sigmoid_x * y  # FMUL


@@ -215,8 +207,7 @@ def dglu(
 - glu_out = sigmoid(x) * y
 """
 # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
-
-sigmoid_x = 0.5 + 0.5 * tanh(x_half)  # MUFU.TANH, then FFMA
+sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
 sigmoid_x_dout = sigmoid_x * dout  # FMUL
 glu_out = sigmoid_x * y  # FMUL
 # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
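The activation changes all follow one pattern: the module-local inline-asm tanh helper is gone, tanh comes from cute.math.tanh(..., fastmath=True), and every sigmoid goes through the identity sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x), which the comments note lowers to FMUL, MUFU.TANH, FFMA. A minimal plain-Python sketch of that identity (illustrative only, not the CuTe DSL code):

    import math

    def sigmoid_via_tanh(x):
        # Identity used by the new sigmoid() helper: sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x)
        return 0.5 + 0.5 * math.tanh(0.5 * x)

    def sigmoid_ref(x):
        return 1.0 / (1.0 + math.exp(-x))

    for v in (-8.0, -1.0, 0.0, 0.5, 3.0):
        assert abs(sigmoid_via_tanh(v) - sigmoid_ref(v)) < 1e-12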
quack/cross_entropy.py
CHANGED
@@ -199,11 +199,8 @@ class CrossEntropy(ReductionBase):
 cute.autovec_copy(tXsX, tXrX)
 x = tXrX.load().to(Float32)
 log2_e = math.log2(math.e)
-# exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
-# a bit faster, probably because it's calling ex2.approx.ftz instead of ex2.approx?
-# exp_x = utils.exp2f((x - max_x) * log2_e)
 # This would use ffma instead of fadd then fmul
-exp_x =
+exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=False)
 denom = row_reduce(
 exp_x,
 cute.ReductionOp.ADD,

@@ -228,8 +225,7 @@ class CrossEntropy(ReductionBase):
 and row < shape[0]
 and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
 ):
-
-lse = max_x + utils.log2f(denom) * ln_2
+lse = max_x + cute.math.log(denom, fastmath=True)
 # Set loss to 0 if this index should be ignored, otherwise compute normally
 loss_val = (lse - target_logit) if not should_ignore else Float32.zero
 mLoss[row] = mLoss.element_type(loss_val)

@@ -552,7 +548,7 @@ class CrossEntropyBackward:
 lse = Float32(mLSE[row])

 log2_e = math.log2(math.e)
-probs =
+probs = cute.math.exp2(x * log2_e - (lse * log2_e), fastmath=True)
 prob_shifted = probs - 1.0
 mask = cute.make_fragment_like(tXrX, cutlass.Boolean)
 for i in cutlass.range(cute.size(tXcFull), unroll_full=True):

@@ -594,9 +590,9 @@ def _cross_entropy_backward(
 assert x.shape[0] == target.shape[0], "Batch dimensions must match"
 assert x.shape[0] == dloss.shape[0], "Batch dimensions must match"
 assert x.shape[0] == lse.shape[0], "Batch dimensions must match"
-assert (
-
-)
+assert x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda, (
+    "Tensors must be on CUDA device"
+)
 assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
 assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"

quack/layernorm.py
CHANGED
@@ -217,7 +217,7 @@ class LayerNorm(ReductionBase):
 mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
 init_val=0.0,
 )
-rstd =
+rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
 if cutlass.const_expr(mRstd is not None):
 # Only the thread corresponding to column 0 writes out the rstd to gmem
 if (
quack/reduce.py
CHANGED
@@ -159,8 +159,7 @@ def online_softmax_reduce(
 width=min(threads_per_row, cute.arch.WARP_SIZE),
 )
 log2_e = math.log2(math.e)
-exp_x =
-# exp_x = exp2f((x - max_x) * log2_e)
+exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
 sum_exp_x = warp_reduce(
 exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
 operator.add,

@@ -190,10 +189,10 @@ def online_softmax_reduce(
 reduction_buffer[row_idx, lane_idx]
 )
 max_x_final = warp_reduce(max_x_single_warp, cute.arch.fmax)
-sum_exp_x *=
+sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
 sum_exp_x = warp_reduce(sum_exp_x, operator.add)
 if cutlass.const_expr(return_exp_x):
-exp_x *=
+exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
 max_x = max_x_final
 else:
 cta_rank_in_cluster = cute.arch.block_idx_in_cluster()

@@ -231,11 +230,11 @@ def online_softmax_reduce(
 max_x_final = warp_reduce(max_x_final, cute.arch.fmax)
 sum_exp_x = 0.0
 for i in cutlass.range_constexpr(num_iter):
-sum_exp_x += sum_exp_x_single_warp[i] *
-
+sum_exp_x += sum_exp_x_single_warp[i] * cute.math.exp(
+    max_x_single_warp[i] - max_x_final, fastmath=True
 )
 sum_exp_x = warp_reduce(sum_exp_x, operator.add)
 if cutlass.const_expr(return_exp_x):
-exp_x *=
+exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
 max_x = max_x_final
 return max_x, sum_exp_x, (exp_x if cutlass.const_expr(return_exp_x) else None)
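The online_softmax_reduce changes rescale each partial sum by exp(old_max - final_max) whenever partial results from different warps or CTAs are combined. A plain-Python sketch of that merge rule, with a made-up two-chunk split just to show the rescaling reproduces the full log-sum-exp exactly:

    import math

    def merge(p, q):
        # Combine two (running max, sum of exp(x - running max)) partials by
        # rescaling each sum with exp(old_max - new_max).
        (m1, s1), (m2, s2) = p, q
        m = max(m1, m2)
        return m, s1 * math.exp(m1 - m) + s2 * math.exp(m2 - m)

    xs = [0.1, 2.5, -1.0, 3.0, 0.7, 1.2]
    chunks = (xs[:3], xs[3:])
    partials = [(max(c), sum(math.exp(v - max(c)) for v in c)) for c in chunks]
    m, s = merge(*partials)
    assert math.isclose(m + math.log(s), math.log(sum(math.exp(v) for v in xs)))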
quack/rmsnorm.py
CHANGED
@@ -19,6 +19,7 @@ from quack.reduce import row_reduce
 from quack.reduction_base import ReductionBase
 from quack.cute_dsl_utils import torch2cute_dtype_map

+
 class RMSNorm(ReductionBase):
 def __init__(self, dtype: cutlass.Numeric, N: int):
 super().__init__(dtype, N, stage=1)

@@ -132,7 +133,9 @@ class RMSNorm(ReductionBase):
 mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
 mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
 if const_expr(mB is not None):
-mB_expanded_layout = cute.prepend(
+mB_expanded_layout = cute.prepend(
+    mB.layout, cute.make_layout((tiler_mn[0],), stride=(0,))
+)
 mB = cute.make_tensor(mB.iterator, mB_expanded_layout)
 if const_expr(mRstd is not None):
 mRstd_expanded_layout = cute.append(

@@ -202,11 +205,7 @@ class RMSNorm(ReductionBase):
 ]
 cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
 gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
-gB = (
-    cute.local_tile(mB, tiler_mn, (0, cluster_y))
-    if const_expr(mB is not None)
-    else None
-)
+gB = cute.local_tile(mB, tiler_mn, (0, cluster_y)) if const_expr(mB is not None) else None
 gRstd = (
 cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
 if const_expr(mRstd is not None)

@@ -226,12 +225,18 @@ class RMSNorm(ReductionBase):
 copy_atom_load_W = cute.make_copy_atom(
 cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=num_copy_bits_W
 )
-num_bits_per_copy_B =
-min(128, num_copy_elems_X * mB.element_type.width)
-
-
-
-
+num_bits_per_copy_B = (
+    cutlass.const_expr(min(128, num_copy_elems_X * mB.element_type.width))
+    if const_expr(mB is not None)
+    else 0
+)
+copy_atom_load_B = (
+    cute.make_copy_atom(
+        cute.nvgpu.CopyUniversalOp(), mB.element_type, num_bits_per_copy=num_bits_per_copy_B
+    )
+    if const_expr(mB is not None)
+    else None
+)
 if const_expr(mRes is not None):
 num_copy_bits_Res = const_expr(min(128, num_copy_elems_X * mRes.element_type.width))
 copy_atom_load_Res_async = cute.make_copy_atom(

@@ -317,7 +322,7 @@ class RMSNorm(ReductionBase):
 init_val=0.0,
 hook_fn=(cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None),
 )
-rstd =
+rstd = cute.math.rsqrt(sum_sq_x / shape[1] + eps, fastmath=True)
 if const_expr(mRstd is not None):
 # Only the thread corresponding to column 0 writes out the rstd to gmem
 if (

@@ -355,7 +360,7 @@ class RMSNorm(ReductionBase):
 mutates_args=("out", "rstd", "residual_out"),
 device_types="cuda",
 # We need to specify the schema manually since we're mutating an optional tensor
-schema="(Tensor x, Tensor weight, Tensor(
+schema="(Tensor x, Tensor weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor? residual, Tensor(a6!)? residual_out, float eps=1e-6) -> ()",
 )
 def _rmsnorm_fwd(
 x: Tensor,

@@ -509,6 +514,7 @@ def rmsnorm_ref(x, w, bias=None, residual=None, eps=1e-6):
 else:
 return out.to(x.dtype), x_f32.to(residual.dtype)

+
 def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
 """Reference implementation for RMSNorm backward pass."""
 x_f32 = x.float()

@@ -521,6 +527,7 @@ def rmsnorm_bwd_ref(x, w, dout, rstd, eps=1e-6):
 dw = (dout * x_hat).sum(dim=0)
 return dx.to(x.dtype), dw.to(w.dtype)

+
 class RMSNormBackward(ReductionBase):
 def __init__(self, dtype: cutlass.Numeric, N: int):
 # 2 stages for double buffering when computing mean of x_hat * wdy

@@ -744,7 +751,11 @@ class RMSNormBackward(ReductionBase):
 # Always compute partial weight gradients in fp32
 tXrdW = cute.make_fragment_like(tXgdW, Float32)

-gdB =
+gdB = (
+    cute.local_tile(mdB, (1, tiler_mn[1]), (bidx_start, cluster_y))
+    if const_expr(mdB is not None)
+    else None
+)
 tXgdB = thr_copy_X.partition_S(gdB) if const_expr(mdB is not None) else None
 tXrdB = cute.make_fragment_like(tXgdB, Float32) if const_expr(mdB is not None) else None

@@ -772,8 +783,10 @@ class RMSNormBackward(ReductionBase):
 tXrX, tXrdO, tXrdX = [
 cute.make_fragment_like(thr[None, None, None, 0]) for thr in (tXgX, tXgdO, tXgdX)
 ]
+tXrdResO = None
 if const_expr(mdResO is not None):
 tXrdResO = cute.make_fragment_like(tXgdResO[None, None, None, 0])
+tXrdRes = None
 if const_expr(mdRes is not None):
 tXrdRes = cute.make_fragment_like(tXgdRes[None, None, None, 0])

@@ -930,7 +943,9 @@ class RMSNormBackward(ReductionBase):
 if row == 0:
 for i in cutlass.range_constexpr(1, const_expr(tiler_mn[0])):
 tXrdB_other = cute.make_fragment_like(tXrdB)
-tXsdB_other = cute.make_tensor(
+tXsdB_other = cute.make_tensor(
+    tXsdB.iterator + i * sdB.stride[0], tXsdB.layout
+)
 cute.autovec_copy(tXsdB_other, tXrdB_other)
 tXrdB.store(tXrdB.load() + tXrdB_other.load())
 cute.copy(copy_atom_store_dB, tXrdB, tXgdB, pred=tXpdB)

@@ -963,7 +978,7 @@ def _get_sm_count(N: int, device: torch.device) -> int:
 mutates_args={"dx", "dw_partial", "db_partial", "dresidual"},
 device_types="cuda",
 # We need to specify the schema manually since we're mutating an optional tensor
-schema="(Tensor x, Tensor weight, Tensor dout, Tensor rstd, Tensor(
+schema="(Tensor x, Tensor weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!) dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual) -> ()",
 )
 def _rmsnorm_bwd(
 x: Tensor,

@@ -1031,14 +1046,23 @@ def _rmsnorm_bwd(
 )

 dw_partial_tensor = from_dlpack(dw_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
-db_partial_tensor =
+db_partial_tensor = (
+    from_dlpack(db_partial, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+    if db_partial is not None
+    else None
+)
 rstd_tensor = from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)

 current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

-compile_key = (
+compile_key = (
+    N,
+    x_tensor.element_type,
+    weight_tensor.element_type,
+    db_partial.dtype if db_partial is not None else None,
 dresidual.dtype if dresidual is not None else None,
-dresidual_out.dtype if dresidual_out is not None else None
+    dresidual_out.dtype if dresidual_out is not None else None,
+)
 if compile_key not in _rmsnorm_bwd.compile_cache:
 rmsnorm_backward_op = RMSNormBackward(x_tensor.element_type, N)
 _rmsnorm_bwd.compile_cache[compile_key] = cute.compile(

@@ -1106,7 +1130,17 @@ def rmsnorm_bwd(

 class RMSNormFunction(torch.autograd.Function):
 @staticmethod
-def forward(
+def forward(
+    ctx,
+    x,
+    weight,
+    bias=None,
+    residual=None,
+    out_dtype=None,
+    residual_dtype=None,
+    eps=1e-6,
+    prenorm=False,
+):
 x_shape_og = x.shape
 # Flatten input
 x = x.reshape(-1, x.shape[-1])

@@ -1129,7 +1163,7 @@ class RMSNormFunction(torch.autograd.Function):
 ctx.x_shape_og = x_shape_og
 ctx.residual_dtype = residual.dtype if residual is not None else None
 ctx.prenorm = prenorm
-if residual_out is None or prenorm
+if residual_out is None or not prenorm:
 return out.reshape(x_shape_og)
 else:
 return out.reshape(x_shape_og), residual_out.reshape(x_shape_og)

@@ -1213,4 +1247,4 @@ class QuackRMSNorm(torch.nn.Module):

 def reset_parameters(self):
 """Reset the weight parameter to ones."""
-torch.nn.init.ones_(self.weight)
+torch.nn.init.ones_(self.weight)
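The forward kernel's new rstd = cute.math.rsqrt(sum_sq_x / shape[1] + eps, fastmath=True) is the standard RMSNorm statistic. As a reminder of what that line computes, here is a hedged single-row reference in plain Python (no bias, residual, or dtype handling; not the kernel itself):

    import math

    def rmsnorm_row(x, w, eps=1e-6):
        # rstd = 1 / sqrt(mean(x^2) + eps); out = x * rstd * w
        rstd = 1.0 / math.sqrt(sum(v * v for v in x) / len(x) + eps)
        return [v * rstd * wi for v, wi in zip(x, w)], rstd

    out, rstd = rmsnorm_row([1.0, -2.0, 3.0], [0.5, 1.0, 2.0])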
quack/softmax.py
CHANGED
@@ -159,7 +159,7 @@ class Softmax(ReductionBase):
 hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
 )
 log2_e = math.log2(math.e)
-exp_x = cute.math.exp2(
+exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
 denom = row_reduce(
 exp_x,
 cute.ReductionOp.ADD,
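As in cross_entropy.py and reduce.py above, the softmax kernel now writes exp(x - max_x) as exp2(x * log2(e) - max_x * log2(e)), so the shift and scale can fold into a fused multiply-add feeding the exp2. A small plain-Python check of that rewrite and of the resulting normalizer (illustrative values only):

    import math

    log2_e = math.log2(math.e)

    def exp_shifted(x, m):
        # exp(x - m) written as exp2(x*log2_e - m*log2_e), the form used in the kernels
        return 2.0 ** (x * log2_e - m * log2_e)

    logits = [-2.0, 0.3, 1.7, 4.0]
    max_x = max(logits)
    denom = sum(exp_shifted(x, max_x) for x in logits)
    lse = max_x + math.log(denom)  # the log-sum-exp form written out in cross_entropy.py
    assert abs(lse - math.log(sum(math.exp(x) for x in logits))) < 1e-12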
quack/tile_scheduler.py
CHANGED
@@ -390,7 +390,7 @@ def triangular_idx_to_coord(idx: Int32) -> Tuple[Int32, Int32]:
 Convert a triangular index to 2D coordinates.
 This is used to convert the linear index to 2D coordinates for triangular matrices.
 """
-row = utils.ceil((
+row = utils.ceil((cute.math.sqrt(2 * idx + 2.25, fastmath=True) - 0.5)) - 1
 col = idx - (row * (row + 1)) // 2
 return row, col

@@ -524,7 +524,8 @@ class TriangularTileScheduler(TileScheduler):
 group_size = params.group_size_divmod.divisor
 group_id = (
 utils.ceil(
-(
+(cute.math.sqrt(2 * cluster_id_in_problem + 2.25, fastmath=True) - 0.5)
+* params.group_size_inv_f32
 )
 - 1
 )
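triangular_idx_to_coord keeps the same mapping and only swaps the removed utils.sqrt for cute.math.sqrt(..., fastmath=True): for a lower-triangular, row-major enumeration with idx = row*(row+1)/2 + col, the row is recovered as ceil(sqrt(2*idx + 2.25) - 0.5) - 1. A plain-Python check of that formula (exact math.sqrt here; the kernel uses the approximate GPU sqrt):

    import math

    def tri_idx_to_coord(idx):
        # row = ceil(sqrt(2*idx + 2.25) - 0.5) - 1, col = idx - row*(row+1)/2
        row = math.ceil(math.sqrt(2 * idx + 2.25) - 0.5) - 1
        col = idx - (row * (row + 1)) // 2
        return row, col

    # Reference: enumerate lower-triangular coordinates in row-major order.
    coords = [(r, c) for r in range(64) for c in range(r + 1)]
    for idx, rc in enumerate(coords):
        assert tri_idx_to_coord(idx) == rc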
quack/utils.py
CHANGED
@@ -100,69 +100,6 @@ def fmin(a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=Non
 )


-@cute.jit
-def exp2f(x: cute.TensorSSA | Float32) -> cute.TensorSSA | Float32:
-    """exp2f calculation for both vector and scalar.
-    :param x: input value
-    :type x: cute.TensorSSA or Float32
-    :return: exp2 value
-    :rtype: cute.TensorSSA or Float32
-    """
-    if cutlass.const_expr(isinstance(x, cute.TensorSSA)):
-        res = cute.make_fragment(x.shape, Float32)
-        res.store(x)
-        for i in cutlass.range(cute.size(x.shape), unroll_full=True):
-            res[i] = cute.arch.exp2(res[i])
-        return res.load()
-    else:
-        return cute.arch.exp2(x)
-
-
-@dsl_user_op
-def log2f(a: float | Float32, *, loc=None, ip=None) -> Float32:
-    return Float32(
-        llvm.inline_asm(
-            T.f32(),
-            [Float32(a).ir_value(loc=loc, ip=ip)],
-            "lg2.approx.ftz.f32 $0, $1;",
-            "=f,f",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
-
-
-@dsl_user_op
-def sqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
-    return Float32(
-        llvm.inline_asm(
-            T.f32(),
-            [Float32(a).ir_value(loc=loc, ip=ip)],
-            "sqrt.approx.ftz.f32 $0, $1;",
-            "=f,f",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
-
-
-@dsl_user_op
-def rsqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
-    return Float32(
-        llvm.inline_asm(
-            T.f32(),
-            [Float32(a).ir_value(loc=loc, ip=ip)],
-            "rsqrt.approx.ftz.f32 $0, $1;",
-            "=f,f",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
-
-
 @dsl_user_op
 def ceil(a: float | Float32, *, loc=None, ip=None) -> Int32:
 return Int32(
{quack_kernels-0.2.0.dist-info → quack_kernels-0.2.1.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
-quack/__init__.py,sha256=
-quack/activation.py,sha256=
+quack/__init__.py,sha256=H1m0CnfPidSSmprZeTGJc8LVh7stdBPmPLEuZwgN_7M,364
+quack/activation.py,sha256=SzQDUCB-kccqsy1aYUrHYJ2cGxKMXxxqpjJaJoqBYaE,10017
 quack/autotuner.py,sha256=czO6JrYL0EJpOeJOYDSsVdrJaFuwfL3vTdG8QfL1F34,10792
-quack/cross_entropy.py,sha256=
+quack/cross_entropy.py,sha256=TE8j21c-7E4cInKtFjcKsgKXNhKCRFkNfhCJpgpasj8,28409
 quack/cute_dsl_utils.py,sha256=D2Pw7rzX9jY8u8wikIPvPvinmFLCDeZg95HPBLqGej4,4635
 quack/dense_gemm_sm100.py,sha256=hKBNC34UxdctrTKVP68nvANZl4Dq2rnUjRcweESEq3g,109965
 quack/dense_gemm_sm90.py,sha256=TjnjHnjhAwWH5YQWsFlADq07xSxtsprkw_p2Cy0yw7I,100407

@@ -11,27 +11,27 @@ quack/gemm_config.py,sha256=gbYjPFeyT5wAhVwFQroRHlHoMKEJqAWX9P8wWy04l8Q,2258
 quack/gemm_dact_sm90.py,sha256=KCXgjOzdamSDexwrwf_pX2r-ippPRirbClrlU6BP7b8,4990
 quack/gemm_interface.py,sha256=_JTpE7zQw6NUw-v65Wql_XUOZBfW0oSEgiMnharTJU4,20501
 quack/gemm_wrapper_utils.py,sha256=aMMtu-Ojhtjay_5xJH4AjP-JRVks1AB8jmtNme_DIqU,5960
-quack/layernorm.py,sha256=
+quack/layernorm.py,sha256=AOe95-YqhFPw96x8pJq7FfBe26ROX9ZTvH025lM1ILs,13579
 quack/linear.py,sha256=SrhRiAFjC7ONIMVmiNu-kSPLHNUyaCXt59a1f_5nNXo,9383
 quack/linear_cross_entropy.py,sha256=Zhy_gdMsKHOie-jntBaqIuiDJtkiq6qEBwnyuWwIRw4,10092
 quack/mlp.py,sha256=YjdwQRwEePA9KyidFXp5H1-lxiJc8dZ41vl8Fv8pgss,2259
 quack/pipeline.py,sha256=DyCwZX8WvoUBFcMBz7CeYm9VUM31haEGgBhAzmxu8cE,5519
-quack/reduce.py,sha256=
+quack/reduce.py,sha256=0hRFMFfn6xC5QLk32Qmgc17XVkQ1yKC-3TfksccSBaU,10341
 quack/reduction_base.py,sha256=CT-t_j7z8H1ByD9FkQYDRik_-THMDFv9QoXHmr9Xx9E,3636
-quack/rmsnorm.py,sha256=
-quack/softmax.py,sha256=
+quack/rmsnorm.py,sha256=PrW2zuaQs_Gr6g8B6DMsGSJFZdEsWf32if_EwUR_IDQ,49386
+quack/softmax.py,sha256=WFWtgc40iLPFBpdStBBTC9803Npnv9rZjOzb_nK-RDs,17110
 quack/symmetric_dense_gemm_sm90.py,sha256=2UXooIpClT2izdyGis1XaIgYYlLj-7MrcOMg2yR7YCk,88694
 quack/tensormap_manager.py,sha256=Ts3Mxp0_es2RNA0ffvUjWMXN79lsfWEBZ0DQYhtbcnw,5338
-quack/tile_scheduler.py,sha256=
+quack/tile_scheduler.py,sha256=BQ-SeW5wxulKuwmpq0CAIjkuirv4KWdUdoIGQB88aGE,42319
 quack/topk.py,sha256=RQl-23lIicQ9ry9Njur8i0JGem_WbO_Gchr6jy8EtVM,9185
-quack/utils.py,sha256=
+quack/utils.py,sha256=wOgNw9VL40FCsLwN52juPfk48zVpX-rta3MQhAQe8Wc,12767
 quack/varlen_utils.py,sha256=vkduMEpo5bJJvZRNnIcKPb6pp1wD34vaIpMIB0ZGIZA,681
 quack/sort/bitonic_sort.py,sha256=8t0SG1a6iEpYIlY8YM_AWvm4aN-4AA4vEzdBuJMJm9g,4768
 quack/sort/generate_sorting_networks.py,sha256=vkJBOjTVEinQkWT4OtFqOWxFVdTIPoNAQocneKc9-rM,14477
 quack/sort/sorting_networks.py,sha256=l_26zi3gXD_z-tnm2eAczRrmE-mbaz00KmqH6ONivL8,9686
 quack/sort/utils.py,sha256=Mkr-l97RMAV-ZoNrwuzA1U3KO0Wjr38CV9Jm7ScyZoI,1090
-quack_kernels-0.2.
-quack_kernels-0.2.
-quack_kernels-0.2.
-quack_kernels-0.2.
-quack_kernels-0.2.
+quack_kernels-0.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+quack_kernels-0.2.1.dist-info/METADATA,sha256=_AFigx6aFt-25GzUP6YWalDBwHvwzgK9EU85WjZXvsI,285
+quack_kernels-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+quack_kernels-0.2.1.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+quack_kernels-0.2.1.dist-info/RECORD,,
File without changes
File without changes
File without changes