quack-kernels 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quack_kernels-0.2.0/quack_kernels.egg-info → quack_kernels-0.2.2}/PKG-INFO +3 -3
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/pyproject.toml +2 -2
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/__init__.py +1 -1
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/activation.py +16 -25
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/autotuner.py +64 -5
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/cross_entropy.py +6 -10
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/cute_dsl_utils.py +6 -7
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/dense_gemm_sm90.py +582 -287
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/gemm_act_sm90.py +70 -29
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/gemm_dact_sm90.py +43 -10
- quack_kernels-0.2.2/quack/gemm_interface.py +892 -0
- quack_kernels-0.2.0/quack/dense_gemm_sm100.py → quack_kernels-0.2.2/quack/gemm_sm100.py +443 -419
- quack_kernels-0.2.2/quack/gemm_wrapper_utils.py +315 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/layernorm.py +1 -1
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/reduce.py +6 -7
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/rmsnorm.py +126 -158
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/softmax.py +1 -1
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/tile_scheduler.py +37 -49
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/utils.py +61 -71
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/varlen_utils.py +1 -6
- {quack_kernels-0.2.0 → quack_kernels-0.2.2/quack_kernels.egg-info}/PKG-INFO +3 -3
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack_kernels.egg-info/SOURCES.txt +3 -1
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack_kernels.egg-info/requires.txt +1 -1
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_linear.py +6 -1
- quack_kernels-0.2.2/tests/test_linear_varlen_k.py +266 -0
- quack_kernels-0.2.2/tests/test_linear_varlen_m.py +376 -0
- quack_kernels-0.2.0/quack/gemm_interface.py +0 -569
- quack_kernels-0.2.0/quack/gemm_wrapper_utils.py +0 -158
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/LICENSE +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/README.md +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/fast_math.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/gemm_config.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/linear.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/linear_cross_entropy.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/mlp.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/pipeline.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/reduction_base.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/sort/bitonic_sort.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/sort/generate_sorting_networks.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/sort/sorting_networks.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/sort/utils.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/symmetric_dense_gemm_sm90.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/tensormap_manager.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack/topk.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack_kernels.egg-info/dependency_links.txt +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/quack_kernels.egg-info/top_level.txt +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/setup.cfg +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_cross_entropy.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_layernorm.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_linear_cross_entropy.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_rmsnorm.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_softmax.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_symmetric_dense_gemm_sm90.py +0 -0
- {quack_kernels-0.2.0 → quack_kernels-0.2.2}/tests/test_topk.py +0 -0
PKG-INFO

@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: quack-kernels
-Version: 0.2.0
-Requires-Python: >=3.
+Version: 0.2.2
+Requires-Python: >=3.10
 License-File: LICENSE
-Requires-Dist: nvidia-cutlass-dsl==4.2.
+Requires-Dist: nvidia-cutlass-dsl==4.2.1
 Requires-Dist: torch
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
pyproject.toml

@@ -5,9 +5,9 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "quack-kernels"
 dynamic = ["version"]
-requires-python = ">=3.
+requires-python = ">=3.10"
 dependencies = [
-    "nvidia-cutlass-dsl==4.2.
+    "nvidia-cutlass-dsl==4.2.1",
     "torch",
 ]
 
quack/activation.py

@@ -6,23 +6,12 @@ from typing import Tuple
 import cutlass
 import cutlass.cute as cute
 from cutlass import Float32
-from cutlass.cutlass_dsl import
-from cutlass._mlir.dialects import llvm
+from cutlass.cutlass_dsl import dsl_user_op
 
 
 @dsl_user_op
-def
-    return
-        llvm.inline_asm(
-            T.f32(),
-            [Float32(a).ir_value(loc=loc, ip=ip)],
-            "tanh.approx.f32 $0, $1;",
-            "=f,f",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
+def sigmoid(x: Float32, *, loc=None, ip=None) -> Float32:
+    return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
 
 
 @dsl_user_op
@@ -67,7 +56,10 @@ def gelu_tanh_approx(x: Float32, *, loc=None, ip=None) -> Float32:
     """
     sqrt_2_over_pi = math.sqrt(2 / math.pi)  # ~0.797885
    sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # ~0.0356774
-    return 0.5 * (
+    return 0.5 * (
+        x
+        * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
+    )
 
 
 @dsl_user_op
@@ -88,7 +80,7 @@ def dgelu_tanh_approx(x: Float32, dout: Float32, *, loc=None, ip=None) -> Tuple[
 
     # Compute z = x * (c1 + c2 * x^2)
     x_sq = x * x
-    tanh_z = tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq))
+    tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
     half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
     gelu_out = x * half_tanh_z_plus_one
 
@@ -111,7 +103,7 @@ def silu(x: Float32, *, loc=None, ip=None) -> Float32:
     This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.
     """
     x_half = 0.5 * x
-    return x_half * tanh(x_half) + x_half
+    return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
 
 
 @dsl_user_op
@@ -134,8 +126,8 @@ def dswiglu(
     to use FFMA instead of FADD and FMUL).
     """
     # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
-
-    sigmoid_x =
+    # FMUL, MUFU.TANH, then FFMA
+    sigmoid_x = sigmoid(x)
     silu_x = x * sigmoid_x  # FMUL
     silu_x_dout = silu_x * dout  # FMUL
     # d_silu(x) * dout
@@ -161,7 +153,7 @@ def swiglu_oai(x: Float32, y: Float32, alpha: float = 1.702, *, loc=None, ip=Non
     """
     # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
     x_half = 0.5 * x
-    silu_x = x_half * tanh(alpha * x_half) + x_half
+    silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
     return silu_x * y + silu_x
 
 
@@ -179,7 +171,8 @@ def dswiglu_oai(
     """
     # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
     alpha_x_half = (0.5 * alpha) * x  # FMUL
-
+    # MUFU.TANH, then FFMA
+    sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
     silu_x = x * sigmoid_alpha_x  # FMUL
     silu_x_dout = silu_x * dout  # FMUL
     # FFMA, FFMA, FMUL
@@ -197,8 +190,7 @@ def glu(x: Float32, y: Float32, *, loc=None, ip=None) -> Float32:
     glu(x, y) = sigmoid(x) * y
     Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
     """
-
-    sigmoid_x = 0.5 + 0.5 * tanh(x_half)  # MUFU.TANH, then FFMA
+    sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
     return sigmoid_x * y  # FMUL
 
 
@@ -215,8 +207,7 @@ def dglu(
     - glu_out = sigmoid(x) * y
     """
     # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
-
-    sigmoid_x = 0.5 + 0.5 * tanh(x_half)  # MUFU.TANH, then FFMA
+    sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
     sigmoid_x_dout = sigmoid_x * dout  # FMUL
     glu_out = sigmoid_x * y  # FMUL
     # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
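A note on the activation.py changes above: the hand-written `tanh.approx.f32` inline assembly is dropped in favor of `cute.math.tanh(..., fastmath=True)`, while sigmoid and SiLU keep the tanh identities sigmoid(x) = 0.5 + 0.5*tanh(0.5*x) and silu(x) = 0.5*x*tanh(0.5*x) + 0.5*x (one FMUL, one MUFU.TANH, one FFMA on the GPU). The sketch below only checks those identities numerically with NumPy; it is illustrative and does not use the CuTe DSL.

    import numpy as np

    x = np.linspace(-6.0, 6.0, 101)

    # sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x)
    sigmoid_ref = 1.0 / (1.0 + np.exp(-x))
    sigmoid_tanh = 0.5 + 0.5 * np.tanh(0.5 * x)
    assert np.allclose(sigmoid_ref, sigmoid_tanh)

    # silu(x) = x * sigmoid(x) = 0.5*x*tanh(0.5*x) + 0.5*x
    x_half = 0.5 * x
    silu_ref = x * sigmoid_ref
    silu_tanh = x_half * np.tanh(x_half) + x_half
    assert np.allclose(silu_ref, silu_tanh)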
quack/autotuner.py

@@ -11,7 +11,7 @@ import hashlib
 import json
 from pathlib import Path
 from functools import cached_property, partial
-from typing import Dict, Tuple
+from typing import Dict, Tuple, List, Optional, Any
 
 import torch
 from torch import Tensor
@@ -53,7 +53,22 @@ def _base32(key):
 
 
 class Autotuner:
-    def __init__(
+    def __init__(
+        self,
+        fn,
+        key,
+        configs,
+        restore_value=None,
+        prune_configs_by: Optional[Dict] = None,
+        do_bench=None,
+        cache_results=False,
+    ):
+        """
+        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+            'perf_model': performance model used to predicate running time with different configs, returns running time
+            'top_k': number of configs to bench
+            'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs.
+        """
         if not configs:
             self.configs = [AutotuneConfig()]
         else:
@@ -90,6 +105,16 @@ class Autotuner:
         else:
             self.post_hook = None
 
+        self.perf_model = None
+        self.configs_top_k = 1.0
+        self.early_config_prune = None
+        if prune_configs_by:
+            self.perf_model = prune_configs_by.get("perf_model", self.perf_model)
+            self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k)
+            self.early_config_prune = prune_configs_by.get(
+                "early_config_prune", self.early_config_prune
+            )
+
         self.fn = fn
         self._do_bench = do_bench
 
@@ -198,13 +223,14 @@ class Autotuner:
         key = tuple(key)
         if key not in self.cache:
             used_cached_result = False
+            pruned_configs = self.prune_configs(kwargs)
 
             @torch.compiler.disable  # Don't want any tracing here
             def benchmark():
                 bench_start = time.time()
                 timings = {
                     config: self._bench(*args, config=config, **kwargs)
-                    for config in
+                    for config in pruned_configs
                 }
                 bench_end = time.time()
                 if os.getenv(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1":
@@ -215,7 +241,7 @@ class Autotuner:
             self.configs_timings = timings
 
             if self.cache_results:
-                self.check_disk_cache(key,
+                self.check_disk_cache(key, pruned_configs, benchmark)
             else:
                 benchmark()
 
@@ -239,6 +265,32 @@ class Autotuner:
         self.nargs = None
         return ret
 
+    def prune_configs(self, kwargs: Dict) -> List[Any]:
+        pruned_configs = self.configs
+        if self.early_config_prune:
+            pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
+        if self.perf_model:
+            top_k = self.configs_top_k
+            if isinstance(top_k, float) and top_k <= 1.0:
+                top_k = int(len(self.configs) * top_k)
+            elif not isinstance(top_k, int):
+                # Slice index must be an integer
+                raise TypeError(
+                    "Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int"
+                )
+
+            if len(pruned_configs) > top_k:
+                est_timing = {
+                    config: self.perf_model(
+                        **self.nargs,
+                        **kwargs,
+                        **config.all_kwargs(),
+                    )
+                    for config in pruned_configs
+                }
+                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
+        return pruned_configs
+
 
 class AutotuneConfig:
     """
@@ -272,7 +324,9 @@ class AutotuneConfig:
         return self_tuple == other_tuple
 
 
-def autotune(
+def autotune(
+    configs, key=None, prune_configs_by=None, restore_value=None, do_bench=None, cache_results=True
+):
     f"""
     Decorator for auto-tuning a function function.
 
@@ -286,6 +340,10 @@ def autotune(configs, key=None, restore_value=None, do_bench=None, cache_results
     :type configs: list[AutotuneConfig]
     :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
     :type key: list[str]
+    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+        'perf_model': performance model used to predicate running time with different configs, returns running time
+        'top_k': number of configs to bench
+        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
     :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
     :type restore_value: list[str]
     :param do_bench: a benchmark function to measure the time of each run.
@@ -303,6 +361,7 @@ def autotune(configs, key=None, restore_value=None, do_bench=None, cache_results
         key,
         configs,
         restore_value=restore_value,
+        prune_configs_by=prune_configs_by,
        do_bench=do_bench,
        cache_results=cache_results,
    )
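A note on the autotuner.py changes above: `autotune` now accepts a `prune_configs_by` dict (in the style of Triton's autotuner) with a `perf_model`, a `top_k`, and an optional `early_config_prune` callable, and configs are filtered through the new `Autotuner.prune_configs` before benchmarking. A minimal usage sketch follows; it assumes the names import from `quack.autotuner`, that `AutotuneConfig` accepts a kwargs dict like Triton's `Config`, and it uses a made-up kernel `my_kernel` and perf model purely for illustration.

    from quack.autotuner import autotune, AutotuneConfig  # assumed import path

    def flops_model(*, M, N, block_m, **kwargs):
        # Hypothetical perf model: lower estimated time ranks a config higher.
        return (M * N) / block_m

    @autotune(
        configs=[AutotuneConfig(dict(block_m=bm)) for bm in (64, 128, 256)],  # assumed ctor signature
        key=["M", "N"],
        prune_configs_by={"perf_model": flops_model, "top_k": 2},
    )
    def my_kernel(x, M, N, block_m=128):
        ...  # launch the actual kernel here

With this, only the two configs ranked best by `flops_model` are actually benchmarked for each new (M, N) key.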
quack/cross_entropy.py

@@ -199,11 +199,8 @@ class CrossEntropy(ReductionBase):
         cute.autovec_copy(tXsX, tXrX)
         x = tXrX.load().to(Float32)
         log2_e = math.log2(math.e)
-        # exp_x = cute.math.exp2((x - max_x) * log2_e, fastmath=True)
-        # a bit faster, probably because it's calling ex2.approx.ftz instead of ex2.approx?
-        # exp_x = utils.exp2f((x - max_x) * log2_e)
         # This would use ffma instead of fadd then fmul
-        exp_x =
+        exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=False)
         denom = row_reduce(
             exp_x,
             cute.ReductionOp.ADD,
@@ -228,8 +225,7 @@ class CrossEntropy(ReductionBase):
             and row < shape[0]
             and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
         ):
-
-            lse = max_x + utils.log2f(denom) * ln_2
+            lse = max_x + cute.math.log(denom, fastmath=True)
             # Set loss to 0 if this index should be ignored, otherwise compute normally
             loss_val = (lse - target_logit) if not should_ignore else Float32.zero
             mLoss[row] = mLoss.element_type(loss_val)
@@ -552,7 +548,7 @@ class CrossEntropyBackward:
         lse = Float32(mLSE[row])
 
         log2_e = math.log2(math.e)
-        probs =
+        probs = cute.math.exp2(x * log2_e - (lse * log2_e), fastmath=True)
         prob_shifted = probs - 1.0
         mask = cute.make_fragment_like(tXrX, cutlass.Boolean)
         for i in cutlass.range(cute.size(tXcFull), unroll_full=True):
@@ -594,9 +590,9 @@ def _cross_entropy_backward(
     assert x.shape[0] == target.shape[0], "Batch dimensions must match"
     assert x.shape[0] == dloss.shape[0], "Batch dimensions must match"
     assert x.shape[0] == lse.shape[0], "Batch dimensions must match"
-    assert (
-
-    )
+    assert x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda, (
+        "Tensors must be on CUDA device"
+    )
     assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
     assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
 
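A note on the cross_entropy.py changes above: the softmax denominator is computed in base 2, exp(x - max_x) = exp2(x*log2(e) - max_x*log2(e)), written so the subtract-and-scale can be fused into an FFMA, and the log-sum-exp then becomes lse = max_x + log(denom); the backward pass recovers the probabilities the same way from the stored lse. A plain NumPy sketch of that arithmetic (illustrative only, not the kernel code):

    import math
    import numpy as np

    logits = np.array([2.0, -1.0, 0.5, 3.0], dtype=np.float32)
    log2_e = math.log2(math.e)

    max_x = logits.max()
    # exp(x - max_x) via exp2, matching exp_x = exp2(x*log2_e - max_x*log2_e)
    exp_x = np.exp2(logits * log2_e - max_x * log2_e)
    denom = exp_x.sum()
    lse = max_x + math.log(denom)                     # forward: log-sum-exp
    probs = np.exp2(logits * log2_e - lse * log2_e)   # backward: softmax probabilities

    assert np.isclose(lse, np.log(np.exp(logits).sum()))
    assert np.allclose(probs, np.exp(logits) / np.exp(logits).sum())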
quack/cute_dsl_utils.py

@@ -98,22 +98,21 @@ class ArgumentsBase(JitArgument):
 
 
 def load_cubin_module_data_patched(cubin_data, filepath):
-
-    path.write_bytes(cubin_data)
+    pathlib.Path(filepath).write_bytes(cubin_data)
     return load_cubin_module_data_og(cubin_data)
 
 
 def cute_compile_patched(*args, **kwargs):
     """A patched version of cute.compile that dump the SASS to a file if CUTE_CUBIN_PATH is set."""
-
+    cubin_path = os.getenv("CUTE_CUBIN_PATH", None)
+    if cubin_path is not None:
         cutlass.base_dsl.runtime.cuda.load_cubin_module_data = partial(
-            load_cubin_module_data_patched, filepath=
+            load_cubin_module_data_patched, filepath=cubin_path
         )
     output = cute_compile_og(*args, **kwargs)
-    if
+    if cubin_path is not None:
         cutlass.base_dsl.runtime.cuda.load_cubin_module_data = load_cubin_module_data_og
         if extract is not None:
-            cubin_path = pathlib.Path(os.getenv("CUTE_CUBIN_PATH"))
             sass = extract(cubin_path, None)
-            cubin_path.with_suffix(".annotated.sass").write_text(sass)
+            pathlib.Path(cubin_path).with_suffix(".annotated.sass").write_text(sass)
     return output