quack-kernels 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/cute_dsl_utils.py CHANGED
@@ -1,8 +1,7 @@
 # Copyright (c) 2025, Tri Dao.
 
-import os
-import pathlib
-from functools import partial, lru_cache
+from typing import Tuple
+from functools import lru_cache
 
 from dataclasses import dataclass, fields
 
 import torch
@@ -14,6 +13,7 @@ except ImportError:
 
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Int64, Float16, BFloat16, Float32
 from cutlass.base_dsl.typing import JitArgument
 from cutlass.cutlass_dsl import NumericMeta
 
@@ -26,9 +26,11 @@ cute_compile_og = cute.compile
 
 
 torch2cute_dtype_map = {
-    torch.float16: cutlass.Float16,
-    torch.bfloat16: cutlass.BFloat16,
-    torch.float32: cutlass.Float32,
+    torch.float16: Float16,
+    torch.bfloat16: BFloat16,
+    torch.float32: Float32,
+    torch.int32: Int32,
+    torch.int64: Int64,
 }
 
 
@@ -37,6 +39,11 @@ def get_max_active_clusters(cluster_size):
     return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
 
 
+@lru_cache
+def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
+    return torch.cuda.get_device_capability(device)
+
+
 @dataclass
 class ParamsBase:
     def __extract_mlir_values__(self):
@@ -75,10 +82,14 @@ class ArgumentsBase(JitArgument):
     def __get_mlir_types__(self):
         all_fields = [getattr(self, field.name) for field in fields(self)]
         non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
-        types = []
+        types, self._values_pos = [], []
         for obj in non_constexpr_fields:
             if hasattr(obj, "__get_mlir_types__"):
-                types.extend(obj.__get_mlir_types__())
+                obj_types = obj.__get_mlir_types__()
+                types.extend(obj_types)
+                self._values_pos.append(len(obj_types))
+            else:
+                self._values_pos.append(0)
         return types
 
     def __new_from_mlir_values__(self, values):
@@ -87,32 +98,7 @@ class ArgumentsBase(JitArgument):
         non_constexpr_fields = {
             n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
         }
-        # for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
-        for name, field in non_constexpr_fields.items():
-            # non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
-            # values = values[n_items:]
-            n_items = 1
+        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
             non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
             values = values[n_items:]
         return self.__class__(**non_constexpr_fields, **constexpr_fields)
-
-
-def load_cubin_module_data_patched(cubin_data, filepath):
-    pathlib.Path(filepath).write_bytes(cubin_data)
-    return load_cubin_module_data_og(cubin_data)
-
-
-def cute_compile_patched(*args, **kwargs):
-    """A patched version of cute.compile that dump the SASS to a file if CUTE_CUBIN_PATH is set."""
-    cubin_path = os.getenv("CUTE_CUBIN_PATH", None)
-    if cubin_path is not None:
-        cutlass.base_dsl.runtime.cuda.load_cubin_module_data = partial(
-            load_cubin_module_data_patched, filepath=cubin_path
-        )
-    output = cute_compile_og(*args, **kwargs)
-    if cubin_path is not None:
-        cutlass.base_dsl.runtime.cuda.load_cubin_module_data = load_cubin_module_data_og
-        if extract is not None:
-            sass = extract(cubin_path, None)
-            pathlib.Path(cubin_path).with_suffix(".annotated.sass").write_text(sass)
-    return output
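
Why the `_values_pos` bookkeeping above matters: in 0.2.2, `__new_from_mlir_values__` hardcoded `n_items = 1`, so any field that flattens to more (or fewer) than one MLIR value would be rebuilt from the wrong slice of `values`. The new code records each field's value count during `__get_mlir_types__` and replays it during reconstruction. Below is a minimal, self-contained sketch of that round-trip in plain Python, using a hypothetical FakeField stand-in rather than the actual cutlass JitArgument machinery:

# Sketch only: FakeField mimics a nested field that contributes N MLIR values;
# a plain `object()` mimics a field that contributes none.
class FakeField:
    def __init__(self, n):
        self.n = n

    def __get_mlir_types__(self):
        return [f"type{i}" for i in range(self.n)]


class Args:
    def __init__(self, fields_):
        self.fields_ = fields_

    def get_mlir_types(self):
        # Record how many values each field contributes (the _values_pos fix).
        types, self._values_pos = [], []
        for obj in self.fields_:
            if hasattr(obj, "__get_mlir_types__"):
                obj_types = obj.__get_mlir_types__()
                types.extend(obj_types)
                self._values_pos.append(len(obj_types))
            else:
                self._values_pos.append(0)
        return types

    def rebuild(self, values):
        # Slice the flat value list per field using the recorded counts,
        # instead of assuming one value per field as 0.2.2 did.
        out = []
        for obj, n_items in zip(self.fields_, self._values_pos):
            out.append(values[:n_items])
            values = values[n_items:]
        return out


args = Args([FakeField(2), object(), FakeField(1)])
assert len(args.get_mlir_types()) == 3
assert args.rebuild(["a", "b", "c"]) == [["a", "b"], [], ["c"]]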
quack/gemm.py ADDED
@@ -0,0 +1,194 @@
+from typing import Optional
+from functools import partial
+
+from torch import Tensor
+
+import cutlass.cute as cute
+import cutlass.torch as cutlass_torch
+from cutlass import Float32
+from cutlass.cute.runtime import from_dlpack, make_ptr
+
+from quack.cute_dsl_utils import get_device_capacity, get_max_active_clusters
+from quack.gemm_wrapper_utils import GemmWrapperBase
+from quack.gemm_default_epi import GemmDefaultSm90, GemmDefaultSm100
+
+
+def gemm(
+    # (l, m, k) or (total_m, k) if varlen_m or (m, total_k) if varlen_k or (whatever, k) if gather_A_varlen_m or (m, whatever) if gather_A_varlen_k
+    A: Tensor,
+    B: Tensor,  # (l, n, k) or (n, total_k) if varlen_k
+    D: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
+    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    tile_count_semaphore: Optional[Tensor],  # (1,)
+    tile_M: int,
+    tile_N: int,
+    cluster_M: int,
+    cluster_N: int,
+    pingpong: bool = False,
+    persistent: bool = True,
+    max_swizzle_size: int = 8,
+    rowvec_bias: Optional[Tensor] = None,  # (l, n)
+    colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
+    alpha: float | Tensor = 1.0,
+    beta: float | Tensor = 1.0,
+    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
+    cu_seqlens_k: Optional[Tensor] = None,  # (l+1,) cumulative sum of k values for variable length
+    A_idx: Optional[Tensor] = None,  # (total_m,) or (total_k,) indices for gather_A when varlen
+    batch_idx_permute: Optional[Tensor] = None,  # (l,) permutation of batch indices for scheduler
+    add_to_output: bool = False,
+) -> None:
+    varlen = cu_seqlens_m is not None or cu_seqlens_k is not None
+    assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
+        "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
+    )
+    gather_A = A_idx is not None
+    if gather_A:
+        assert varlen, "gather_A requires varlen (cu_seqlens_m or cu_seqlens_k must be specified)"
+        assert cluster_N == 1, "gather_A requires cluster_N=1"
+    if varlen:
+        assert persistent, "varlen requires persistent=True"
+    if add_to_output:
+        assert cu_seqlens_m is None, "Add to output not supported with varlen_m"
+    if cu_seqlens_m is not None:
+        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
+        assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
+    if cu_seqlens_k is not None:
+        assert A.stride(-2) == 1, "varlen_k requires A to be m-major"
+        assert B.stride(-2) == 1, "varlen_k requires B to be n-major"
+
+    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
+        A, B, D, C, cu_seqlens_m=cu_seqlens_m, cu_seqlens_k=cu_seqlens_k, A_idx=A_idx
+    )
+    GemmWrapperBase.permute_tensors(
+        tensor_infos, varlen_m=cu_seqlens_m is not None, varlen_k=cu_seqlens_k is not None
+    )
+    GemmWrapperBase.extract_dtypes(tensor_infos)
+    major_configs = {
+        "A": ("m", "k", "l"),
+        "B": ("n", "k", "l"),
+        "D": ("m", "n", "l"),
+        "C": ("m", "n", "l"),
+    }
+    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
+
+    device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
+    GemmCls = GemmDefaultSm100 if device_capacity[0] > 9 else GemmDefaultSm90
+
+    acc_dtype = Float32
+    tile_shape_mn = (tile_M, tile_N)
+    cluster_shape_mnk = (cluster_M, cluster_N, 1)
+    if not GemmCls.is_valid_dtypes(
+        tensor_infos["A"].dtype,
+        tensor_infos["B"].dtype,
+        acc_dtype,
+        tensor_infos["D"].dtype,
+        tensor_infos["A"].major,
+        tensor_infos["B"].major,
+    ):
+        raise TypeError("Skipping due to unsupported combination of types and majors")
+
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
+
+    def scalar_arg(scalar: float | Tensor):
+        if isinstance(scalar, float):
+            return Float32(scalar) if scalar != 1.0 else None
+        else:
+            assert isinstance(scalar, Tensor)
+            return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
+
+    epi_args = GemmCls.EpilogueArguments(
+        scalar_arg(alpha),
+        scalar_arg(beta),
+        mRowVecBroadcast=from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
+            leading_dim=1
+        )
+        if rowvec_bias is not None
+        else None,
+        mColVecBroadcast=from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
+            leading_dim=1 if cu_seqlens_m is None else 0
+        )
+        if colvec_bias is not None
+        else None,
+        add_to_output=add_to_output,
+    )
+    scheduler_args = GemmWrapperBase.create_scheduler_args(
+        max_active_clusters,
+        tile_count_semaphore,
+        batch_idx_permute,
+        max_swizzle_size,
+    )
+
+    # Create varlen arguments if needed (assumes persistent=True when varlen)
+    varlen_args = GemmWrapperBase.create_varlen_args(
+        cu_seqlens_m,
+        cu_seqlens_k,
+        A_idx,
+        max_active_clusters,
+        cluster_shape_mnk,
+        tensor_infos,
+        GemmCls.num_epi_tensormaps,
+        pingpong,
+    )
+
+    current_stream = cutlass_torch.current_stream()
+    compile_key = GemmWrapperBase.get_compile_key(
+        tensor_infos,
+        None,  # activation
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        tile_count_semaphore is not None,
+        device_capacity,
+        # Technically we don't need to recompile for different max_swizzle_size, but currently
+        # not recompiling will skew the autotuning results due to power throttling.
+        # Effectively we're recompiling as a way to pause between benchmarks during autotuning.
+        max_swizzle_size,
+        rowvec_bias.dtype if rowvec_bias is not None else None,
+        colvec_bias.dtype if colvec_bias is not None else None,
+        2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
+        2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
+        add_to_output,
+        cu_seqlens_m is not None,
+        cu_seqlens_k is not None,
+        gather_A,
+        batch_idx_permute is not None,
+        key_tensor_names=("A", "B", "D", "C"),
+    )
+    cache = gemm.compile_cache
+    if compile_key not in cache:
+        if device_capacity[0] == 9:
+            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
+        gemm_obj = GemmCls(
+            acc_dtype,
+            tensor_infos["A"].dtype,
+            tile_shape_mn,
+            cluster_shape_mnk,
+            gather_A=gather_A,
+        )
+        cache[compile_key] = cute.compile(
+            gemm_obj,
+            tensor_infos["A"].cute_tensor,
+            tensor_infos["B"].cute_tensor,
+            tensor_infos["D"].cute_tensor,
+            tensor_infos["C"].cute_tensor,
+            epi_args,
+            scheduler_args,
+            varlen_args,
+            current_stream,
+        )
+    cache[compile_key](
+        tensor_infos["A"].cute_tensor,
+        tensor_infos["B"].cute_tensor,
+        tensor_infos["D"].cute_tensor,
+        tensor_infos["C"].cute_tensor,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+        current_stream,
+    )
+
+
+gemm.compile_cache = {}
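
For orientation, here is a hedged usage sketch of the new `quack.gemm.gemm` entry point, inferred only from the signature and shape comments above. The tile and cluster sizes are illustrative, not tuned values, and given B's (l, n, k) layout plus the alpha/beta epilogue arguments, the call appears to compute D = alpha * A @ B^T (+ beta * C) per batch:

# Sketch under the assumptions stated above; requires an SM90/SM100 GPU.
import torch
from quack.gemm import gemm

l, m, n, k = 4, 1024, 1024, 512
A = torch.randn(l, m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(l, n, k, device="cuda", dtype=torch.bfloat16)  # note: (l, n, k), not (l, k, n)
D = torch.empty(l, m, n, device="cuda", dtype=torch.bfloat16)

gemm(
    A, B, D,
    C=None,                     # no residual input
    tile_count_semaphore=None,  # no tile-count scheduling
    tile_M=128, tile_N=192,     # illustrative tile shape
    cluster_M=2, cluster_N=1,   # illustrative cluster shape
)

Per the assertions in the function body, the variable-length paths additionally require persistent=True and exactly one of cu_seqlens_m / cu_seqlens_k, and gather_A (via A_idx) further requires cluster_N=1.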