quack-kernels 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
quack/autotuner.py CHANGED
@@ -11,7 +11,7 @@ import hashlib
  import json
  from pathlib import Path
  from functools import cached_property, partial
- from typing import Dict, Tuple
+ from typing import Dict, Tuple, List, Optional, Any

  import torch
  from torch import Tensor
@@ -53,7 +53,22 @@ def _base32(key):


  class Autotuner:
-     def __init__(self, fn, key, configs, restore_value=None, do_bench=None, cache_results=False):
+     def __init__(
+         self,
+         fn,
+         key,
+         configs,
+         restore_value=None,
+         prune_configs_by: Optional[Dict] = None,
+         do_bench=None,
+         cache_results=False,
+     ):
+         """
+         :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+             'perf_model': performance model used to predict running time with different configs, returns running time
+             'top_k': number of configs to bench
+             'early_config_prune' (optional): a function used to do early pruning (e.g. of num_stages). It takes configs: List[Config] as its input, and returns pruned configs.
+         """
          if not configs:
              self.configs = [AutotuneConfig()]
          else:
@@ -90,6 +105,16 @@ class Autotuner:
          else:
              self.post_hook = None

+         self.perf_model = None
+         self.configs_top_k = 1.0
+         self.early_config_prune = None
+         if prune_configs_by:
+             self.perf_model = prune_configs_by.get("perf_model", self.perf_model)
+             self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k)
+             self.early_config_prune = prune_configs_by.get(
+                 "early_config_prune", self.early_config_prune
+             )
+
          self.fn = fn
          self._do_bench = do_bench

@@ -198,13 +223,14 @@ class Autotuner:
              key = tuple(key)
              if key not in self.cache:
                  used_cached_result = False
+                 pruned_configs = self.prune_configs(kwargs)

                  @torch.compiler.disable  # Don't want any tracing here
                  def benchmark():
                      bench_start = time.time()
                      timings = {
                          config: self._bench(*args, config=config, **kwargs)
-                         for config in self.configs
+                         for config in pruned_configs
                      }
                      bench_end = time.time()
                      if os.getenv(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1":
@@ -215,7 +241,7 @@ class Autotuner:
                      self.configs_timings = timings

                  if self.cache_results:
-                     self.check_disk_cache(key, self.configs, benchmark)
+                     self.check_disk_cache(key, pruned_configs, benchmark)
                  else:
                      benchmark()

@@ -239,6 +265,32 @@ class Autotuner:
          self.nargs = None
          return ret

+     def prune_configs(self, kwargs: Dict) -> List[Any]:
+         pruned_configs = self.configs
+         if self.early_config_prune:
+             pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
+         if self.perf_model:
+             top_k = self.configs_top_k
+             if isinstance(top_k, float) and top_k <= 1.0:
+                 top_k = int(len(self.configs) * top_k)
+             elif not isinstance(top_k, int):
+                 # Slice index must be an integer
+                 raise TypeError(
+                     "Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int"
+                 )
+
+             if len(pruned_configs) > top_k:
+                 est_timing = {
+                     config: self.perf_model(
+                         **self.nargs,
+                         **kwargs,
+                         **config.all_kwargs(),
+                     )
+                     for config in pruned_configs
+                 }
+                 pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
+         return pruned_configs
+

  class AutotuneConfig:
      """
@@ -272,7 +324,9 @@ class AutotuneConfig:
          return self_tuple == other_tuple


- def autotune(configs, key=None, restore_value=None, do_bench=None, cache_results=True):
+ def autotune(
+     configs, key=None, prune_configs_by=None, restore_value=None, do_bench=None, cache_results=True
+ ):
      f"""
      Decorator for auto-tuning a function function.

@@ -286,6 +340,10 @@ def autotune(configs, key=None, restore_value=None, do_bench=None, cache_results
      :type configs: list[AutotuneConfig]
      :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
      :type key: list[str]
+     :param prune_configs_by: a dict of functions that are used to prune configs, fields:
+         'perf_model': performance model used to predict running time with different configs, returns running time
+         'top_k': number of configs to bench
+         'early_config_prune' (optional): a function used to do early pruning (e.g. of num_stages). It takes configs: List[Config] as its input, and returns pruned configs.
      :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
      :type restore_value: list[str]
      :param do_bench: a benchmark function to measure the time of each run.
@@ -303,6 +361,7 @@ def autotune(configs, key=None, restore_value=None, do_bench=None, cache_results
              key,
              configs,
              restore_value=restore_value,
+             prune_configs_by=prune_configs_by,
              do_bench=do_bench,
              cache_results=cache_results,
          )
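
For orientation, a hypothetical use of the new prune_configs_by hook could look like the sketch below; the kernel, config fields, and cost model are illustrative only and not taken from the package:

    from quack.autotuner import autotune, AutotuneConfig

    def estimate_ms(**kwargs):
        # Toy cost model: receives the tuned function's named arguments plus each
        # config's fields (config.all_kwargs()) and returns an estimated runtime.
        return float(kwargs.get("M", 1))

    def drop_half(configs, nargs, **kwargs):
        # Early prune: gets the full config list and the call's named arguments,
        # and returns the subset worth benchmarking.
        return configs[: max(1, len(configs) // 2)]

    @autotune(
        configs=[AutotuneConfig(), AutotuneConfig()],  # real configs would carry tuning knobs
        key=["M"],
        prune_configs_by={
            "perf_model": estimate_ms,
            "top_k": 2,
            "early_config_prune": drop_half,
        },
    )
    def my_kernel(x, M=None):
        ...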
quack/broadcast_utils.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) 2025, Tri Dao.
+ from typing import Callable
+ 
+ import cutlass
+ import cutlass.cute as cute
+ from cutlass import Float32, const_expr
+ 
+ from quack.layout_utils import make_acc_tensor_mn_view
+ 
+ 
+ @cute.jit
+ def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
+     if const_expr(tCrC.element_type != Float32):  # Convert to f32
+         tCrC_f32 = cute.make_fragment(tCrC.shape, Float32)
+         tCrC_f32.store(tCrC.load().to(Float32))
+     else:
+         tCrC_f32 = tCrC
+     # this happens to work for frgA layout too, not just acc layout
+     tCrC_f32_mn = make_acc_tensor_mn_view(tCrC_f32)
+     if const_expr(is_colvec):
+         assert cute.size(tCrC_f32_mn, mode=[0]) == cute.size(tCrVec)
+         for r in cutlass.range(cute.size(tCrC_f32_mn, mode=[0]), unroll_full=True):
+             tCrC_f32_mn[r, None].store(op(tCrC_f32_mn[r, None].load(), tCrVec[r]))
+     else:
+         assert cute.size(tCrC_f32_mn, mode=[1]) == cute.size(tCrVec)
+         for c in cutlass.range(cute.size(tCrC_f32_mn, mode=[1]), unroll_full=True):
+             tCrC_f32_mn[None, c].store(op(tCrC_f32_mn[None, c].load(), tCrVec[c]))
+     if const_expr(tCrC.element_type != Float32):  # Convert back to original dtype
+         tCrC.store(tCrC_f32.load().to(tCrC.element_type))
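
In plain NumPy terms, vec_op combines the (M, N) accumulator tile with either a length-M vector (is_colvec=True, one value per row) or a length-N vector (one value per column), doing the arithmetic in fp32 and casting back to the accumulator dtype. A rough reference model of the semantics only, not of how the CuTe kernel executes:

    import numpy as np

    def vec_op_reference(acc, vec, op, is_colvec):
        # acc: (M, N) tile; vec: length M if is_colvec else length N.
        acc_f32 = acc.astype(np.float32)
        vec_f32 = np.asarray(vec, dtype=np.float32)
        out = op(acc_f32, vec_f32[:, None]) if is_colvec else op(acc_f32, vec_f32[None, :])
        return out.astype(acc.dtype)

    acc = np.arange(12, dtype=np.float16).reshape(3, 4)
    bias = np.array([10.0, 20.0, 30.0, 40.0])
    out = vec_op_reference(acc, bias, np.add, is_colvec=False)  # one bias value per column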
quack/compile_utils.py ADDED
@@ -0,0 +1,19 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+ 
+ from typing import Optional
+ 
+ import cutlass.cute as cute
+ 
+ 
+ def make_fake_tensor(dtype, shape, divisibility=1, leading_dim=-1) -> Optional[cute.Tensor]:
+     if leading_dim < 0:
+         leading_dim = len(shape) + leading_dim
+     if dtype is None:
+         return None
+     stride = tuple(
+         cute.sym_int64(divisibility=divisibility) if i != leading_dim else 1
+         for i in range(len(shape))
+     )
+     return cute.runtime.make_fake_tensor(
+         dtype, shape, stride=stride, assumed_align=divisibility * dtype.width // 8
+     )
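
A rough usage sketch of the helper above, assuming a cutlass dtype such as cutlass.BFloat16 is available (the shapes and divisibility below are illustrative): it describes an operand whose leading dimension is contiguous and whose other strides are only known to be divisible by the given factor, and it returns None when the operand is absent.

    import cutlass
    from quack.compile_utils import make_fake_tensor

    # (M, N) operand, contiguous in its last dim, other strides divisible by 8 elements.
    fake_x = make_fake_tensor(cutlass.BFloat16, (128, 256), divisibility=8, leading_dim=-1)

    # dtype=None marks an optional operand as absent.
    assert make_fake_tensor(None, (128, 256)) is None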
quack/copy_utils.py ADDED
@@ -0,0 +1,487 @@
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+ 
+ import re
+ from typing import Optional, Type, Tuple, Callable
+ 
+ import cutlass
+ import cutlass.cute as cute
+ 
+ from cutlass import Int32, Boolean, const_expr
+ from cutlass.cute.nvgpu import cpasync
+ from cutlass.cutlass_dsl import dsl_user_op
+ import cutlass.pipeline
+ 
+ 
+ @dsl_user_op
+ def cvt_copy(
+     atom: cute.CopyAtom,
+     src: cute.Tensor,
+     dst: cute.Tensor,
+     *,
+     pred: Optional[cute.Tensor] = None,
+     loc=None,
+     ip=None,
+     **kwargs,
+ ) -> None:
+     assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
+     if const_expr(src.element_type != dst.element_type):
+         src_cvt = cute.make_fragment_like(src, dst.element_type)
+         src_cvt.store(src.load().to(dst.element_type))
+         src = src_cvt
+     cute.copy(atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+ 
+ 
+ @dsl_user_op
+ def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
+     dst = cute.make_fragment_like(src, src.element_type, loc=loc, ip=ip)
+     cute.autovec_copy(src, dst, loc=loc, ip=ip)
+     return dst
+ 
+ 
+ @dsl_user_op
+ def load_s2r_retile(
+     tiled_copy: cute.TiledCopy,
+     src: cute.Tensor,
+     dst_shape: cute.Tensor | cute.Shape,
+     *,
+     loc=None,
+     ip=None,
+ ) -> cute.Tensor:
+     # Will also accept dst_shape being a tensor, in which case we write into that tensor
+     if const_expr(not isinstance(dst_shape, cute.Tensor)):
+         dst = cute.make_fragment(dst_shape, src.element_type, loc=loc, ip=ip)
+     else:
+         dst = dst_shape
+     cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
+     return dst
+ 
+ 
+ @dsl_user_op
+ def get_copy_atom(
+     dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
+ ) -> cute.CopyAtom:
+     num_copy_bits = const_expr(min(128, num_copy_elems * dtype.width))
+     copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
+     return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+ 
+ 
+ @dsl_user_op
+ def copy(
+     src: cute.Tensor,
+     dst: cute.Tensor,
+     *,
+     pred: Optional[cute.Tensor] = None,
+     is_async: bool = False,
+     loc=None,
+     ip=None,
+     **kwargs,
+ ) -> None:
+     num_copy_elems = src.shape[0][0]
+     copy_atom = get_copy_atom(src.element_type, num_copy_elems, is_async)
+     cute.copy(copy_atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+ 
+ 
+ def tiled_copy_1d(
+     dtype: Type[cutlass.Numeric], num_threads: int, num_copy_elems: int = 1, is_async: bool = False
+ ) -> cute.TiledCopy:
+     num_copy_bits = num_copy_elems * dtype.width
+     copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
+     copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+     thr_layout = cute.make_layout(num_threads)
+     val_layout = cute.make_layout(num_copy_elems)
+     return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
+ 
+ 
+ def tiled_copy_2d(
+     dtype: Type[cutlass.Numeric],
+     threads_per_row: int,
+     num_threads: int,
+     num_copy_elems: int = 1,
+     is_async: bool = False,
+ ) -> cute.TiledCopy:
+     num_copy_bits = num_copy_elems * dtype.width
+     copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
+     copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+     assert num_threads % threads_per_row == 0
+     thr_layout = cute.make_ordered_layout(
+         (num_threads // threads_per_row, threads_per_row),
+         order=(1, 0),
+     )
+     val_layout = cute.make_layout((1, num_copy_elems))
+     return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
+ 
+ 
+ @cute.jit
+ def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
+     # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
+     tApA = cute.make_fragment(
+         cute.make_layout(
+             (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
+             stride=(cute.size(tAcA, mode=[2]), 0, 1),
+         ),
+         Boolean,
+     )
+     for rest_v in cutlass.range_constexpr(tApA.shape[0]):
+         for rest_k in cutlass.range_constexpr(tApA.shape[2]):
+             tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
+     return tApA
+ 
+ 
+ # def tiled_copy_2d(
+ #     dtype: Type[cutlass.Numeric], major_mode_size: int, num_threads: int, is_async: bool = False
+ # ) -> cute.TiledCopy:
+ #     num_copy_bits = math.gcd(major_mode_size, 128 // dtype.width) * dtype.width
+ #     copy_elems = num_copy_bits // dtype.width
+ #     copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
+ #     copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
+ #     gmem_threads_per_row = major_mode_size // copy_elems
+ #     assert num_threads % gmem_threads_per_row == 0
+ #     thr_layout = cute.make_ordered_layout(
+ #         (num_threads // gmem_threads_per_row, gmem_threads_per_row),
+ #         order=(1, 0),
+ #     )
+ #     val_layout = cute.make_layout((1, copy_elems))
+ #     return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
+ 
+ 
+ def parse_swizzle_from_pointer(ptr: cute.Pointer) -> Tuple[int, int, int]:
+     """Extract swizzle parameters from a pointer's swizzle_type.
+ 
+     The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
+     b, m, s are the swizzle parameters (bits, base, shift).
+ 
+     Returns:
+         The tuple (b, m, s) of swizzle parameters
+ 
+     Raises:
+         ValueError: If the swizzle_type string cannot be parsed
+     """
+     # Ideally there should be a better API to get swizzle parameters, but we'll just parse
+     # the string here.
+     swizzle_str = str(ptr.type.swizzle_type)
+     # Extract the inner part "S<b,m,s>"
+     match = re.search(r"S<(\d+),(\d+),(\d+)>", swizzle_str)
+     if match:
+         b, m, s = int(match.group(1)), int(match.group(2)), int(match.group(3))
+         return b, m, s
+     else:
+         raise ValueError(f"Could not parse swizzle_type: {swizzle_str}")
+ 
+ 
+ def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
+     bit_msk = (1 << b) - 1
+     yyy_msk = bit_msk << (m + s)
+     return ptr_int ^ ((ptr_int & yyy_msk) >> s)
+ 
+ 
+ def swizzle_ptr(ptr: cute.Pointer):
+     b, m, s = parse_swizzle_from_pointer(ptr)
+     ptr_int = swizzle_int(ptr.toint(), b, m, s)
+     return cute.make_ptr(ptr.dtype, ptr_int, ptr.memspace, assumed_align=ptr.alignment)
+ 
+ 
+ def as_position_independent_swizzle_tensor(tensor: cute.Tensor) -> cute.Tensor:
+     outer = tensor.layout
+     width = tensor.element_type.width
+     inner = cute.make_swizzle(*parse_swizzle_from_pointer(tensor.iterator))
+     # Need to recast the swizzle from byte units (e.g. <3, 4, 3>) to element units
+     # (e.g. <3, 3, 3> for 16 bits and <3, 2, 3> for 32 bits)
+     new_layout = cute.recast_layout(
+         width, 8, cute.make_composed_layout(inner, 0, cute.recast_layout(8, width, outer))
+     )
+     # recast_ptr to remove the pointer swizzle
+     return cute.make_tensor(cute.recast_ptr(tensor.iterator, dtype=tensor.element_type), new_layout)
+ 
+ 
+ def partition_D_position_independent(
+     thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
+ ) -> cute.Tensor:
+     return cute.make_tensor(
+         swizzle_ptr(thr_copy.partition_D(tensor).iterator),
+         thr_copy.partition_D(as_position_independent_swizzle_tensor(tensor)).layout,
+     )
+ 
+ 
+ def partition_S_position_independent(
+     thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
+ ) -> cute.Tensor:
+     return cute.make_tensor(
+         swizzle_ptr(thr_copy.partition_S(tensor).iterator),
+         thr_copy.partition_S(as_position_independent_swizzle_tensor(tensor)).layout,
+     )
+ 
+ 
+ @dsl_user_op
+ def sm90_get_smem_load_op(
+     layout_c: cutlass.utils.LayoutEnum,
+     elem_ty_c: Type[cutlass.Numeric],
+     *,
+     loc=None,
+     ip=None,
+ ) -> cute.CopyAtom:
+     """
+     Selects the largest vectorized smem load atom available subject to constraint of gmem layout.
+ 
+     Parameters:
+     -----------
+     layout_c : LayoutEnum
+         The layout enum of the output tensor D.
+ 
+     elem_ty_c : Type[Numeric]
+         The element type for output tensor D.
+ 
+     Returns:
+     --------
+     Either SmemLoadMatrix or SimtSyncCopy, based on the input parameters.
+     """
+ 
+     if not isinstance(elem_ty_c, cutlass.cutlass_dsl.NumericMeta):
+         raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
+     is_m_major = layout_c.is_m_major_c()
+     if elem_ty_c.width == 16:
+         return cute.make_copy_atom(
+             cute.nvgpu.warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip
+         )
+     else:
+         return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
+ 
+ 
+ def get_smem_store_atom(
+     arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
+ ) -> cute.CopyAtom:
+     if const_expr(arch < 90 or element_type.width != 16):
+         return cute.make_copy_atom(
+             cute.nvgpu.CopyUniversalOp(),
+             element_type,
+             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
+         )
+     else:
+         return cute.make_copy_atom(
+             cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
+             element_type,
+         )
+ 
+ 
+ def tma_get_copy_fn(
+     atom: cute.CopyAtom,
+     cta_coord: cute.Coord,
+     cta_layout: cute.Layout,
+     src_tensor: cute.Tensor,
+     dst_tensor: cute.Tensor,
+     filter_zeros: bool = False,
+     **kwargs,
+ ) -> Callable:
+     src_is_smem = const_expr(
+         isinstance(src_tensor.iterator, cute.Pointer)
+         and src_tensor.memspace == cute.AddressSpace.smem
+     )
+     smem_tensor, gmem_tensor = (src_tensor, dst_tensor) if src_is_smem else (dst_tensor, src_tensor)
+     # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
+     s, g = cpasync.tma_partition(
+         atom,
+         cta_coord,
+         cta_layout,
+         cute.group_modes(smem_tensor, 0, cute.rank(smem_tensor) - 1),
+         cute.group_modes(gmem_tensor, 0, cute.rank(gmem_tensor) - 1),
+     )
+     if const_expr(filter_zeros):
+         s = cute.filter_zeros(s)
+         g = cute.filter_zeros(g)
+     src, dst = (s, g) if src_is_smem else (g, s)
+ 
+     def copy_tma(src_idx, dst_idx, **new_kwargs):
+         cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)
+ 
+     return copy_tma, s, g
+ 
+ 
+ def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
+     def copy_fn(src_idx, producer_state: cutlass.pipeline.PipelineState, **new_kwargs):
+         copy(
+             src_idx=src_idx,
+             dst_idx=producer_state.index,
+             tma_bar_ptr=pipeline.producer_get_barrier(producer_state),
+             **new_kwargs,
+         )
+ 
+     return copy_fn
+ 
+ 
+ @cute.jit
+ def gather_m_get_copy_fn(
+     thr_copy_A: cute.ThrCopy,
+     mA: cute.Tensor,  # (whatever, K)
+     sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
+     gsAIdx: cute.Tensor,  # (tile_M), either gmem or smem
+     limit_m: Int32,
+     limit_k: Int32,
+ ) -> Callable:
+     tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
+     tAsA = thr_copy_A.partition_D(sA)
+     # k-major
+     assert tAsA.shape[2] == 1
+     tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
+ 
+     is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
+     if const_expr(not is_even_m_smem):
+         limit_m = min(limit_m, tile_shape_mk[0])
+     elems_per_load = cute.size(tAsA.shape[0][0])
+     cA = cute.make_identity_tensor(tile_shape_mk)
+     tAcA = thr_copy_A.partition_S(cA)
+     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
+     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
+     # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
+     # This is so that when we do the comparison, t0AcA is known at compile time.
+     limit_m = limit_m - tAcA[0][0]
+     limit_k = limit_k - tAcA[0][1]
+     # Read and cache indices for A
+     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
+     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
+     tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+     for m in cutlass.range(rows_per_thread, unroll_full=True):
+         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
+     m_idx = cute.make_fragment(rows_per_thread, Int32)
+     for m in cutlass.range(rows_per_thread, unroll_full=True):
+         row_idx = tAcA[0, m, 0][0]
+         if tApA_m[m]:
+             m_idx[m] = gsAIdx[row_idx]
+         else:
+             m_idx[m] = 0  # It's ok to load row 0 in the case of OOB
+ 
+     mA_k = cute.logical_divide(mA, (None, tile_shape_mk[1]))
+ 
+     def copy_fn(src_idx, dst_idx, pred: bool = False):
+         tApA_k = None
+         if const_expr(pred):
+             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
+             for k in cutlass.range(cols_per_thread, unroll_full=True):
+                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
+         mA_cur = mA_k[None, (None, src_idx)]
+         for m in cutlass.range_constexpr(tAcA.shape[1]):
+             # cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,)) would give shape
+             # ((elems_per_load), thread_per_row)
+             # But we actually want shape ((elems_per_load, 1), thread_per_row) to match tAsA
+             # So we append 1s to the last dimension and then do tiled_divide, then slice.
+             mA_row = cute.tiled_divide(
+                 cute.append_ones(mA_cur[m_idx[m], None], up_to_rank=2), (elems_per_load, 1)
+             )[None, None, 0]
+             if const_expr(is_even_m_smem) or tApA_m[m]:
+                 # There's only 1 load per row
+                 assert cute.size(tAcA.shape, mode=[2]) == 1
+                 ki = tAcA[0, 0, 0][1] // elems_per_load
+                 cute.copy(thr_copy_A, mA_row[None, ki], tAsA[(None, m), dst_idx], pred=tApA_k)
+ 
+     return copy_fn
+ 
+ 
+ @cute.jit
+ def gather_k_get_copy_fn(
+     thr_copy_A: cute.ThrCopy,
+     mA: cute.Tensor,  # (tile_M, whatever)
+     sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
+     gsAIdx: cute.Tensor,  # (tile_K, RestK), either gmem or smem
+     limit_m: Int32,
+     limit_k: Int32,
+ ) -> Callable:
+     gAIdx, sAIdx = None, None
+     if const_expr(gsAIdx.memspace == cute.AddressSpace.gmem):
+         gAIdx = gsAIdx
+     else:
+         assert gsAIdx.memspace == cute.AddressSpace.smem
+         sAIdx = gsAIdx
+     tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
+     # (atom_v, CPY_M, 1, STAGE)
+     tAsA = thr_copy_A.partition_D(sA)
+     # m-major
+     tAsA = cute.group_modes(tAsA, 0, 3)
+ 
+     is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
+     if const_expr(not is_even_m_smem):
+         limit_m = min(limit_m, tile_shape_mk[0])
+     elems_per_load = cute.size(tAsA.shape[0][0])
+     cA = cute.make_identity_tensor(tile_shape_mk)
+     tAcA = thr_copy_A.partition_S(cA)
+     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
+     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
+     # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
+     # This is so that when we do the comparison, t0AcA is known at compile time.
+     limit_m = limit_m - tAcA[0][0]
+     limit_k = limit_k - tAcA[0][1]
+     # Read and cache indices for A
+     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
+     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
+     tApA_m = cute.make_fragment(rows_per_thread, Boolean)
+     for m in cutlass.range(rows_per_thread, unroll_full=True):
+         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
+     threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
+     # This is very convoluted but idk a better way
+     # for tile_M=128, flat_divide gives (8, 16, K),
+     # then logical_divide gives ((8, 1), (8, 2), K).
+     tidx = thr_copy_A.thr_idx
+     tAmA = cute.logical_divide(
+         cute.flat_divide(mA, (elems_per_load,)), (elems_per_load, threads_per_col)
+     )[None, (tidx % threads_per_col, None), None]  # ((8, 1), 2, K)
+ 
+     def prefetch_from_gmem_fn(src_idx, pred: bool = False) -> Tuple[cute.Tensor, cute.Tensor]:
+         # Prefetch mAIdx early, even before smem is free
+         tApA_k = None
+         if const_expr(pred):
+             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
+             for k in cutlass.range(cols_per_thread, unroll_full=True):
+                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
+         gAIdx_cur = gAIdx[None, src_idx]
+         k_idx = cute.make_fragment(cols_per_thread, Int32)
+         for k in cutlass.range(cols_per_thread):
+             col_idx = tAcA[0, 0, k][1]
+             if const_expr(not pred):
+                 k_idx[k] = gAIdx_cur[col_idx]
+             else:
+                 if tApA_k[k]:
+                     k_idx[k] = gAIdx_cur[col_idx]
+                 else:
+                     k_idx[k] = -1
+         return k_idx, tApA_k
+ 
+     def prefetch_from_smem_fn(
+         a_prefetch_pipeline, src_idx, dst_idx, a_prefetch_consumer_state, pred: bool = False
+     ) -> Tuple[cute.Tensor, cute.Tensor]:
+         tApA_k = None
+         if const_expr(pred):
+             tApA_k = cute.make_fragment(cols_per_thread, Boolean)
+             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
+             for k in cutlass.range(cols_per_thread, unroll_full=True):
+                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
+         a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
+         sAIdx_cur = sAIdx[None, dst_idx]
+         k_idx = cute.make_fragment(cols_per_thread, Int32)
+         for k in cutlass.range(cols_per_thread):
+             col_idx = tAcA[0, 0, k][1]
+             k_idx[k] = sAIdx_cur[col_idx]
+         cute.arch.sync_warp()
+         with cute.arch.elect_one():
+             a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
+         return k_idx, tApA_k
+ 
+     def copy_fn(
+         src_idx, dst_idx, k_idx_tApA_k: Tuple[cute.Tensor, cute.Tensor], pred: bool = False
+     ):
+         k_idx, tApA_k = k_idx_tApA_k
+         tApA_k_pred = None
+         if const_expr(pred):
+             tApA_k_pred = cute.prepend_ones(tApA_k, up_to_rank=2)  # (1, cols_per_thread)
+         for k in cutlass.range_constexpr(tAcA.shape[2]):
+             # copy_A(tAmA[None, None, k_idx[k]], tAsA[(None, None, k), smem_idx], pred=cute.prepend_ones(tApA_m, up_to_rank=2))
+             for m in cutlass.range_constexpr(tAcA.shape[1]):
+                 if tApA_m[m]:
+                     cute.copy(
+                         thr_copy_A,
+                         tAmA[None, m, k_idx[k]],
+                         tAsA[(None, m, k), dst_idx],
+                         pred=None if const_expr(tApA_k_pred is None) else tApA_k_pred[None, k],
+                     )
+ 
+     return copy_fn, prefetch_from_gmem_fn if const_expr(
+         gAIdx is not None
+     ) else prefetch_from_smem_fn
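
The swizzle helpers above are the easiest part of this file to misread, so here is a pure-Python model of parse_swizzle_from_pointer and swizzle_int with a worked value. The S<3,4,3> string is only an illustrative example of what ptr.type.swizzle_type stringifies to:

    import re

    def parse_swizzle(swizzle_str):
        # Mirrors parse_swizzle_from_pointer: pull (b, m, s) out of '!cute.swizzle<"S<b,m,s>">'.
        match = re.search(r"S<(\d+),(\d+),(\d+)>", swizzle_str)
        if match is None:
            raise ValueError(f"Could not parse swizzle_type: {swizzle_str}")
        return tuple(int(g) for g in match.groups())

    def swizzle_int(ptr_int, b, m, s):
        # XOR the b bits starting at bit (m + s) into the b bits starting at bit m.
        bit_msk = (1 << b) - 1
        yyy_msk = bit_msk << (m + s)
        return ptr_int ^ ((ptr_int & yyy_msk) >> s)

    b, m, s = parse_swizzle('!cute.swizzle<"S<3,4,3>">')     # a common 128-byte swizzle
    assert swizzle_int(0b110010000, b, m, s) == 0b110100000  # 400 -> 416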