quack-kernels 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {quack_kernels-0.2.3/quack_kernels.egg-info → quack_kernels-0.2.5}/PKG-INFO +3 -3
  2. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/pyproject.toml +4 -3
  3. quack_kernels-0.2.5/quack/__init__.py +21 -0
  4. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/copy_utils.py +133 -6
  5. quack_kernels-0.2.5/quack/cute_dsl_ptxas.py +151 -0
  6. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/layout_utils.py +8 -0
  7. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/pipeline.py +31 -13
  8. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sm90_utils.py +31 -1
  9. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sort/bitonic_sort.py +1 -1
  10. {quack_kernels-0.2.3 → quack_kernels-0.2.5/quack_kernels.egg-info}/PKG-INFO +3 -3
  11. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack_kernels.egg-info/SOURCES.txt +1 -0
  12. quack_kernels-0.2.5/quack_kernels.egg-info/requires.txt +8 -0
  13. quack_kernels-0.2.5/quack_kernels.egg-info/top_level.txt +1 -0
  14. quack_kernels-0.2.3/quack/__init__.py +0 -11
  15. quack_kernels-0.2.3/quack_kernels.egg-info/requires.txt +0 -8
  16. quack_kernels-0.2.3/quack_kernels.egg-info/top_level.txt +0 -5
  17. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/LICENSE +0 -0
  18. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/README.md +0 -0
  19. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/activation.py +0 -0
  20. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/autotuner.py +0 -0
  21. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/broadcast_utils.py +0 -0
  22. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/compile_utils.py +0 -0
  23. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/cross_entropy.py +0 -0
  24. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/cute_dsl_utils.py +0 -0
  25. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/fast_math.py +0 -0
  26. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm.py +0 -0
  27. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_act.py +0 -0
  28. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_config.py +0 -0
  29. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_dact.py +0 -0
  30. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_default_epi.py +0 -0
  31. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_interface.py +0 -0
  32. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_sm100.py +0 -0
  33. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_sm90.py +0 -0
  34. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_symmetric.py +0 -0
  35. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/gemm_wrapper_utils.py +0 -0
  36. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/linear.py +0 -0
  37. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/linear_cross_entropy.py +0 -0
  38. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/mlp.py +0 -0
  39. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/reduce.py +0 -0
  40. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/reduction_base.py +0 -0
  41. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/rmsnorm.py +0 -0
  42. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sm100_utils.py +0 -0
  43. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/softmax.py +0 -0
  44. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sort/generate_sorting_networks.py +0 -0
  45. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sort/sorting_networks.py +0 -0
  46. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/sort/utils.py +0 -0
  47. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/tensormap_manager.py +0 -0
  48. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/tile_scheduler.py +0 -0
  49. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/topk.py +0 -0
  50. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/utils.py +0 -0
  51. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack/varlen_utils.py +0 -0
  52. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/quack_kernels.egg-info/dependency_links.txt +0 -0
  53. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/setup.cfg +0 -0
  54. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_cross_entropy.py +0 -0
  55. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_layernorm.py +0 -0
  56. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_linear.py +0 -0
  57. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_linear_cross_entropy.py +0 -0
  58. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_linear_varlen_k.py +0 -0
  59. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_linear_varlen_m.py +0 -0
  60. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_rmsnorm.py +0 -0
  61. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_softmax.py +0 -0
  62. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_symmetric_gemm.py +0 -0
  63. {quack_kernels-0.2.3 → quack_kernels-0.2.5}/tests/test_topk.py +0 -0
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.2.3
+ Version: 0.2.5
  Requires-Python: >=3.10
  License-File: LICENSE
- Requires-Dist: nvidia-cutlass-dsl==4.3.3
+ Requires-Dist: nvidia-cutlass-dsl>=4.4.0.dev0
  Requires-Dist: torch
- Requires-Dist: apache-tvm-ffi<0.2,>=0.1.5
+ Requires-Dist: apache-tvm-ffi<0.2,>=0.1.6
  Requires-Dist: torch-c-dlpack-ext
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == "dev"
@@ -7,9 +7,9 @@ name = "quack-kernels"
  dynamic = ["version"]
  requires-python = ">=3.10"
  dependencies = [
-     "nvidia-cutlass-dsl==4.3.3",
+     "nvidia-cutlass-dsl>=4.4.0.dev0",
      "torch",
-     "apache-tvm-ffi>=0.1.5,<0.2",
+     "apache-tvm-ffi>=0.1.6,<0.2",
      "torch-c-dlpack-ext",
  ]

@@ -20,7 +20,8 @@ dev = [
  ]

  [tool.setuptools.packages.find]
- exclude = ["tests", "benchmarks"]
+ where = ["."]
+ include = ["quack*"]

  [tool.setuptools.dynamic]
  version = {attr = "quack.__version__"}
@@ -0,0 +1,21 @@
+ __version__ = "0.2.5"
+
+ import os
+
+ from quack.rmsnorm import rmsnorm
+ from quack.softmax import softmax
+ from quack.cross_entropy import cross_entropy
+
+
+ if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
+     import quack.cute_dsl_ptxas  # noqa: F401
+
+     # Patch to dump ptx and then use system ptxas to compile to cubin
+     quack.cute_dsl_ptxas.patch()
+
+
+ __all__ = [
+     "rmsnorm",
+     "softmax",
+     "cross_entropy",
+ ]
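Usage sketch (illustrative, not part of the diff): the new __init__ only wires in the system-ptxas hook when CUTE_DSL_PTXAS_PATH is set, and patch() additionally asserts CUTE_DSL_KEEP_PTX=1, so opting in looks roughly like the following. The ptxas path is just the example location quoted in the module docstring; point it at whatever ptxas build you actually want to use.

    import os

    os.environ["CUTE_DSL_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"  # example path, not a requirement
    os.environ["CUTE_DSL_KEEP_PTX"] = "1"  # patch() requires the PTX dump to be kept

    import quack  # importing quack now calls quack.cute_dsl_ptxas.patch()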
@@ -7,18 +7,19 @@ import cutlass
  import cutlass.cute as cute

  from cutlass import Int32, Boolean, const_expr
- from cutlass.cute.nvgpu import cpasync
+ from cutlass.cute.nvgpu import cpasync, warpgroup
  from cutlass.cutlass_dsl import dsl_user_op
  import cutlass.pipeline


  @dsl_user_op
  def cvt_copy(
-     atom: cute.CopyAtom,
+     tiled_copy: cute.TiledCopy,
      src: cute.Tensor,
      dst: cute.Tensor,
      *,
      pred: Optional[cute.Tensor] = None,
+     retile: bool = False,
      loc=None,
      ip=None,
      **kwargs,
@@ -28,7 +29,9 @@ def cvt_copy(
          src_cvt = cute.make_fragment_like(src, dst.element_type)
          src_cvt.store(src.load().to(dst.element_type))
          src = src_cvt
-     cute.copy(atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+     if const_expr(retile):
+         src = tiled_copy.retile(src)
+     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)


  @dsl_user_op
@@ -262,6 +265,124 @@ def get_smem_store_atom(
262
265
  )
263
266
 
264
267
 
268
+ def get_smem_load_atom(
269
+ arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
270
+ ) -> cute.CopyAtom:
271
+ if const_expr(arch < 90 or element_type.width != 16):
272
+ return cute.make_copy_atom(
273
+ cute.nvgpu.CopyUniversalOp(),
274
+ element_type,
275
+ num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
276
+ )
277
+ else:
278
+ return cute.make_copy_atom(
279
+ cute.nvgpu.warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
280
+ element_type,
281
+ )
282
+
283
+
284
+ def get_smem_store_C(
285
+ tiled_mma: cute.TiledMma,
286
+ sC: cute.Tensor,
287
+ tidx: Int32,
288
+ arch: int,
289
+ transpose: bool = False,
290
+ position_independent=False,
291
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
292
+ dtype = sC.element_type
293
+ copy_atom = get_smem_store_atom(arch, dtype, transpose)
294
+ tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
295
+ thr_copy = tiled_copy.get_slice(tidx)
296
+ if const_expr(not position_independent):
297
+ tRS_sC = thr_copy.partition_D(sC)
298
+ else:
299
+ tRS_sC = partition_D_position_independent(thr_copy, sC)
300
+
301
+ def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
302
+ cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], retile=True, **new_kwargs)
303
+
304
+ return copy_fn, thr_copy, tRS_sC
305
+
306
+
307
+ def get_smem_load_C(
308
+ tiled_mma: cute.TiledMma,
309
+ sC: cute.Tensor,
310
+ tidx: Int32,
311
+ arch: int,
312
+ transpose: bool = False,
313
+ position_independent=False,
314
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
315
+ dtype = sC.element_type
316
+ copy_atom = get_smem_load_atom(arch, dtype, transpose)
317
+ tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
318
+ thr_copy = tiled_copy.get_slice(tidx)
319
+ if const_expr(not position_independent):
320
+ tSR_sC = thr_copy.partition_S(sC)
321
+ else:
322
+ tSR_sC = partition_S_position_independent(thr_copy, sC)
323
+ copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
324
+ thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
325
+ tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape
326
+
327
+ def copy_fn(src_idx: Int32, **new_kwargs):
328
+ return load_s2r_retile(
329
+ tiled_copy, tSR_sC[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
330
+ )
331
+
332
+ return copy_fn, thr_copy, tSR_sC
333
+
334
+
335
+ def get_smem_store_A(
336
+ tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
337
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
338
+ dtype = sA.element_type
339
+ transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
340
+ copy_atom = get_smem_store_atom(arch, dtype, transpose)
341
+ tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
342
+ thr_copy = tiled_copy.get_slice(tidx)
343
+ if const_expr(not position_independent):
344
+ tRS_sA = thr_copy.partition_D(sA)
345
+ else:
346
+ tRS_sA = partition_D_position_independent(thr_copy, sA)
347
+
348
+ def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
349
+ cvt_copy(tiled_copy, src, tRS_sA[None, None, None, dst_idx], retile=True, **new_kwargs)
350
+
351
+ return copy_fn, thr_copy, tRS_sA
352
+
353
+
354
+ def get_smem_load_A(
355
+ tiled_mma: cute.TiledMma,
356
+ sA: cute.Tensor,
357
+ tidx: Int32,
358
+ arch: int,
359
+ with_dst_tensor: bool = False,
360
+ position_independent=False,
361
+ ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
362
+ dtype = sA.element_type
363
+ transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
364
+ copy_atom = get_smem_load_atom(arch, dtype, transpose)
365
+ tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
366
+ thr_copy = tiled_copy.get_slice(tidx)
367
+ if const_expr(not position_independent):
368
+ tSR_sA = thr_copy.partition_S(sA)
369
+ else:
370
+ tSR_sA = partition_S_position_independent(thr_copy, sA)
371
+ copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
372
+ thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
373
+ tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])
374
+
375
+ def copy_fn(src_idx: Int32, **new_kwargs):
376
+ return load_s2r_retile(
377
+ tiled_copy, tSR_sA[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
378
+ )
379
+
380
+ def copy_fn_w_dst_tensor(src_idx: Int32, dst: cute.Tensor, **new_kwargs):
381
+ return load_s2r_retile(tiled_copy, tSR_sA[None, None, None, src_idx], dst, **new_kwargs)
382
+
383
+ return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
384
+
385
+
265
386
  def tma_get_copy_fn(
266
387
  atom: cute.CopyAtom,
267
388
  cta_coord: cute.Coord,
@@ -269,6 +390,7 @@ def tma_get_copy_fn(
      src_tensor: cute.Tensor,
      dst_tensor: cute.Tensor,
      filter_zeros: bool = False,
+     single_stage: bool = False,
      **kwargs,
  ) -> Callable:
      src_is_smem = const_expr(
@@ -276,13 +398,15 @@
          and src_tensor.memspace == cute.AddressSpace.smem
      )
      smem_tensor, gmem_tensor = (src_tensor, dst_tensor) if src_is_smem else (dst_tensor, src_tensor)
+     group_rank_smem = const_expr(cute.rank(smem_tensor) - (1 if not single_stage else 0))
+     group_rank_gmem = const_expr(cute.rank(gmem_tensor) - (1 if not single_stage else 0))
      # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
      s, g = cpasync.tma_partition(
          atom,
          cta_coord,
          cta_layout,
-         cute.group_modes(smem_tensor, 0, cute.rank(smem_tensor) - 1),
-         cute.group_modes(gmem_tensor, 0, cute.rank(gmem_tensor) - 1),
+         cute.group_modes(smem_tensor, 0, group_rank_smem),
+         cute.group_modes(gmem_tensor, 0, group_rank_gmem),
      )
      if const_expr(filter_zeros):
          s = cute.filter_zeros(s)
@@ -292,7 +416,10 @@
      def copy_tma(src_idx, dst_idx, **new_kwargs):
          cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)

-     return copy_tma, s, g
+     def copy_tma_single_stage(**new_kwargs):
+         cute.copy(atom, src, dst, **new_kwargs, **kwargs)
+
+     return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g


  def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
@@ -0,0 +1,151 @@
+ """
+ System ptxas replacement for CUTLASS DSL.
+ Environment variables:
+     CUTE_DSL_PTXAS_PATH - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
+     CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
+ """
+
+ import os
+ import sys
+ import re
+ import ctypes
+ import subprocess
+ from pathlib import Path
+
+ import cutlass
+
+
+ CUTE_DSL_PTXAS_PATH = os.environ.get("CUTE_DSL_PTXAS_PATH", None)
+ VERBOSE = os.environ.get("CUTE_DSL_PTXAS_VERBOSE", "0") == "1"
+
+ _original_load_cuda_library = None
+ _user_wanted_ptx = False  # True if user originally set CUTE_DSL_KEEP_PTX=1
+
+
+ def _log(msg):
+     if VERBOSE:
+         print(f"[ptxas] {msg}", file=sys.stderr)
+
+
+ def _get_ptx(compiled_func) -> tuple[str, Path] | None:
+     """Find and read PTX file, stripping null bytes."""
+     func_name = getattr(compiled_func, "function_name", None)
+     if not func_name:
+         return None
+
+     dump_dir = os.environ.get("CUTE_DSL_DUMP_DIR", Path.cwd())
+     for ptx_path in Path(dump_dir).glob(f"*{func_name}*.ptx"):
+         content = ptx_path.read_text().rstrip("\x00")
+         if ".entry " in content and content.rstrip().endswith("}"):
+             _log(f"Found PTX: {ptx_path}")
+             return content, ptx_path
+     return None
+
+
+ def _compile_ptx(ptx_path: Path, ptx_content: str) -> bytes:
+     """Compile PTX to cubin using system ptxas."""
+     # Extract arch from PTX
+     match = re.search(r"\.target\s+(sm_\d+[a-z]?)", ptx_content)
+     arch = match.group(1) if match else "sm_90a"
+
+     # Write stripped content back if needed
+     if ptx_path.read_text() != ptx_content:
+         ptx_path.write_text(ptx_content)
+
+     # Compile
+     cubin_tmp = ptx_path.with_suffix(".cubin.tmp")
+     try:
+         assert CUTE_DSL_PTXAS_PATH is not None
+         result = subprocess.run(
+             [CUTE_DSL_PTXAS_PATH, f"-arch={arch}", "-O3", "-o", str(cubin_tmp), str(ptx_path)],
+             capture_output=True,
+             text=True,
+         )
+         if result.returncode != 0:
+             raise RuntimeError(f"ptxas failed: {result.stderr}")
+
+         cubin_data = cubin_tmp.read_bytes()
+         _log(f"Compiled {ptx_path.name} -> {len(cubin_data)} bytes ({arch})")
+
+         # Save cubin if CUTE_DSL_KEEP_CUBIN is set
+         if os.environ.get("CUTE_DSL_KEEP_CUBIN", "0") == "1":
+             cubin_out = ptx_path.with_suffix(".cubin")
+             cubin_out.write_bytes(cubin_data)
+             _log(f"Saved: {cubin_out}")
+
+         return cubin_data
+     finally:
+         cubin_tmp.unlink(missing_ok=True)
+
+
+ def _patched_load_cuda_library(self):
+     """Replacement for _load_cuda_library that uses system ptxas."""
+
+     result = _get_ptx(self)
+     if not result:
+         _log("PTX not found, falling back to embedded ptxas")
+         return _original_load_cuda_library(self)
+
+     ptx_content, ptx_path = result
+
+     try:
+         cubin = _compile_ptx(ptx_path, ptx_content)
+     except Exception as e:
+         _log(f"Compilation failed ({e}), falling back to embedded ptxas")
+         return _original_load_cuda_library(self)
+
+     # Load cubin
+     import cuda.bindings.runtime as cuda_runtime
+
+     err, library = cuda_runtime.cudaLibraryLoadData(cubin, None, None, 0, None, None, 0)
+     if err != cuda_runtime.cudaError_t.cudaSuccess:
+         _log(f"cudaLibraryLoadData failed ({err}), falling back to embedded ptxas")
+         return _original_load_cuda_library(self)
+
+     # Register kernels on all devices
+     _, cuda_load_to_device = self._get_cuda_init_and_load()
+     lib_ptr = ctypes.c_void_p(int(library))
+     dev_id = ctypes.c_int32(0)
+     err_val = ctypes.c_int32(0)
+     args = (ctypes.c_void_p * 3)(
+         ctypes.cast(ctypes.pointer(lib_ptr), ctypes.c_void_p),
+         ctypes.cast(ctypes.pointer(dev_id), ctypes.c_void_p),
+         ctypes.cast(ctypes.pointer(err_val), ctypes.c_void_p),
+     )
+
+     for dev in range(self.num_devices):
+         dev_id.value = dev
+         cuda_load_to_device(args)
+         if err_val.value != 0:
+             _log("cuda_load_to_device failed, falling back to embedded ptxas")
+             return _original_load_cuda_library(self)
+
+     _log(f"Loaded kernel from {ptx_path.name}")
+
+     # Delete PTX if user didn't originally want it kept
+     if not _user_wanted_ptx:
+         ptx_path.unlink(missing_ok=True)
+
+     return [cuda_runtime.cudaLibrary_t(lib_ptr.value)]
+
+
+ def patch():
+     """Install system ptxas hook. Call before importing cutlass."""
+     global _original_load_cuda_library, _user_wanted_ptx
+
+     assert CUTE_DSL_PTXAS_PATH is not None
+     if not os.path.isfile(CUTE_DSL_PTXAS_PATH) or not os.access(CUTE_DSL_PTXAS_PATH, os.X_OK):
+         raise RuntimeError(f"ptxas not found: {CUTE_DSL_PTXAS_PATH}")
+
+     # Track if user originally wanted PTX kept
+     _user_wanted_ptx = os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1"
+     # os.environ['CUTE_DSL_KEEP_PTX'] = '1'
+     assert os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1", (
+         "Require CUTE_DSL_KEEP_PTX=1 to use system's ptxas"
+     )
+
+     cls = cutlass.cutlass_dsl.cuda_jit_executor.CudaDialectJitCompiledFunction
+     _original_load_cuda_library = cls._load_cuda_library
+     cls._load_cuda_library = _patched_load_cuda_library
+     _log("Patch applied")
+     return
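For reference, the -arch flag handed to the external ptxas is recovered from the PTX's own .target directive. A small, standalone illustration of that extraction (the sample PTX text below is made up for the example):

    import re

    sample_ptx = ".version 8.3\n.target sm_90a\n.address_size 64\n"
    match = re.search(r"\.target\s+(sm_\d+[a-z]?)", sample_ptx)
    arch = match.group(1) if match else "sm_90a"  # same fallback default as _compile_ptx
    print(arch)  # prints: sm_90a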
@@ -187,6 +187,10 @@ def make_acc_tensor_mn_view(acc: cute.Tensor) -> cute.Tensor:
187
187
  return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout))
188
188
 
189
189
 
190
+ def reshape_acc_to_mn(acc: cute.Tensor) -> cute.Tensor:
191
+ return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout))
192
+
193
+
190
194
  @cute.jit
191
195
  def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
192
196
  # For back to back gemm, convert layout of acc0 to gemm 1 accept layout.
@@ -227,6 +231,10 @@ def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
      return rA_mma_view


+ def reshape_acc_to_frgA(acc: cute.Tensor) -> cute.Tensor:
+     return cute.make_tensor(acc.iterator, convert_layout_acc_frgA(acc.layout))
+
+
  def convert_layout_zero_stride(
      input: cute.Tensor | cute.Layout, ref_layout: cute.Layout
  ) -> cute.Layout:
@@ -5,14 +5,15 @@ from dataclasses import dataclass

  import cutlass.cute as cute
  from cutlass import Boolean, Int32, const_expr
- from cutlass.cutlass_dsl import if_generate, and_
+ from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
  from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
  from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
  from cutlass.pipeline import PipelineTmaUmma


  class PipelineStateWAdvance(PipelineState):
-     def advance_iters(self, num_iterations: Int32):
+     @dsl_user_op
+     def advance_iters(self, num_iterations: Int32, *, loc=None, ip=None):
          self._count += Int32(num_iterations)
          new_index = self._index + Int32(num_iterations)
          # How many times did we cross the stages boundary
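The hunk above cuts off after the first lines of advance_iters; only the decorator and signature changed. As a plain-Python model of the wrap-around bookkeeping a multi-step advance has to do (an assumption based on standard pipeline-state semantics, where the index wraps modulo the stage count and the phase bit flips on each wrap; this is not the DSL implementation itself):

    def advance_iters_model(index, phase, count, num_stages, num_iterations):
        # Advance a circular pipeline state by num_iterations slots, using plain ints.
        count += num_iterations
        new_index = index + num_iterations
        crossings = new_index // num_stages  # how many times the stage boundary was crossed
        phase ^= crossings & 1               # phase toggles once per wrap
        index = new_index % num_stages
        return index, phase, count

    print(advance_iters_model(index=2, phase=0, count=2, num_stages=4, num_iterations=5))  # (3, 1, 7)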
@@ -126,34 +127,40 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
              is_signalling_thread,
          )

+     @dsl_user_op
      def producer_acquire(
          self,
          state: PipelineState,
          try_acquire_token: Optional[Boolean] = None,
          is_tma_warp: Optional[Boolean] = True,
+         *,
+         loc=None,
+         ip=None,
      ):
          """
          TMA producer commit conditionally waits on buffer empty and sets the transaction barrier.
          """
          if_generate(
              try_acquire_token is None or try_acquire_token == 0,
-             lambda: self.sync_object_empty.wait(state.index, state.phase),
+             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
          )
          # This is the difference between this and PipelineTmaAsync: we could have multiple
          # warps calling this, but only 1 warp should do the arrive on the full barrier
          if_generate(
              is_tma_warp,
-             lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
          )

-     def producer_cpasync_commit(self, state: PipelineState):
+     @dsl_user_op
+     def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
          """
          We need the mbarrier to track the completion of cp.async
          """
-         cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+         cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)


  class MbarrierArrayWDropCount(MbarrierArray):
+     @dsl_user_op
      def __init__(
          self,
          barrier_storage: cute.Pointer,
@@ -161,6 +168,9 @@ class MbarrierArrayWDropCount(MbarrierArray):
          agent: tuple[PipelineOp, CooperativeGroup],
          tx_count: int = 0,
          drop_count: Optional[Int32] = None,
+         *,
+         loc=None,
+         ip=None,
      ) -> None:
          self.barrier_storage = barrier_storage
          self.tx_count = tx_count
@@ -183,7 +193,7 @@
          self.mbarrier_base = self.barrier_storage

          # Mbarrier initialization in constructor
-         self.mbarrier_init()
+         self.mbarrier_init(loc=loc, ip=ip)

      def __extract_mlir_values__(self):
          return [self.barrier_storage, self.drop_count]
@@ -211,6 +221,7 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
          barrier_storage: cute.Pointer = None,
          cta_layout_vmnk: Optional[cute.Layout] = None,
          producer_drop_count: Optional[Int32] = None,
+         mcast_mode_mn: tuple[int, int] = (1, 1),
      ):
          """
          This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
@@ -226,6 +237,8 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
          :type tx_count: int
          :param cta_layout_vmnk: Layout of the cluster shape
          :type cta_layout_vmnk: cute.Layout | None
+         :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
+         :type mcast_mode_mn: tuple[int, int], optional
          """
          if not isinstance(barrier_storage, cute.Pointer):
              raise ValueError(
@@ -245,7 +258,7 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
              tx_count,
              drop_count=producer_drop_count,
          )
-         sync_object_empty = PipelineAsync._make_sync_object(
+         sync_object_empty = PipelineTmaUmma._make_sync_object(
              barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
          )

@@ -255,7 +268,7 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
              # All threadblocks are leaders if not using clusters
              is_leader_cta = True
          else:
-             producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk)
+             producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk, mcast_mode_mn)
              is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)

          cta_group = (
@@ -278,11 +291,15 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
              cta_group,
          )

+     @dsl_user_op
      def producer_acquire(
          self,
          state: PipelineState,
          try_acquire_token: Optional[Boolean] = None,
          is_tma_warp: Optional[Boolean] = True,
+         *,
+         loc=None,
+         ip=None,
      ):
          """
          TMA producer commit conditionally waits on buffer empty and sets the
@@ -290,17 +307,18 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
          """
          if_generate(
              try_acquire_token is None or try_acquire_token == 0,
-             lambda: self.sync_object_empty.wait(state.index, state.phase),
+             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
          )
          # This is the difference between this and PipelineTmaAsync: we could have multiple
          # warps calling this, but only 1 warp should do the arrive on the full barrier
          if_generate(
              and_(self.is_leader_cta, is_tma_warp),
-             lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
          )

-     def producer_cpasync_commit(self, state: PipelineState):
+     @dsl_user_op
+     def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
          """
          We need the mbarrier to track the completion of cp.async
          """
-         cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))
+         cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)
@@ -27,10 +27,11 @@ def make_smem_layout(
          sm90_utils_og.get_smem_layout_atom(layout, dtype, major_mode_size),
          dtype,
      )
+     order = (1, 0, 2) if const_expr(layout.is_m_major_c()) else (0, 1, 2)
      smem_layout_staged = cute.tile_to_shape(
          smem_layout_atom,
          cute.append(shape, stage) if const_expr(stage is not None) else shape,
-         order=(1, 0, 2) if layout.is_m_major_c() else (0, 1, 2),
+         order=order if const_expr(stage is not None) else order[:2],
      )
      return smem_layout_staged

@@ -125,3 +126,32 @@ def gemm_w_idx(
      rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
      rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
      gemm(tiled_mma, acc, rA, rB, zero_init=zero_init, wg_wait=wg_wait)
+
+
+ def partition_fragment_ABC(
+     thr_mma: cute.ThrMma,
+     shape_mnk: cute.Shape,
+     sA: Optional[cute.Tensor],
+     sB: Optional[cute.Tensor],
+     swap_AB: bool = False,
+ ):
+     is_rs = thr_mma.op.a_src == warpgroup.OperandSource.RMEM
+     if const_expr(not swap_AB):
+         acc = cute.make_fragment(thr_mma.partition_shape_C(shape_mnk[:2]), Float32)
+         if const_expr(not is_rs):
+             assert sA is not None
+             tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA))
+         else:
+             tCrA = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[0], shape_mnk[2])))
+         assert sB is not None
+         tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB))
+     else:
+         acc = cute.make_fragment(thr_mma.partition_shape_C((shape_mnk[1], shape_mnk[0])), Float32)
+         if const_expr(not is_rs):
+             assert sB is not None
+             tCrB = thr_mma.make_fragment_A(thr_mma.partition_A(sB))
+         else:  # B in rmem
+             tCrB = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[1], shape_mnk[2])))
+         assert sA is not None
+         tCrA = thr_mma.make_fragment_B(thr_mma.partition_B(sA))
+     return acc, tCrA, tCrB
@@ -83,7 +83,7 @@ def bitonic_topk_merge(
      else:
          minmax_fn = min if ascending else max
      # Write the top k elements to the first half of the array
-     for i in cutlass.range(k, unfoll_full=True):
+     for i in cutlass.range(k, unroll_full=True):
          arr0[start0 + i] = minmax_fn(arr0[start0 + i], arr1[start1 + k - 1 - i])
      # Now the 1st half is bitonic, we just need to merge it
      bitonic_merge(arr0, k, start0, ascending)
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: quack-kernels
- Version: 0.2.3
+ Version: 0.2.5
  Requires-Python: >=3.10
  License-File: LICENSE
- Requires-Dist: nvidia-cutlass-dsl==4.3.3
+ Requires-Dist: nvidia-cutlass-dsl>=4.4.0.dev0
  Requires-Dist: torch
- Requires-Dist: apache-tvm-ffi<0.2,>=0.1.5
+ Requires-Dist: apache-tvm-ffi<0.2,>=0.1.6
  Requires-Dist: torch-c-dlpack-ext
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == "dev"
@@ -8,6 +8,7 @@ quack/broadcast_utils.py
  quack/compile_utils.py
  quack/copy_utils.py
  quack/cross_entropy.py
+ quack/cute_dsl_ptxas.py
  quack/cute_dsl_utils.py
  quack/fast_math.py
  quack/gemm.py
@@ -0,0 +1,8 @@
+ nvidia-cutlass-dsl>=4.4.0.dev0
+ torch
+ apache-tvm-ffi<0.2,>=0.1.6
+ torch-c-dlpack-ext
+
+ [dev]
+ pre-commit
+ ruff
@@ -1,11 +0,0 @@
- __version__ = "0.2.3"
-
- from quack.rmsnorm import rmsnorm
- from quack.softmax import softmax
- from quack.cross_entropy import cross_entropy
-
- __all__ = [
-     "rmsnorm",
-     "softmax",
-     "cross_entropy",
- ]
@@ -1,8 +0,0 @@
- nvidia-cutlass-dsl==4.3.3
- torch
- apache-tvm-ffi<0.2,>=0.1.5
- torch-c-dlpack-ext
-
- [dev]
- pre-commit
- ruff
@@ -1,5 +0,0 @@
- benchmarks
- dist
- docs
- media
- quack