quack-kernels 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.4.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/top_level.txt +0 -0
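Three modules were renamed in place (items 9, 11, and 15 above), so downstream code that imports them by module path needs updating. A minimal sketch of the implied migration, assuming the modules are imported directly (the diff does not say which names are public API):

# quack-kernels 0.2.2
import quack.gemm_act_sm90
import quack.gemm_dact_sm90
import quack.dense_gemm_sm90

# quack-kernels 0.2.4
import quack.gemm_act
import quack.gemm_dact
import quack.gemm_sm90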
{quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/METADATA CHANGED
@@ -1,10 +1,12 @@
 Metadata-Version: 2.4
 Name: quack-kernels
-Version: 0.2.2
+Version: 0.2.4
 Requires-Python: >=3.10
 License-File: LICENSE
-Requires-Dist: nvidia-cutlass-dsl==4.2.1
+Requires-Dist: nvidia-cutlass-dsl<4.4.0,>=4.3.4
 Requires-Dist: torch
+Requires-Dist: apache-tvm-ffi<0.2,>=0.1.6
+Requires-Dist: torch-c-dlpack-ext
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
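The dependency changes above can be verified against an installed wheel with the standard library; a minimal sketch, assuming quack-kernels 0.2.4 is installed in the current environment:

from importlib.metadata import metadata, version

assert version("quack-kernels") == "0.2.4"
# Per the METADATA diff above, this should list the loosened cutlass pin
# (>=4.3.4,<4.4.0 instead of ==4.2.1) plus the two new runtime deps,
# apache-tvm-ffi and torch-c-dlpack-ext.
print(metadata("quack-kernels").get_all("Requires-Dist"))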
quack_kernels-0.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,44 @@
+quack/__init__.py,sha256=_vZWQp7kr01iQb3frKnJuzUf11z7ID0upA7oR_8mRTE,203
+quack/activation.py,sha256=-lZgojraqdyLjOzgOXBehoVeRBhBq30UX7kOkXsCpGI,20855
+quack/autotuner.py,sha256=atw0ntedi22RPwSdjWOoge4S56S8VFvRocJQcYhpAlo,13454
+quack/broadcast_utils.py,sha256=X5vWg2RtIIWU9Z7nEUW6m0EP0Cfd9XtCKxp4tSyp4Mg,1283
+quack/compile_utils.py,sha256=qJ3oTsDlbAiddrJHtEO7LPYVqn_s-neNfiw-_KvfXZU,591
+quack/copy_utils.py,sha256=J1Hcw18iNHHpOP2wNFhF8Lz16NEmXtoQMu59mmLrRCs,18761
+quack/cross_entropy.py,sha256=w6fjHC_vXt5ji2KfoLrSOdAvpLrQszrYU9rmRij2yY8,24899
+quack/cute_dsl_utils.py,sha256=4uQx5aYDG9UvVzbWwJTjjJLrnoympz70_CD8b37FQWo,3854
+quack/fast_math.py,sha256=E1XUqfUt0_n9BPZNggF-UDzZ6anso9bYUrwqafemWvQ,2297
+quack/gemm.py,sha256=8V23MPq49QbV3csv-_AxjfE9qf8R3NIqFK9Q9db6t2c,7417
+quack/gemm_act.py,sha256=Y8HJKfw3tCoFKecwhwhd5xpXd9jCQCGZT_V2xXf-CnU,20823
+quack/gemm_config.py,sha256=94o3g9x7H0wi7aBbsb7H67H8nSzTurwL2zgvKDtQUas,3575
+quack/gemm_dact.py,sha256=l__UhCrFbPjD9a1TAVgP7_C7p5lLfX5DkRcM6z0ofOw,7789
+quack/gemm_default_epi.py,sha256=6qO8Ovtcw8sQQ_kXTBTTQ5IHh1lS6RBCGZG0lgLHNrs,11916
+quack/gemm_interface.py,sha256=AF5PYTNgEHjb3MNXcNvvEpOcShAHtak0Xu12l1zrOAw,44804
+quack/gemm_sm100.py,sha256=U9jmzpST_d1W6CBFf1ZHhTtr0K8hENCsUz7dXvHaMZc,122344
+quack/gemm_sm90.py,sha256=u-Q3fN6DPm1fEdz0LcMecMbGTBcRunUCWopufwO8cHU,92015
+quack/gemm_symmetric.py,sha256=mqx7wgOCY6Dh9hjL6gR9PBstMD476GhpA_NkGeaEtik,13349
+quack/gemm_wrapper_utils.py,sha256=EaPyR3Lq19z_RkdB2_xxRj0IPSJMgyfpkrTXyvY3B6M,12775
+quack/layout_utils.py,sha256=QjFFlvDcLiyGGfA2FKWKI75twHIkOJ2AotE0cIpBAlI,11923
+quack/linear.py,sha256=mhN2A98w7H7X4MS63XCCK3gpOm1eS8H7a4WO9ovkt5U,9791
+quack/linear_cross_entropy.py,sha256=Zhy_gdMsKHOie-jntBaqIuiDJtkiq6qEBwnyuWwIRw4,10092
+quack/mlp.py,sha256=YjdwQRwEePA9KyidFXp5H1-lxiJc8dZ41vl8Fv8pgss,2259
+quack/pipeline.py,sha256=mMdIlpUaHdRDOkvQzgKdCdJydJq6C2eYrny5Bui4KFs,11311
+quack/reduce.py,sha256=ySKT2xh1_pIlbJX29BPmwH6yJ7MxIrRZyxHIPPYVpm0,12698
+quack/reduction_base.py,sha256=QqlPs5L2VCxwDrO4CHPq-KY6f_BAYRbvsR6k81LPzTU,3180
+quack/rmsnorm.py,sha256=esy18s5JtT7KBPRPhWf_anLRTrtromwqeJmg2yzOm60,44678
+quack/sm100_utils.py,sha256=-p5qj3Wi9n4WDLy2sl-fApYpGp5rH3JvZQb712OTxPs,1901
+quack/sm90_utils.py,sha256=hg8qq7S8NODZlUSaxNpdZcsnxcR0jM921rMn1VmBo7o,4278
+quack/softmax.py,sha256=ZqeVbnGfzwkro1LfWBHagbS7B7ug7b9SLZWuGx_Y3Kc,14367
+quack/tensormap_manager.py,sha256=Ts3Mxp0_es2RNA0ffvUjWMXN79lsfWEBZ0DQYhtbcnw,5338
+quack/tile_scheduler.py,sha256=vbKq0xp94eII0uJ63yY_3sgvJkQI7Irc8y1OttO6cRA,42514
+quack/topk.py,sha256=43xHpRGbwZCSRsulmfrG4WA_r2eLHc3sniaUFU7wn-o,22522
+quack/utils.py,sha256=WIttE1iiwyPIwR1NpaeO26Pn9YkZb361TDxFTUDH-IE,7354
+quack/varlen_utils.py,sha256=SOYkomxX2FoqjYlybg99CqNhS9IARM6F9ba2AkIVvT4,15811
+quack/sort/bitonic_sort.py,sha256=VJPVjPulW_jEr3myBE7AiBYGtsc5T9FEy3sjXFukF7s,4831
+quack/sort/generate_sorting_networks.py,sha256=vkJBOjTVEinQkWT4OtFqOWxFVdTIPoNAQocneKc9-rM,14477
+quack/sort/sorting_networks.py,sha256=l_26zi3gXD_z-tnm2eAczRrmE-mbaz00KmqH6ONivL8,9686
+quack/sort/utils.py,sha256=RbubEY1GcEpsjiz_6o5o2WB47IeMOzaajW6Jis0s444,1059
+quack_kernels-0.2.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+quack_kernels-0.2.4.dist-info/METADATA,sha256=vMKNVe5-xDcELyrpCllppMWMRLp0T3M0wFqkHsT7hw0,368
+quack_kernels-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+quack_kernels-0.2.4.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
+quack_kernels-0.2.4.dist-info/RECORD,,
quack/layernorm.py DELETED
@@ -1,353 +0,0 @@
-# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
-
-
-import torch
-from typing import Optional
-
-import cuda.bindings.driver as cuda
-
-import cutlass
-import cutlass.cute as cute
-from cutlass.cute.runtime import from_dlpack
-import quack.utils as utils
-from quack.reduce import row_reduce
-from quack.reduction_base import ReductionBase
-from quack.cute_dsl_utils import torch2cute_dtype_map
-
-
-class LayerNorm(ReductionBase):
-    def __init__(self, dtype: cutlass.Numeric, N: int):
-        super().__init__(dtype, N, stage=2)  # 2 stages for mean and var
-        self.reload_from = None if N <= 16384 else "smem"
-        self.delay_w_load = False
-
-    def _calculate_threads_per_row(self):
-        N = self.N
-        return (
-            8
-            if N <= 64
-            else (
-                16
-                if N <= 128
-                else (32 if N <= 3072 else (64 if N <= 6144 else (128 if N <= 16384 else 256)))
-            )
-        )
-
-    def _set_cluster_n(self):
-        N = self.N
-        # cluster_n = 4 is faster and cluster_n = 2 for N=64k for some reason
-        # Similarly cluster_n = 8 is faster for N=128k
-        if cutlass.const_expr(self.dtype.width == 16):
-            cluster_n = (
-                1
-                if N <= 16 * 1024
-                else (
-                    2
-                    if N <= 32 * 1024
-                    else (4 if N <= 64 * 1024 else (8 if N <= 128 * 1024 else 16))
-                )
-            )
-        else:  # fp32
-            cluster_n = (
-                1
-                if N <= 32 * 1024
-                else (
-                    2
-                    if N <= 64 * 1024
-                    else (4 if N <= 128 * 1024 else (8 if N <= 256 * 1024 else 16))
-                )
-            )
-        self.cluster_n = cluster_n
-
-    @cute.jit
-    def __call__(
-        self,
-        mX: cute.Tensor,
-        mW: cute.Tensor,
-        mO: cute.Tensor,
-        mRstd: Optional[cute.Tensor],
-        mMean: Optional[cute.Tensor],
-        stream: cuda.CUstream,
-        eps: cutlass.Float32 = 1e-6,
-    ):
-        assert mX.element_type == self.dtype
-        assert mO.element_type == self.dtype
-        self._set_cluster_n()
-        tiler_mn, tv_layout = self._get_tv_layout()
-        num_threads = cute.size(tv_layout, mode=[0])
-        num_warps = num_threads // cute.arch.WARP_SIZE
-        mW_expanded_layout = cute.prepend(mW.layout, cute.make_layout((tiler_mn[0],), stride=(0,)))
-        mW = cute.make_tensor(mW.iterator, mW_expanded_layout)
-        if cutlass.const_expr(mRstd is not None):
-            mRstd_expanded_layout = cute.append(
-                mRstd.layout, cute.make_layout((self.N,), stride=(0,))
-            )
-            mRstd = cute.make_tensor(mRstd.iterator, mRstd_expanded_layout)
-        if cutlass.const_expr(mMean is not None):
-            mMean_expanded_layout = cute.append(
-                mMean.layout, cute.make_layout((self.N,), stride=(0,))
-            )
-            mMean = cute.make_tensor(mMean.iterator, mMean_expanded_layout)
-        self.kernel(mX, mW, mO, mRstd, mMean, eps, tv_layout, tiler_mn, self.reload_from).launch(
-            grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
-            block=[num_threads, 1, 1],
-            cluster=[1, self.cluster_n, 1] if cutlass.const_expr(self.cluster_n > 1) else None,
-            smem=self._smem_size_in_bytes(tiler_mn, num_warps),
-            stream=stream,
-        )
-
-    @cute.kernel
-    def kernel(
-        self,
-        mX: cute.Tensor,
-        mW: cute.Tensor,
-        mO: cute.Tensor,
-        mRstd: Optional[cute.Tensor],
-        mMean: Optional[cute.Tensor],
-        eps: cute.Float32,
-        tv_layout: cute.Layout,
-        tiler_mn: cute.Shape,
-        reload_from: cutlass.Constexpr = None,
-        delay_w_load: cutlass.Constexpr = False,
-    ):
-        tidx, _, _ = cute.arch.thread_idx()
-        bidx, _, _ = cute.arch.block_idx()
-        if cutlass.const_expr(self.cluster_n > 1):
-            cluster_y = cute.arch.block_idx()[1]
-        else:
-            cluster_y = cutlass.const_expr(0)
-
-        smem = cutlass.utils.SmemAllocator()
-        sX = smem.allocate_tensor(
-            mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
-        )
-        reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
-
-        shape = mX.shape
-        idX = cute.make_identity_tensor(shape)
-        # slice for CTAs
-        # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
-        mX, mO = [utils.domain_offset_i64((bidx * tiler_mn[0], 0), mT) for mT in (mX, mO)]
-        gX, gO = [cute.local_tile(mT, tiler_mn, (0, cluster_y)) for mT in (mX, mO)]
-        cX = cute.local_tile(idX, tiler_mn, (bidx, cluster_y))
-        gW = cute.local_tile(mW, tiler_mn, (0, cluster_y))
-        gRstd = (
-            cute.local_tile(mRstd, tiler_mn, (bidx, cluster_y))
-            if cutlass.const_expr(mRstd is not None)
-            else None
-        )
-        gMean = (
-            cute.local_tile(mMean, tiler_mn, (bidx, cluster_y))
-            if cutlass.const_expr(mMean is not None)
-            else None
-        )
-
-        # declare the atoms which will be used later for memory copy
-        copy_atom_load_X = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mX.element_type, num_bits_per_copy=128
-        )
-        copy_atom_load_X_async = cute.make_copy_atom(
-            cute.nvgpu.cpasync.CopyG2SOp(), mX.element_type, num_bits_per_copy=128
-        )
-        copy_atom_load_W = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mW.element_type, num_bits_per_copy=128
-        )
-        copy_atom_store_O = cute.make_copy_atom(
-            cute.nvgpu.CopyUniversalOp(), mO.element_type, num_bits_per_copy=128
-        )
-
-        thr_copy_X = cute.make_tiled_copy(copy_atom_load_X_async, tv_layout, tiler_mn).get_slice(
-            tidx
-        )
-        thr_copy_W = cute.make_tiled_copy(copy_atom_load_W, tv_layout, tiler_mn).get_slice(tidx)
-        thr_copy_O = cute.make_tiled_copy(copy_atom_store_O, tv_layout, tiler_mn).get_slice(tidx)
-
-        tWgW = thr_copy_W.partition_S(gW)
-        tXgX = thr_copy_X.partition_S(gX)
-        tXsX = thr_copy_X.partition_D(sX)
-        tXgO = thr_copy_O.partition_D(gO)
-        tXrRstd = thr_copy_O.partition_D(gRstd) if cutlass.const_expr(mRstd is not None) else None
-        tXrMean = thr_copy_O.partition_D(gMean) if cutlass.const_expr(mMean is not None) else None
-        tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]
-
-        # allocate fragments for gmem->rmem
-        tWrW = cute.make_fragment_like(tWgW)
-        tXrW = thr_copy_X.retile(tWrW)
-        tXrX, tXrO = [cute.make_fragment_like(thr) for thr in (tXgX, tXgO)]
-
-        num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
-        self._initialize_cluster(tidx, mbar_ptr, num_warps)
-
-        tXpX = utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
-        row = tXcX[0][0]
-        if row < shape[0]:
-            cute.copy(copy_atom_load_X_async, tXgX, tXsX, pred=tXpX)
-        cute.arch.cp_async_commit_group()
-
-        tWpW = utils.predicate_k(thr_copy_W.partition_S(cX), limit=shape[1])
-        if cutlass.const_expr(not delay_w_load):
-            cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
-
-        cute.arch.cp_async_wait_group(0)
-        cute.autovec_copy(tXsX, tXrX)
-        x = tXrX.load().to(cute.Float32)
-        threads_per_row = tv_layout.shape[0][0]
-        sum_x = row_reduce(
-            x,
-            cute.ReductionOp.ADD,
-            threads_per_row,
-            reduction_buffer[None, None, 0],
-            mbar_ptr + 0 if cutlass.const_expr(self.cluster_n > 1) else None,
-            init_val=0.0,
-            hook_fn=cute.arch.cluster_wait if cutlass.const_expr(self.cluster_n > 1) else None,
-        )
-        mean = sum_x / shape[1]
-        if cutlass.const_expr(reload_from == "smem"):
-            cute.autovec_copy(tXsX, tXrX)
-            x = tXrX.load().to(cute.Float32)
-        elif cutlass.const_expr(reload_from == "gmem"):
-            cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
-            x = tXrX.load().to(cute.Float32)
-
-        sum_sq_x_sub_mean = row_reduce(
-            (x - mean) * (x - mean),
-            cute.ReductionOp.ADD,
-            threads_per_row,
-            reduction_buffer[None, None, 1],
-            mbar_ptr + 1 if cutlass.const_expr(self.cluster_n > 1) else None,
-            init_val=0.0,
-        )
-        rstd = cute.math.rsqrt(sum_sq_x_sub_mean / shape[1] + eps, fastmath=True)
-        if cutlass.const_expr(mRstd is not None):
-            # Only the thread corresponding to column 0 writes out the rstd to gmem
-            if (
-                tXcX[0][1] == 0
-                and row < shape[0]
-                and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
-            ):
-                tXrRstd[0] = rstd
-        if cutlass.const_expr(mMean is not None):
-            # Only the thread corresponding to column 0 writes out the mean to gmem
-            if (
-                tXcX[0][1] == 0
-                and row < shape[0]
-                and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
-            ):
-                tXrMean[0] = mean
-        if cutlass.const_expr(delay_w_load):
-            cute.copy(copy_atom_load_W, tWgW, tWrW, pred=tWpW)
-        if cutlass.const_expr(reload_from == "smem"):
-            cute.autovec_copy(tXsX, tXrX)
-            x = tXrX.load().to(cute.Float32)
-        elif cutlass.const_expr(reload_from == "gmem"):
-            cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
-            x = tXrX.load().to(cute.Float32)
-        x_hat = (x - mean) * rstd
-        w = tXrW.load().to(cute.Float32)
-        y = x_hat * w
-        tXrO.store(y.to(tXrO.element_type))
-        tOpO = utils.predicate_k(thr_copy_O.partition_S(cX), limit=shape[1])
-        if row < shape[0]:
-            cute.copy(copy_atom_store_O, tXrO, tXgO, pred=tOpO)
-
-
-def layernorm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float = 1e-6,
-    return_rstd: bool = False,
-    return_mean: bool = False,
-) -> torch.Tensor:
-    """LayerNorm forward pass.
-
-    Args:
-        x: Input tensor of shape (M, N)
-        weight: Weight tensor of shape (N,)
-        eps: Small value for numerical stability
-        return_rstd: Whether to return the reciprocal standard deviation
-        return_mean: Whether to return the mean
-
-    Returns:
-        Normalized output tensor of same shape as x
-        If return_rstd is True, also returns rstd tensor of shape (M,)
-        If return_mean is True, also returns mean tensor of shape (M,)
-    """
-    assert x.dim() == 2, "Input must be 2D"
-    assert weight.dim() == 1, "Weight must be 1D"
-    assert x.shape[-1] == weight.shape[0], "Last dimension of input must match weight dimension"
-    assert x.is_cuda and weight.is_cuda, "Tensors must be on CUDA device"
-    assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported dtype"
-    assert weight.dtype == torch.float32, "Weight must be float32"
-    M, N = x.shape
-    device = x.device
-    out = torch.empty_like(x)
-    rstd = torch.empty(M, device=device, dtype=torch.float32) if return_rstd else None
-    mean = torch.empty(M, device=device, dtype=torch.float32) if return_mean else None
-    dtype = torch2cute_dtype_map[x.dtype]
-    convert_from_dlpack = lambda x: (
-        from_dlpack(x.detach(), assumed_align=16).mark_compact_shape_dynamic(
-            mode=0, stride_order=(0, 1)
-        )
-    )
-    x_tensor, out_tensor = [
-        # utils.convert_from_dlpack(t, leading_dim=t.ndim - 1, divisibility=128 // dtype.width)
-        convert_from_dlpack(t)
-        for t in (x, out)
-    ]
-    weight_tensor = utils.convert_from_dlpack(
-        weight.detach(), leading_dim=0, divisibility=128 // cutlass.Float32.width
-    )
-    rstd_tensor = (
-        from_dlpack(rstd.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
-        if rstd is not None
-        else None
-    )
-    mean_tensor = (
-        from_dlpack(mean.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
-        if mean is not None
-        else None
-    )
-    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    compile_key = (dtype, N, rstd is not None, mean is not None)
-    if compile_key not in layernorm.compile_cache:
-        rmsnorm_op = LayerNorm(dtype, N)
-        layernorm.compile_cache[compile_key] = cute.compile(
-            rmsnorm_op,
-            x_tensor,
-            weight_tensor,
-            out_tensor,
-            rstd_tensor,
-            mean_tensor,
-            current_stream,
-        )
-    layernorm.compile_cache[compile_key](
-        x_tensor, weight_tensor, out_tensor, rstd_tensor, mean_tensor, current_stream, eps
-    )
-    return (
-        (out, rstd, mean)
-        if return_mean and return_rstd
-        else (
-            (out, rstd)
-            if return_rstd and not return_mean
-            else ((out, mean) if return_mean and not return_rstd else (out))
-        )
-    )
-
-
-layernorm.compile_cache = {}
-
-
-def layernorm_ref(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6):
-    x_f32 = x.float()
-    return torch.nn.functional.layer_norm(x_f32, w.shape, w, None, eps).to(x.dtype)
-
-
-def rstd_ref(x: torch.Tensor, eps: float = 1e-6):
-    x_f32 = x.float()
-    mean = x_f32.mean(dim=-1, keepdim=True)
-    var = ((x_f32 - mean) ** 2).mean(dim=-1)
-    return 1.0 / torch.sqrt(var + eps)
-
-
-def mean_ref(x: torch.Tensor) -> torch.Tensor:
-    return x.float().mean(dim=-1)
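For reference, the deleted module's own reference helper (layernorm_ref, kept verbatim in the diff above) pins down the semantics the kernel implemented: normalize in fp32 with no bias term, scale by a weight, and cast back to the input dtype. A minimal standalone equivalent plus a call matching the shapes and dtypes asserted by the removed layernorm() wrapper (a sketch for context; this is not part of the 0.2.4 wheel):

import torch

def layernorm_ref(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Same semantics as the deleted quack.layernorm reference:
    # compute in fp32, no bias, cast back to the input dtype.
    x_f32 = x.float()
    return torch.nn.functional.layer_norm(x_f32, w.shape, w, None, eps).to(x.dtype)

# The removed wrapper required x of shape (M, N) in fp16/bf16/fp32 on CUDA
# and a fp32 weight of shape (N,); the pure-PyTorch version also runs on CPU.
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.randn(4096, device="cuda", dtype=torch.float32)
y = layernorm_ref(x, w)  # same shape and dtype as x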