quack-kernels 0.2.2-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/broadcast_utils.py +29 -0
  4. quack/compile_utils.py +19 -0
  5. quack/copy_utils.py +487 -0
  6. quack/cross_entropy.py +157 -233
  7. quack/cute_dsl_utils.py +20 -34
  8. quack/gemm.py +194 -0
  9. quack/{gemm_act_sm90.py → gemm_act.py} +218 -117
  10. quack/gemm_config.py +72 -46
  11. quack/{gemm_dact_sm90.py → gemm_dact.py} +53 -21
  12. quack/gemm_default_epi.py +259 -0
  13. quack/gemm_interface.py +177 -31
  14. quack/gemm_sm100.py +729 -506
  15. quack/{dense_gemm_sm90.py → gemm_sm90.py} +344 -814
  16. quack/gemm_symmetric.py +330 -0
  17. quack/gemm_wrapper_utils.py +3 -1
  18. quack/layout_utils.py +287 -0
  19. quack/linear.py +24 -16
  20. quack/pipeline.py +158 -3
  21. quack/reduce.py +88 -49
  22. quack/reduction_base.py +25 -36
  23. quack/rmsnorm.py +476 -526
  24. quack/sm100_utils.py +62 -0
  25. quack/sm90_utils.py +127 -0
  26. quack/softmax.py +135 -203
  27. quack/sort/bitonic_sort.py +13 -10
  28. quack/sort/utils.py +6 -6
  29. quack/tile_scheduler.py +23 -16
  30. quack/topk.py +409 -85
  31. quack/utils.py +32 -220
  32. quack/varlen_utils.py +370 -1
  33. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/METADATA +4 -2
  34. quack_kernels-0.2.4.dist-info/RECORD +44 -0
  35. quack/layernorm.py +0 -353
  36. quack/symmetric_dense_gemm_sm90.py +0 -2091
  37. quack_kernels-0.2.2.dist-info/RECORD +0 -37
  38. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/WHEEL +0 -0
  39. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/licenses/LICENSE +0 -0
  40. {quack_kernels-0.2.2.dist-info → quack_kernels-0.2.4.dist-info}/top_level.txt +0 -0
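
The headline change in 0.2.4 is that the SM90-only GEMM modules become architecture-generic ones covering both SM90 and SM100 (the renames above, plus the new quack/gemm.py, quack/gemm_sm100.py, and quack/gemm_symmetric.py). A rough sketch of how this surfaces at import level, taken from the quack/gemm_interface.py diff below rather than from separately verified 0.2.4 documentation:

# 0.2.2: SM90-only entry points
# from quack.dense_gemm_sm90 import gemm_sm90
# from quack.gemm_act_sm90 import gemm_act_sm90
# from quack.gemm_dact_sm90 import gemm_dact_sm90

# 0.2.4: architecture-generic entry points used for both SM90 and SM100
from quack.gemm import gemm
from quack.gemm_act import gemm_act
from quack.gemm_dact import gemm_dact
from quack.gemm_symmetric import gemm_symmetric  # new module in 0.2.4
from quack.cute_dsl_utils import get_device_capacity  # used to pick per-architecture defaults
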
quack/gemm_interface.py CHANGED
@@ -9,9 +9,11 @@ from torch import Tensor
 from quack.gemm_config import GemmConfig, get_all_configs

 from quack.autotuner import autotune, AutotuneConfig
-from quack.dense_gemm_sm90 import gemm_sm90
-from quack.gemm_act_sm90 import gemm_act_sm90
-from quack.gemm_dact_sm90 import gemm_dact_sm90
+from quack.cute_dsl_utils import get_device_capacity
+from quack.gemm import gemm as gemm_sm90_sm100
+from quack.gemm_act import gemm_act as gemm_act_sm90_sm100
+from quack.gemm_dact import gemm_dact as gemm_dact_sm90_sm100
+from quack.gemm_symmetric import gemm_symmetric as gemm_symmetric_sm90_sm100


 # Dictionary mapping activation names to PyTorch functions
@@ -34,6 +36,16 @@ gated_to_pytorch_fn_map = {
 }


+default_device_capacity = get_device_capacity(torch.device("cuda"))
+
+
+def default_config(device):
+    if get_device_capacity(device)[0] != 10:
+        return GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
+    else:
+        return GemmConfig(tile_m=256, tile_n=256, cluster_m=2, cluster_n=1, pingpong=False)
+
+
 def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
     kwargs = named_args | kwargs
     gather_A = kwargs.get("A_idx", None) is not None
@@ -41,17 +53,18 @@ def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
     if varlen_m or gather_A:  # Doesn't support swap_ab
         configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]
     if gather_A:
-        # tile_n == 208 causes register spills, as gather_A requires more registers for the producer
-        configs = [
-            conf
-            for conf in configs
-            if conf.kwargs["config"].cluster_n == 1 and conf.kwargs["config"].tile_n != 208
-        ]
+        if get_device_capacity(kwargs["A"].device)[0] == 9:
+            # tile_n == 208 causes register spills, as gather_A requires more registers for the producer
+            configs = [
+                conf
+                for conf in configs
+                if conf.kwargs["config"].cluster_n == 1 and conf.kwargs["config"].tile_n != 208
+            ]
     return configs


 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -61,6 +74,7 @@ def gemm_tuned(
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     alpha: float | Tensor = 1.0,  # (1,)
     beta: float | Tensor = 1.0,  # (1,)
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
@@ -72,7 +86,7 @@ def gemm_tuned(
     config: Optional[GemmConfig] = None,
 ) -> None:
     if config is None:
-        config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
+        config = default_config(A.device)
     varlen_m = cu_seqlens_m is not None
     varlen_k = cu_seqlens_k is not None
     varlen = varlen_m or varlen_k
@@ -91,6 +105,8 @@ def gemm_tuned(
         C = C.unsqueeze(0)  # (1, M, N)
     if out.ndim == 2 and not varlen_m:
         out = out.unsqueeze(0)
+    if bias is not None and bias.ndim == 1:
+        bias = bias.unsqueeze(0)  # (L, N)
     batch_size = B.shape[0] if not varlen_k else cu_seqlens_k.shape[0] - 1
     if varlen_m:
         # If gather_A (A_idx provided), use its length; otherwise use A.shape[0]
@@ -102,7 +118,7 @@ def gemm_tuned(
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_sm90(
+    gemm_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         out if not config.swap_ab else out.mT,
@@ -113,6 +129,10 @@ def gemm_tuned(
         config.cluster_m,
         config.cluster_n,
         config.pingpong,
+        persistent=True,
+        max_swizzle_size=config.max_swizzle_size,
+        rowvec_bias=bias if not config.swap_ab else None,
+        colvec_bias=bias if config.swap_ab else None,
         alpha=alpha,
         beta=beta,
         cu_seqlens_m=cu_seqlens_m,
@@ -124,7 +144,7 @@ def gemm_tuned(


 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -136,6 +156,7 @@ def gemm_act_tuned(
     preact_out: Optional[Tensor],
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
@@ -143,7 +164,7 @@ def gemm_act_tuned(
     config: Optional[GemmConfig] = None,
 ) -> None:
     if config is None:
-        config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
+        config = default_config(A.device)
     varlen_m = cu_seqlens_m is not None
     if varlen_m:
         assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
@@ -162,10 +183,12 @@ def gemm_act_tuned(
         PostAct = postact_out.unsqueeze(0)
     else:
         PostAct = postact_out
+    if bias is not None and bias.ndim == 1:
+        bias = bias.unsqueeze(0)  # (L, N)
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_act_sm90(
+    gemm_act_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         (D if not config.swap_ab else D.mT) if D is not None else None,
@@ -179,13 +202,16 @@ def gemm_act_tuned(
         config.cluster_n,
         config.pingpong,
         persistent=True,
+        max_swizzle_size=config.max_swizzle_size,
+        rowvec_bias=bias if not config.swap_ab else None,
+        colvec_bias=bias if config.swap_ab else None,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
     )


 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -203,7 +229,7 @@ def gemm_dact_tuned(
     config: Optional[GemmConfig] = None,
 ) -> None:
     if config is None:
-        config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
+        config = default_config(A.device)
     varlen_m = cu_seqlens_m is not None
     if varlen_m:
         assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
@@ -225,7 +251,7 @@ def gemm_dact_tuned(
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_dact_sm90(
+    gemm_dact_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         D if not config.swap_ab else D.mT,
@@ -239,6 +265,7 @@ def gemm_dact_tuned(
         config.cluster_n,
         config.pingpong,
         persistent=True,
+        max_swizzle_size=config.max_swizzle_size,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
     )
@@ -249,6 +276,7 @@ def gemm(
     A: Tensor,
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     alpha: float | Tensor = 1.0,
     out_dtype: Optional[torch.dtype] = None,
     cu_seqlens_m: Optional[Tensor] = None,
@@ -281,6 +309,7 @@ def gemm(
         A,
         B,
         out,
+        bias=bias,
         alpha=alpha,
         alpha_tensor=alpha_tensor,
         cu_seqlens_m=cu_seqlens_m,
@@ -299,13 +328,14 @@ def gemm(
     device_types="cuda",
     # We have to split out alpha and alpha_tensor since torch.library requires
     # each argument to have a fixed type
-    # schema="(Tensor A, Tensor B, Tensor(a2!) out, float alpha=1.0, Tensor? alpha_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
+    # schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? bias, float alpha=1.0, Tensor? alpha_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
 )
 def gemm_out(
     # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
     A: Tensor,
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     alpha: float = 1.0,
     alpha_tensor: Optional[Tensor] = None,
     cu_seqlens_m: Optional[Tensor] = None,
@@ -323,6 +353,7 @@ def gemm_out(
         B,
         out,
         C=None,
+        bias=bias,
         alpha=alpha,
         cu_seqlens_m=cu_seqlens_m,
         cu_seqlens_k=cu_seqlens_k,
@@ -337,6 +368,7 @@ def gemm_ref(
     A: Tensor,
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     alpha: float | Tensor = 1.0,
     cu_seqlens_m: Optional[Tensor] = None,
     cu_seqlens_k: Optional[Tensor] = None,
@@ -349,6 +381,11 @@ def gemm_ref(
     if cu_seqlens_m is None and cu_seqlens_k is None:
         fn = torch.bmm if A.ndim == 3 else torch.mm
         out = fn(A, B, out_dtype=out_dtype, out=out)
+        if not isinstance(alpha, float) or alpha != 1.0:
+            out *= alpha
+        if bias is not None:
+            bias = bias if A.ndim == 2 else bias.unsqueeze(1)
+            out += bias
     elif cu_seqlens_m is not None:
         # Handle varlen_m case
         if out is None:
@@ -362,6 +399,10 @@ def gemm_ref(
                 else A[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
             )
             torch.mm(A_slice, B[i], out=out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]])
+            if not isinstance(alpha, float) or alpha != 1.0:
+                out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] *= alpha
+            if bias is not None:
+                out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] += bias[i]
     else:  # cu_seqlens_k is not None
         L = cu_seqlens_k.shape[0] - 1
         if out is None:
@@ -373,8 +414,10 @@ def gemm_ref(
                 else A[:, cu_seqlens_k[i] : cu_seqlens_k[i + 1]]
             )
             torch.mm(A_slice, B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :], out=out[i])
-    if not isinstance(alpha, float) or alpha != 1.0:
-        out = out * alpha
+    if not isinstance(alpha, float) or alpha != 1.0:
+        out *= alpha
+    if bias is not None:
+        out += bias
     return out


@@ -488,6 +531,7 @@ def gemm_add_ref(
     A: Tensor,
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     C: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     alpha: float | Tensor = 1.0,
     beta: float | Tensor = 1.0,
@@ -499,7 +543,7 @@ def gemm_add_ref(
     """Reference implementation for GEMM with addition and pre-allocated output."""
     if cu_seqlens_m is None and cu_seqlens_k is None:
         if isinstance(alpha, float) and isinstance(beta, float):
-            return torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
+            out = torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
         else:
             out_dtype = (
                 out.dtype if out is not None else (out_dtype if out_dtype is not None else A.dtype)
@@ -507,7 +551,9 @@ def gemm_add_ref(
             result = (alpha * (A @ B) + beta * C).to(out_dtype)
             if out is not None:
                 out.copy_(result)
-            return result
+        if bias is not None:
+            bias = bias if A.ndim == 2 else bias.unsqueeze(1)
+            out += bias
     elif cu_seqlens_m is not None:
         # Handle varlen_m case
         if out is None:
@@ -524,6 +570,8 @@ def gemm_add_ref(
             C_slice = C[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
             out_slice = out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
             result = alpha * torch.mm(A_slice, B[i]) + beta * C_slice
+            if bias is not None:
+                result += bias[i]
             out_slice.copy_(result)
     else:  # cu_seqlens_k is not None
         # Handle varlen_k case
@@ -540,6 +588,8 @@ def gemm_add_ref(
             B_slice = B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :]
             result = alpha * torch.mm(A_slice, B_slice) + beta * C[i]
             out[i].copy_(result)
+        if bias is not None:
+            out += bias
     return out


@@ -639,6 +689,7 @@ def gemm_act(
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
     B: Tensor,  # (K, N) or (L, K, N)
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     preact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
@@ -667,7 +718,17 @@ def gemm_act(
     if postact_out is None:
         postact_out = torch.empty(out_shape, dtype=postact_dtype, device=A.device)
     gemm_act_out(
-        A, B, preact_out, postact_out, C, activation, cu_seqlens_m, A_idx, dynamic_scheduler, tuned
+        A,
+        B,
+        preact_out,
+        postact_out,
+        C,
+        bias,
+        activation,
+        cu_seqlens_m,
+        A_idx,
+        dynamic_scheduler,
+        tuned,
     )
     return preact_out, postact_out

@@ -676,7 +737,7 @@ def gemm_act(
     "quack::gemm_act_out",
     mutates_args=("preact_out", "postact_out"),
     device_types="cuda",
-    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
+    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
 )
 def gemm_act_out(
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
@@ -684,6 +745,7 @@ def gemm_act_out(
     preact_out: Optional[Tensor],  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
@@ -692,13 +754,14 @@ def gemm_act_out(
 ) -> None:
     """GEMM with activation and pre-allocated output tensors."""
     fn = gemm_act_tuned if tuned else partial(gemm_act_tuned.fn, config=None)
-    fn(A, B, preact_out, postact_out, C, activation, cu_seqlens_m, A_idx, dynamic_scheduler)
+    fn(A, B, preact_out, postact_out, C, bias, activation, cu_seqlens_m, A_idx, dynamic_scheduler)


 def gemm_act_ref(
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
@@ -709,9 +772,9 @@ def gemm_act_ref(
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     if C is None:
-        out = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
+        out = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     else:
-        out = gemm_add_ref(A, B, C, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
+        out = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     postact = act_to_pytorch_fn_map[activation](out).to(postact_dtype)
     return out.to(out_dtype) if store_preact else None, postact

@@ -806,6 +869,7 @@ def gemm_gated_ref(
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"] = "swiglu",
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
@@ -832,9 +896,9 @@ def gemm_gated_ref(
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     if C is None:
-        preact = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
+        preact = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     else:
-        preact = gemm_add_ref(A, B, C, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
+        preact = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     # Split preact into gate and up projections
     gate = preact[..., ::2]  # (M, N//2)
     up = preact[..., 1::2]  # (M, N//2)
@@ -873,14 +937,96 @@ def gemm_dgated_ref(
     # Split PreAct into gate and up projections
     gate = PreAct[..., ::2]  # (M, N)
     up = PreAct[..., 1::2]  # (M, N)
-    postact = gated_to_pytorch_fn_map[activation](gate, up)
     # Use autograd to compute gradients w.r.t. gate and up
+    gate_requires_grad, up_requires_grad = gate.requires_grad, up.requires_grad
+    gate.requires_grad_(True)
+    up.requires_grad_(True)
+    postact = gated_to_pytorch_fn_map[activation](gate, up)
     dgate, dup = torch.autograd.grad(postact, [gate, up], dout, create_graph=False)
+    gate.requires_grad_(gate_requires_grad)
+    up.requires_grad_(up_requires_grad)
     # Interleave gradients back
     dx = torch.stack([dgate, dup], dim=-1).reshape(PreAct.shape)
     return dx.to(out_dtype), postact.to(postact_dtype)


+@torch.library.custom_op(
+    "quack::gemm_symmetric_out",
+    mutates_args=("out",),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()",
+)
+def gemm_symmetric_out(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, M) or (L, K, M)
+    out: Tensor,  # (M, M) or (L, M, M)
+    C: Optional[Tensor] = None,  # (M, M) or (L, M, M)
+    dynamic_scheduler: bool = False,
+    alpha: float = 1.0,
+    beta: float = 1.0,
+) -> None:
+    """GEMM with guaranteed symmetric output."""
+    if A.ndim == 2:
+        A = A.unsqueeze(0)  # (1, M, K)
+    B = B.mT  # (M, K) or (L, M, K)
+    if B.ndim == 2:
+        B = B.unsqueeze(0)  # (1, M, K)
+    if C is not None and C.ndim == 2:
+        C = C.unsqueeze(0)  # (1, M, M)
+    if out.ndim == 2:
+        out = out.unsqueeze(0)
+    else:
+        out = out
+    tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
+    )
+    gemm_symmetric_sm90_sm100(
+        A,
+        B,
+        out if out is not None else None,
+        C if C is not None else None,
+        tile_count_semaphore,
+        tile_M=128,
+        tile_N=256,
+        cluster_M=2,
+        cluster_N=1,
+        pingpong=False,
+        persistent=True,
+        max_swizzle_size=8,
+        alpha=alpha,
+        beta=beta,
+    )
+
+
+def gemm_symmetric(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, M) or (L, K, M)
+    C: Optional[Tensor] = None,  # (M, M) or (L, M, M)
+    out: Optional[Tensor] = None,  # (M, M) or (L, M, M)
+    out_dtype: Optional[torch.dtype] = None,
+    dynamic_scheduler: bool = False,
+    alpha: float | Tensor = 1.0,
+    beta: float | Tensor = 1.0,
+) -> Tuple[Optional[Tensor], Tensor]:
+    """GEMM with symmetric output."""
+    out_dtype = A.dtype if out_dtype is None else out_dtype
+    # Determine output shape based on gather_A
+    if A.ndim == 2:
+        out_shape = (A.shape[0], B.shape[-1])
+    else:
+        out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
+    if out is None:
+        out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
+
+    alpha_val = alpha if isinstance(alpha, float) else 1.0
+    beta_val = beta if isinstance(beta, float) else 1.0
+
+    gemm_symmetric_out(
+        A, B, out, C, dynamic_scheduler=dynamic_scheduler, alpha=alpha_val, beta=beta_val
+    )
+    return out
+
+
 # TODO: this is not quite right, do we need to register gemm_add not gemm_add_out?
 # try:
 #     from torch._inductor.fx_passes.reinplace import InplaceableOp
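
For orientation, a minimal usage sketch of the interface additions shown above (the bias argument threaded through gemm and gemm_act, and the new gemm_symmetric helper). Shapes, dtypes, and the bf16/CUDA setup are illustrative assumptions, not documented requirements; check the 0.2.4 source for exact semantics.

import torch
from quack.gemm_interface import gemm, gemm_symmetric

# Illustrative sizes: M=1024, K=512, N=2048 on an SM90/SM100 GPU.
A = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
B = torch.randn(512, 2048, device="cuda", dtype=torch.bfloat16)
bias = torch.randn(2048, device="cuda", dtype=torch.bfloat16)  # (N,), added to every row of A @ B

out = torch.empty(1024, 2048, device="cuda", dtype=torch.bfloat16)
gemm(A, B, out, bias=bias)  # out = A @ B + bias; bias is forwarded to the kernel epilogue

# New in 0.2.4: symmetric GEMM, A (M, K) times B (K, M) giving an (M, M) result.
X = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
S = gemm_symmetric(X, X.mT)  # (1024, 1024), guaranteed symmetric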