quack-kernels 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. quack/__init__.py +1 -8
  2. quack/activation.py +366 -121
  3. quack/autotuner.py +64 -5
  4. quack/broadcast_utils.py +29 -0
  5. quack/compile_utils.py +19 -0
  6. quack/copy_utils.py +487 -0
  7. quack/cross_entropy.py +157 -233
  8. quack/cute_dsl_utils.py +20 -35
  9. quack/gemm.py +194 -0
  10. quack/gemm_act.py +510 -0
  11. quack/gemm_config.py +72 -46
  12. quack/gemm_dact.py +215 -0
  13. quack/gemm_default_epi.py +259 -0
  14. quack/gemm_interface.py +615 -146
  15. quack/{dense_gemm_sm100.py → gemm_sm100.py} +1034 -787
  16. quack/{dense_gemm_sm90.py → gemm_sm90.py} +552 -727
  17. quack/gemm_symmetric.py +330 -0
  18. quack/gemm_wrapper_utils.py +182 -23
  19. quack/layout_utils.py +287 -0
  20. quack/linear.py +24 -16
  21. quack/pipeline.py +158 -3
  22. quack/reduce.py +88 -49
  23. quack/reduction_base.py +25 -36
  24. quack/rmsnorm.py +508 -624
  25. quack/sm100_utils.py +62 -0
  26. quack/sm90_utils.py +127 -0
  27. quack/softmax.py +135 -203
  28. quack/sort/bitonic_sort.py +13 -10
  29. quack/sort/utils.py +6 -6
  30. quack/tile_scheduler.py +55 -61
  31. quack/topk.py +409 -85
  32. quack/utils.py +37 -172
  33. quack/varlen_utils.py +370 -6
  34. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/METADATA +4 -2
  35. quack_kernels-0.2.3.dist-info/RECORD +44 -0
  36. quack/gemm_act_sm90.py +0 -368
  37. quack/gemm_dact_sm90.py +0 -150
  38. quack/layernorm.py +0 -353
  39. quack/symmetric_dense_gemm_sm90.py +0 -2091
  40. quack_kernels-0.2.1.dist-info/RECORD +0 -37
  41. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/WHEEL +0 -0
  42. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/licenses/LICENSE +0 -0
  43. {quack_kernels-0.2.1.dist-info → quack_kernels-0.2.3.dist-info}/top_level.txt +0 -0
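For orientation (editorial note, not part of the registry diff): 0.2.3 renames the architecture-specific dense GEMM modules (dense_gemm_sm90.py → gemm_sm90.py, dense_gemm_sm100.py → gemm_sm100.py), adds unified wrappers (gemm.py, gemm_act.py, gemm_dact.py, gemm_default_epi.py, gemm_symmetric.py), and drops the SM90-only gemm_act_sm90.py, gemm_dact_sm90.py, symmetric_dense_gemm_sm90.py and layernorm.py. A minimal usage sketch of the public wrappers, assuming a CUDA device the kernels support:

import torch
from quack.gemm_interface import gemm, gemm_add

A = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
B = torch.randn(512, 2048, device="cuda", dtype=torch.bfloat16)
C = torch.randn(1024, 2048, device="cuda", dtype=torch.bfloat16)
D = gemm(A, B)                   # (1024, 2048); output allocated for you
E = gemm_add(A, B, C, beta=0.5)  # alpha * A @ B + beta * C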
quack/gemm_interface.py CHANGED
@@ -9,9 +9,11 @@ from torch import Tensor
 from quack.gemm_config import GemmConfig, get_all_configs
 
 from quack.autotuner import autotune, AutotuneConfig
-from quack.dense_gemm_sm90 import gemm_sm90
-from quack.gemm_act_sm90 import gemm_act_sm90
-from quack.gemm_dact_sm90 import gemm_dact_sm90
+from quack.cute_dsl_utils import get_device_capacity
+from quack.gemm import gemm as gemm_sm90_sm100
+from quack.gemm_act import gemm_act as gemm_act_sm90_sm100
+from quack.gemm_dact import gemm_dact as gemm_dact_sm90_sm100
+from quack.gemm_symmetric import gemm_symmetric as gemm_symmetric_sm90_sm100
 
 
 # Dictionary mapping activation names to PyTorch functions
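The interface no longer imports per-architecture kernels; a single dispatcher from quack.gemm (aliased gemm_sm90_sm100) serves both Hopper and Blackwell. A hedged migration sketch for downstream code that pinned the 0.2.1 module paths (the major-version reading of get_device_capacity is inferred from its use in the diff below):

import torch

# 0.2.1:  from quack.dense_gemm_sm90 import gemm_sm90
# 0.2.3:
from quack.gemm import gemm                      # dispatches per device capability
from quack.cute_dsl_utils import get_device_capacity

major = get_device_capacity(torch.device("cuda"))[0]  # 9 on SM90 (Hopper), 10 on SM100 (Blackwell)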
@@ -34,34 +36,89 @@ gated_to_pytorch_fn_map = {
 }
 
 
+default_device_capacity = get_device_capacity(torch.device("cuda"))
+
+
+def default_config(device):
+    if get_device_capacity(device)[0] != 10:
+        return GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
+    else:
+        return GemmConfig(tile_m=256, tile_n=256, cluster_m=2, cluster_n=1, pingpong=False)
+
+
+def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
+    kwargs = named_args | kwargs
+    gather_A = kwargs.get("A_idx", None) is not None
+    varlen_m = kwargs.get("cu_seqlens_m", None) is not None
+    if varlen_m or gather_A:  # Doesn't support swap_ab
+        configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]
+    if gather_A:
+        if get_device_capacity(kwargs["A"].device)[0] == 9:
+            # tile_n == 208 causes register spills, as gather_A requires more registers for the producer
+            configs = [
+                conf
+                for conf in configs
+                if conf.kwargs["config"].cluster_n == 1 and conf.kwargs["config"].tile_n != 208
+            ]
+    return configs
+
+
 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
 def gemm_tuned(
-    A: Tensor,  # (M, K)
-    B: Tensor,  # (K, N)
-    out: Tensor,  # (M, N) - required output tensor
-    C: Optional[Tensor] = None,  # (M, N)
+    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
+    A: Tensor,
+    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
+    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
     alpha: float | Tensor = 1.0,  # (1,)
     beta: float | Tensor = 1.0,  # (1,)
+    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
+    cu_seqlens_k: Optional[Tensor] = None,  # (L+1), int32
+    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
+    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
+    add_to_output: bool = False,
     dynamic_scheduler: bool = False,
     config: Optional[GemmConfig] = None,
 ) -> None:
     if config is None:
-        config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
-    A, B = A.unsqueeze(0), B.mT.unsqueeze(0)  # (1, M, K), (1, N, K)
-    if C is not None:
+        config = default_config(A.device)
+    varlen_m = cu_seqlens_m is not None
+    varlen_k = cu_seqlens_k is not None
+    varlen = varlen_m or varlen_k
+    gather_A = A_idx is not None
+    if gather_A:
+        assert varlen, "gather_A requires either varlen_m or varlen_k"
+        assert config.cluster_n == 1, "gather_A requires cluster_n=1"
+    if varlen_m:
+        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
+    if A.ndim == 2 and not varlen:
+        A = A.unsqueeze(0)  # (1, M, K)
+    B = B.mT  # (N, K) or (L, N, K) or (N, total_K)
+    if B.ndim == 2 and not varlen_k:
+        B = B.unsqueeze(0)  # (1, N, K)
+    if C is not None and C.ndim == 2 and not varlen_m:
         C = C.unsqueeze(0)  # (1, M, N)
-    assert out.shape == (
-        A.shape[1],
-        B.shape[1],
-    ), f"out shape mismatch: {out.shape} vs {(A.shape[1], B.shape[1])}"
-    out = out.unsqueeze(0)
+    if out.ndim == 2 and not varlen_m:
+        out = out.unsqueeze(0)
+    if bias is not None and bias.ndim == 1:
+        bias = bias.unsqueeze(0)  # (L, N)
+    batch_size = B.shape[0] if not varlen_k else cu_seqlens_k.shape[0] - 1
+    if varlen_m:
+        # If gather_A (A_idx provided), use its length; otherwise use A.shape[0]
+        total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
+        out_shape = (total_m, B.shape[-2])
+    else:
+        out_shape = (batch_size, A.shape[-2], B.shape[-2])
+    assert out.shape == out_shape, f"out shape mismatch: {out.shape} vs {out_shape}"
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_sm90(
+    gemm_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         out if not config.swap_ab else out.mT,
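default_config now picks a tile shape by compute-capability major version, and the autotuner filters candidates through prune_invalid_gemm_configs before benchmarking. The toy sketch below mirrors the swap_ab and cluster_n rules on a stand-in config class, purely to illustrate the early_config_prune contract; it does not use quack's AutotuneConfig machinery.

from dataclasses import dataclass

@dataclass
class StubConfig:  # stand-in for the GemmConfig fields the pruner inspects
    swap_ab: bool
    cluster_n: int
    tile_n: int

def prune(configs, named_args):
    # varlen_m or gather_A rule out swap_ab; gather_A additionally needs cluster_n == 1
    # (the SM90-only tile_n == 208 exclusion is omitted here)
    if named_args.get("cu_seqlens_m") is not None or named_args.get("A_idx") is not None:
        configs = [c for c in configs if not c.swap_ab]
    if named_args.get("A_idx") is not None:
        configs = [c for c in configs if c.cluster_n == 1]
    return configs

candidates = [StubConfig(False, 1, 192), StubConfig(True, 1, 192), StubConfig(False, 2, 208)]
print(prune(candidates, {"A_idx": object()}))  # only StubConfig(False, 1, 192) survives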
@@ -72,77 +129,129 @@ def gemm_tuned(
72
129
  config.cluster_m,
73
130
  config.cluster_n,
74
131
  config.pingpong,
132
+ persistent=True,
133
+ max_swizzle_size=config.max_swizzle_size,
134
+ rowvec_bias=bias if not config.swap_ab else None,
135
+ colvec_bias=bias if config.swap_ab else None,
75
136
  alpha=alpha,
76
137
  beta=beta,
138
+ cu_seqlens_m=cu_seqlens_m,
139
+ cu_seqlens_k=cu_seqlens_k,
140
+ A_idx=A_idx,
141
+ batch_idx_permute=batch_idx_permute,
142
+ add_to_output=add_to_output,
77
143
  )
78
144
 
79
145
 
80
146
  @autotune(
81
- configs=[AutotuneConfig(config=c) for c in get_all_configs()],
82
- key=["activation"],
147
+ configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
148
+ key=["activation", "dynamic_scheduler"],
149
+ prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
83
150
  )
84
151
  def gemm_act_tuned(
85
- A: Tensor, # (M, K)
86
- B: Tensor, # (K, N)
87
- preact_out: Optional[Tensor], # (M, N) - None if not storing preact
88
- postact_out: Tensor, # (M, N)
89
- C: Optional[Tensor] = None, # (M, N)
152
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
153
+ A: Tensor,
154
+ B: Tensor, # (K, N) or (L, K, N)
155
+ # (M, N) or (L, M, N) or (total_M, N) if varlen_m - None if not storing preact
156
+ preact_out: Optional[Tensor],
157
+ postact_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
158
+ C: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
159
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
90
160
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
161
+ cu_seqlens_m: Optional[Tensor] = None, # (L+1), int32
162
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
163
+ dynamic_scheduler: bool = False,
91
164
  config: Optional[GemmConfig] = None,
92
165
  ) -> None:
93
166
  if config is None:
94
- config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
95
- A, B = A.unsqueeze(0), B.mT.unsqueeze(0) # (1, M, K), (1, N, K)
96
- if C is not None:
167
+ config = default_config(A.device)
168
+ varlen_m = cu_seqlens_m is not None
169
+ if varlen_m:
170
+ assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
171
+ if A.ndim == 2 and not varlen_m:
172
+ A = A.unsqueeze(0) # (1, M, K)
173
+ B = B.mT # (N, K) or (L, N, K)
174
+ if B.ndim == 2:
175
+ B = B.unsqueeze(0) # (1, N, K)
176
+ if C is not None and C.ndim == 2 and not varlen_m:
97
177
  C = C.unsqueeze(0) # (1, M, N)
98
- if preact_out is not None:
99
- assert preact_out.shape == (A.shape[1], B.shape[1])
178
+ if preact_out is not None and preact_out.ndim == 2 and not varlen_m:
100
179
  D = preact_out.unsqueeze(0)
101
180
  else:
102
- D = None
103
- assert postact_out.shape == (A.shape[1], B.shape[1])
104
- PostAct = postact_out.unsqueeze(0)
105
- gemm_act_sm90(
181
+ D = preact_out
182
+ if postact_out.ndim == 2 and not varlen_m:
183
+ PostAct = postact_out.unsqueeze(0)
184
+ else:
185
+ PostAct = postact_out
186
+ if bias is not None and bias.ndim == 1:
187
+ bias = bias.unsqueeze(0) # (L, N)
188
+ tile_count_semaphore = (
189
+ torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
190
+ )
191
+ gemm_act_sm90_sm100(
106
192
  A if not config.swap_ab else B,
107
193
  B if not config.swap_ab else A,
108
194
  (D if not config.swap_ab else D.mT) if D is not None else None,
109
195
  (C if not config.swap_ab else C.mT) if C is not None else None,
110
196
  PostAct if not config.swap_ab else PostAct.mT,
197
+ tile_count_semaphore,
111
198
  activation,
112
199
  config.tile_m,
113
200
  config.tile_n,
114
201
  config.cluster_m,
115
202
  config.cluster_n,
116
203
  config.pingpong,
204
+ persistent=True,
205
+ max_swizzle_size=config.max_swizzle_size,
206
+ rowvec_bias=bias if not config.swap_ab else None,
207
+ colvec_bias=bias if config.swap_ab else None,
208
+ cu_seqlens_m=cu_seqlens_m,
209
+ A_idx=A_idx,
117
210
  )
118
211
 
119
212
 
120
213
  @autotune(
121
- configs=[AutotuneConfig(config=c) for c in get_all_configs()],
214
+ configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
122
215
  key=["activation", "dynamic_scheduler"],
216
+ prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
123
217
  )
124
218
  def gemm_dact_tuned(
125
- A: Tensor, # (M, K)
126
- B: Tensor, # (K, N)
127
- PreAct: Tensor, # (M, N)
128
- dx_out: Tensor, # (M, N)
129
- postact_out: Tensor, # (M, N)
219
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
220
+ A: Tensor,
221
+ B: Tensor, # (K, N) or (L, K, N)
222
+ PreAct: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
223
+ dx_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
224
+ postact_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
130
225
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
226
+ cu_seqlens_m: Optional[Tensor] = None, # (L+1), int32
227
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
131
228
  dynamic_scheduler: bool = True,
132
229
  config: Optional[GemmConfig] = None,
133
230
  ) -> None:
134
231
  if config is None:
135
- config = GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
136
- A, B = A.unsqueeze(0), B.mT.unsqueeze(0) # (1, M, K), (1, N, K)
137
- PreAct = PreAct.unsqueeze(0) # (1, M, N)
138
- assert dx_out.shape == (A.shape[1], B.shape[1])
139
- D = dx_out.unsqueeze(0)
140
- assert postact_out.shape == (A.shape[1], B.shape[1])
141
- PostAct = postact_out.unsqueeze(0)
232
+ config = default_config(A.device)
233
+ varlen_m = cu_seqlens_m is not None
234
+ if varlen_m:
235
+ assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
236
+ if A.ndim == 2 and not varlen_m:
237
+ A = A.unsqueeze(0) # (1, M, K)
238
+ B = B.mT # (N, K) or (L, N, K)
239
+ if B.ndim == 2:
240
+ B = B.unsqueeze(0) # (1, N, K)
241
+ if PreAct.ndim == 2 and not varlen_m:
242
+ PreAct = PreAct.unsqueeze(0) # (1, M, N)
243
+ if dx_out.ndim == 2 and not varlen_m:
244
+ D = dx_out.unsqueeze(0)
245
+ else:
246
+ D = dx_out
247
+ if postact_out.ndim == 2 and not varlen_m:
248
+ PostAct = postact_out.unsqueeze(0)
249
+ else:
250
+ PostAct = postact_out
142
251
  tile_count_semaphore = (
143
252
  torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
144
253
  )
145
- gemm_dact_sm90(
254
+ gemm_dact_sm90_sm100(
146
255
  A if not config.swap_ab else B,
147
256
  B if not config.swap_ab else A,
148
257
  D if not config.swap_ab else D.mT,
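gemm_tuned now understands packed variable-length batches. A plain-PyTorch sketch of the varlen_m semantics (matching the gemm_ref branch added later in this diff): cu_seqlens_m marks sequence boundaries along a packed M dimension and each sequence i is multiplied by its own B[i]; the shapes below are invented for illustration.

import torch

L, K, N = 2, 8, 16
cu_seqlens_m = torch.tensor([0, 3, 7], dtype=torch.int32)  # two sequences, lengths 3 and 4
A = torch.randn(7, K)        # (total_M, K), sequences packed along M
B = torch.randn(L, K, N)     # one weight matrix per sequence
out = torch.empty(7, N)      # (total_M, N)
for i in range(L):
    s, e = cu_seqlens_m[i], cu_seqlens_m[i + 1]
    out[s:e] = A[s:e] @ B[i]
# varlen_k is the transposed analogue: cu_seqlens_k splits a shared K dimension and
# produces an (L, M, N) output; gather_A replaces A[s:e] with A[A_idx[s:e]].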
@@ -155,30 +264,58 @@ def gemm_dact_tuned(
155
264
  config.cluster_m,
156
265
  config.cluster_n,
157
266
  config.pingpong,
267
+ persistent=True,
268
+ max_swizzle_size=config.max_swizzle_size,
269
+ cu_seqlens_m=cu_seqlens_m,
270
+ A_idx=A_idx,
158
271
  )
159
272
 
160
273
 
161
274
  def gemm(
275
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
162
276
  A: Tensor,
163
- B: Tensor,
164
- out: Optional[Tensor] = None,
277
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
278
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
279
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
165
280
  alpha: float | Tensor = 1.0,
166
281
  out_dtype: Optional[torch.dtype] = None,
282
+ cu_seqlens_m: Optional[Tensor] = None,
283
+ cu_seqlens_k: Optional[Tensor] = None,
284
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
285
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
167
286
  dynamic_scheduler: bool = False,
168
287
  tuned: bool = True,
169
288
  ) -> Tensor:
170
289
  """GEMM with optional output tensor and tuning control."""
171
290
  if out is None:
172
291
  out_dtype = A.dtype if out_dtype is None else out_dtype
173
- out = torch.empty((A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
292
+ varlen_m = cu_seqlens_m is not None
293
+ varlen_k = cu_seqlens_k is not None
294
+ if varlen_m:
295
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
296
+ out_shape = (total_m, B.shape[-1])
297
+ elif varlen_k:
298
+ L = cu_seqlens_k.shape[0] - 1
299
+ # For varlen_k, the first dimension is always A.shape[0] (M dimension)
300
+ out_shape = (L, A.shape[0], B.shape[-1])
301
+ else:
302
+ out_shape = (
303
+ (A.shape[0], B.shape[-1]) if A.ndim == 2 else (A.shape[0], A.shape[-2], B.shape[-1])
304
+ )
305
+ out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
174
306
  alpha_tensor = alpha if not isinstance(alpha, float) else None
175
307
  alpha = alpha if isinstance(alpha, float) else 1.0
176
308
  gemm_out(
177
309
  A,
178
310
  B,
179
311
  out,
312
+ bias=bias,
180
313
  alpha=alpha,
181
314
  alpha_tensor=alpha_tensor,
315
+ cu_seqlens_m=cu_seqlens_m,
316
+ cu_seqlens_k=cu_seqlens_k,
317
+ A_idx=A_idx,
318
+ batch_idx_permute=batch_idx_permute,
182
319
  dynamic_scheduler=dynamic_scheduler,
183
320
  tuned=tuned,
184
321
  )
@@ -191,53 +328,134 @@ def gemm(
191
328
  device_types="cuda",
192
329
  # We have to split out alpha and alpha_tensor since torch.library requires
193
330
  # each argument to have a fixed type
194
- # schema="(Tensor A, Tensor B, Tensor(a2!) out, float alpha=1.0, Tensor? alpha_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
331
+ # schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? bias, float alpha=1.0, Tensor? alpha_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
195
332
  )
196
333
  def gemm_out(
334
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
197
335
  A: Tensor,
198
- B: Tensor,
199
- out: Tensor,
336
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
337
+ out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
338
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
200
339
  alpha: float = 1.0,
201
340
  alpha_tensor: Optional[Tensor] = None,
341
+ cu_seqlens_m: Optional[Tensor] = None,
342
+ cu_seqlens_k: Optional[Tensor] = None,
343
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
344
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
202
345
  dynamic_scheduler: bool = False,
203
346
  tuned: bool = True,
204
347
  ) -> None:
205
348
  """GEMM with pre-allocated output tensor."""
206
349
  fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
207
350
  alpha = alpha_tensor if alpha_tensor is not None else alpha
208
- fn(A, B, out, C=None, alpha=alpha, dynamic_scheduler=dynamic_scheduler)
351
+ fn(
352
+ A,
353
+ B,
354
+ out,
355
+ C=None,
356
+ bias=bias,
357
+ alpha=alpha,
358
+ cu_seqlens_m=cu_seqlens_m,
359
+ cu_seqlens_k=cu_seqlens_k,
360
+ A_idx=A_idx,
361
+ batch_idx_permute=batch_idx_permute,
362
+ dynamic_scheduler=dynamic_scheduler,
363
+ )
209
364
 
210
365
 
211
366
  def gemm_ref(
367
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
212
368
  A: Tensor,
213
- B: Tensor,
214
- out: Optional[Tensor] = None,
369
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
370
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
371
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
215
372
  alpha: float | Tensor = 1.0,
373
+ cu_seqlens_m: Optional[Tensor] = None,
374
+ cu_seqlens_k: Optional[Tensor] = None,
375
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
216
376
  out_dtype: Optional[torch.dtype] = None,
217
377
  ) -> Tensor:
218
378
  """Reference implementation for GEMM with pre-allocated output."""
219
379
  # The out_dtype argument requires torch >= 2.8
220
- out = torch.mm(A, B, out_dtype=out_dtype, out=out)
221
- if not isinstance(alpha, float) or alpha != 1.0:
222
- out = out * alpha
380
+ out_dtype = A.dtype if out_dtype is None else out_dtype
381
+ if cu_seqlens_m is None and cu_seqlens_k is None:
382
+ fn = torch.bmm if A.ndim == 3 else torch.mm
383
+ out = fn(A, B, out_dtype=out_dtype, out=out)
384
+ if not isinstance(alpha, float) or alpha != 1.0:
385
+ out *= alpha
386
+ if bias is not None:
387
+ bias = bias if A.ndim == 2 else bias.unsqueeze(1)
388
+ out += bias
389
+ elif cu_seqlens_m is not None:
390
+ # Handle varlen_m case
391
+ if out is None:
392
+ # When gather_A (A_idx provided), output size is determined by A_idx length
393
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
394
+ out = torch.empty((total_m, B.shape[-1]), dtype=out_dtype, device=A.device)
395
+ for i in range(cu_seqlens_m.shape[0] - 1):
396
+ A_slice = (
397
+ A[A_idx[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]]
398
+ if A_idx is not None
399
+ else A[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
400
+ )
401
+ torch.mm(A_slice, B[i], out=out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]])
402
+ if not isinstance(alpha, float) or alpha != 1.0:
403
+ out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] *= alpha
404
+ if bias is not None:
405
+ out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] += bias[i]
406
+ else: # cu_seqlens_k is not None
407
+ L = cu_seqlens_k.shape[0] - 1
408
+ if out is None:
409
+ out = torch.empty((L, A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
410
+ for i in range(L):
411
+ A_slice = (
412
+ A[:, A_idx[cu_seqlens_k[i] : cu_seqlens_k[i + 1]]]
413
+ if A_idx is not None
414
+ else A[:, cu_seqlens_k[i] : cu_seqlens_k[i + 1]]
415
+ )
416
+ torch.mm(A_slice, B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :], out=out[i])
417
+ if not isinstance(alpha, float) or alpha != 1.0:
418
+ out *= alpha
419
+ if bias is not None:
420
+ out += bias
223
421
  return out
224
422
 
225
423
 
226
424
  def gemm_add(
425
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
227
426
  A: Tensor,
228
- B: Tensor,
229
- C: Tensor,
230
- out: Optional[Tensor] = None,
427
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
428
+ C: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
429
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
231
430
  alpha: float | Tensor = 1.0,
232
431
  beta: float | Tensor = 1.0,
233
432
  out_dtype: Optional[torch.dtype] = None,
433
+ cu_seqlens_m: Optional[Tensor] = None,
434
+ cu_seqlens_k: Optional[Tensor] = None,
435
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
436
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
234
437
  dynamic_scheduler: bool = False,
235
438
  tuned: bool = True,
236
439
  ) -> Tensor:
237
440
  """GEMM with addition and optional output tensor."""
238
441
  if out is None:
239
442
  out_dtype = A.dtype if out_dtype is None else out_dtype
240
- out = torch.empty((A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
443
+ varlen_m = cu_seqlens_m is not None
444
+ varlen_k = cu_seqlens_k is not None
445
+ if varlen_m:
446
+ # If A_idx is provided (gather_A), use its length; otherwise use A.shape[0]
447
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
448
+ out_shape = (total_m, B.shape[-1])
449
+ elif varlen_k:
450
+ L = cu_seqlens_k.shape[0] - 1
451
+ # For varlen_k, the first dimension is always A.shape[0] (M dimension)
452
+ out_shape = (L, A.shape[0], B.shape[-1])
453
+ else:
454
+ out_shape = (
455
+ (A.shape[0], B.shape[-1]) if A.ndim == 2 else (A.shape[0], A.shape[-2], B.shape[-1])
456
+ )
457
+ out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
458
+ add_to_output = C is out and isinstance(beta, float) and beta == 1.0 and cu_seqlens_m is None
241
459
  alpha_tensor = alpha if not isinstance(alpha, float) else None
242
460
  alpha = alpha if isinstance(alpha, float) else 1.0
243
461
  beta_tensor = beta if not isinstance(beta, float) else None
@@ -245,12 +463,17 @@ def gemm_add(
245
463
  gemm_add_out(
246
464
  A,
247
465
  B,
248
- C,
466
+ C if not add_to_output else None,
249
467
  out,
250
468
  alpha,
251
469
  beta,
252
470
  alpha_tensor,
253
471
  beta_tensor,
472
+ cu_seqlens_m=cu_seqlens_m,
473
+ cu_seqlens_k=cu_seqlens_k,
474
+ A_idx=A_idx,
475
+ batch_idx_permute=batch_idx_permute,
476
+ add_to_output=add_to_output,
254
477
  dynamic_scheduler=dynamic_scheduler,
255
478
  tuned=tuned,
256
479
  )
@@ -263,17 +486,23 @@ def gemm_add(
263
486
  device_types="cuda",
264
487
  # We have to split out alpha and alpha_tensor since torch.library requires
265
488
  # each argument to have a fixed type
266
- # schema="(Tensor A, Tensor B, Tensor C, Tensor(a3!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
489
+ # schema="(Tensor A, Tensor B, Tensor C, Tensor(a3!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, Tensor? cu_seqlens_m=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
267
490
  )
268
491
  def gemm_add_out(
492
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
269
493
  A: Tensor,
270
- B: Tensor,
271
- C: Tensor,
272
- out: Tensor,
494
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
495
+ C: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
496
+ out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
273
497
  alpha: float = 1.0,
274
498
  beta: float = 1.0,
275
499
  alpha_tensor: Optional[Tensor] = None,
276
500
  beta_tensor: Optional[Tensor] = None,
501
+ cu_seqlens_m: Optional[Tensor] = None,
502
+ cu_seqlens_k: Optional[Tensor] = None,
503
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
504
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
505
+ add_to_output: bool = False,
277
506
  dynamic_scheduler: bool = False,
278
507
  tuned: bool = True,
279
508
  ) -> None:
@@ -281,47 +510,112 @@ def gemm_add_out(
281
510
  fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
282
511
  alpha = alpha_tensor if alpha_tensor is not None else alpha
283
512
  beta = beta_tensor if beta_tensor is not None else beta
284
- fn(A, B, out, C, alpha=alpha, beta=beta, dynamic_scheduler=dynamic_scheduler)
513
+ fn(
514
+ A,
515
+ B,
516
+ out,
517
+ C,
518
+ alpha=alpha,
519
+ beta=beta,
520
+ cu_seqlens_m=cu_seqlens_m,
521
+ cu_seqlens_k=cu_seqlens_k,
522
+ A_idx=A_idx,
523
+ batch_idx_permute=batch_idx_permute,
524
+ add_to_output=add_to_output,
525
+ dynamic_scheduler=dynamic_scheduler,
526
+ )
285
527
 
286
528
 
287
529
  def gemm_add_ref(
530
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
288
531
  A: Tensor,
289
- B: Tensor,
290
- C: Tensor,
291
- out: Optional[Tensor] = None,
532
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
533
+ C: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
534
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
535
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
292
536
  alpha: float | Tensor = 1.0,
293
537
  beta: float | Tensor = 1.0,
538
+ cu_seqlens_m: Optional[Tensor] = None,
539
+ cu_seqlens_k: Optional[Tensor] = None,
540
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
294
541
  out_dtype: Optional[torch.dtype] = None,
295
542
  ) -> Tensor:
296
543
  """Reference implementation for GEMM with addition and pre-allocated output."""
297
- if isinstance(alpha, float) and isinstance(beta, float):
298
- return torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
299
- else:
300
- out_dtype = (
301
- out.dtype if out is not None else (out_dtype if out_dtype is not None else A.dtype)
302
- )
303
- result = (alpha * (A @ B) + beta * C).to(out_dtype)
304
- if out is not None:
305
- out.copy_(result)
306
- return result
544
+ if cu_seqlens_m is None and cu_seqlens_k is None:
545
+ if isinstance(alpha, float) and isinstance(beta, float):
546
+ out = torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
547
+ else:
548
+ out_dtype = (
549
+ out.dtype if out is not None else (out_dtype if out_dtype is not None else A.dtype)
550
+ )
551
+ result = (alpha * (A @ B) + beta * C).to(out_dtype)
552
+ if out is not None:
553
+ out.copy_(result)
554
+ if bias is not None:
555
+ bias = bias if A.ndim == 2 else bias.unsqueeze(1)
556
+ out += bias
557
+ elif cu_seqlens_m is not None:
558
+ # Handle varlen_m case
559
+ if out is None:
560
+ # When gather_A (A_idx provided), output size is determined by A_idx length
561
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
562
+ out_dtype = out_dtype if out_dtype is not None else A.dtype
563
+ out = torch.empty((total_m, B.shape[-1]), dtype=out_dtype, device=A.device)
564
+ for i in range(cu_seqlens_m.shape[0] - 1):
565
+ A_slice = (
566
+ A[A_idx[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]]
567
+ if A_idx is not None
568
+ else A[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
569
+ )
570
+ C_slice = C[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
571
+ out_slice = out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
572
+ result = alpha * torch.mm(A_slice, B[i]) + beta * C_slice
573
+ if bias is not None:
574
+ result += bias[i]
575
+ out_slice.copy_(result)
576
+ else: # cu_seqlens_k is not None
577
+ # Handle varlen_k case
578
+ L = cu_seqlens_k.shape[0] - 1
579
+ out_dtype = out_dtype if out_dtype is not None else A.dtype
580
+ if out is None:
581
+ out = torch.empty((L, A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
582
+ for i in range(L):
583
+ A_slice = (
584
+ A[:, A_idx[cu_seqlens_k[i] : cu_seqlens_k[i + 1]]]
585
+ if A_idx is not None
586
+ else A[:, cu_seqlens_k[i] : cu_seqlens_k[i + 1]]
587
+ )
588
+ B_slice = B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :]
589
+ result = alpha * torch.mm(A_slice, B_slice) + beta * C[i]
590
+ out[i].copy_(result)
591
+ if bias is not None:
592
+ out += bias
593
+ return out
307
594
 
308
595
 
309
596
  def gemm_add_inplace(
597
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
310
598
  A: Tensor,
311
- B: Tensor,
312
- out: Tensor,
599
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
600
+ out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
313
601
  alpha: float | Tensor = 1.0,
314
602
  beta: float | Tensor = 1.0,
603
+ cu_seqlens_m: Optional[Tensor] = None,
604
+ cu_seqlens_k: Optional[Tensor] = None,
605
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
606
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
315
607
  dynamic_scheduler: bool = False,
316
608
  tuned: bool = True,
317
609
  ) -> None:
318
610
  """In-place GEMM with addition: out = alpha * A @ B + beta * out.
319
611
  Args:
320
- A: (M, K) input tensor
321
- B: (K, N) input tensor
322
- out: (M, N) tensor to accumulate into (modified in-place)
612
+ A: (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k - input tensor
613
+ B: (K, N) or (L, K, N) or (total_K, N) if varlen_k - input tensor
614
+ out: (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k - tensor to accumulate into (modified in-place)
323
615
  alpha: Scalar multiplier for A @ B
324
616
  beta: Scalar multiplier for out
617
+ cu_seqlens_m: Optional cumulative sequence lengths for variable M
618
+ cu_seqlens_k: Optional cumulative sequence lengths for variable K
325
619
  dynamic_scheduler: Whether to use dynamic scheduler
326
620
  tuned: Whether to use autotuned configuration
327
621
  """
@@ -329,7 +623,21 @@ def gemm_add_inplace(
329
623
  alpha = alpha if isinstance(alpha, float) else 1.0
330
624
  beta_tensor = beta if not isinstance(beta, float) else None
331
625
  beta = beta if isinstance(beta, float) else 1.0
332
- gemm_add_inplace_op(A, B, out, alpha, beta, alpha_tensor, beta_tensor, dynamic_scheduler, tuned)
626
+ gemm_add_inplace_op(
627
+ A,
628
+ B,
629
+ out,
630
+ alpha,
631
+ beta,
632
+ alpha_tensor,
633
+ beta_tensor,
634
+ cu_seqlens_m,
635
+ cu_seqlens_k,
636
+ A_idx=A_idx,
637
+ batch_idx_permute=batch_idx_permute,
638
+ dynamic_scheduler=dynamic_scheduler,
639
+ tuned=tuned,
640
+ )
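gemm_add_inplace now lowers to the kernel's add_to_output path when beta is a plain 1.0, accumulating into out in the epilogue instead of passing out again as the C operand. A hedged numerical model of what the call computes; this is reference arithmetic only, not the kernel:

import torch

def gemm_add_inplace_model(A, B, out, alpha=1.0, beta=1.0):
    # out <- alpha * A @ B + beta * out (the real op does this on-GPU, in place)
    out.copy_(alpha * (A @ B) + beta * out)

A, B = torch.randn(4, 8), torch.randn(8, 5)
out = torch.randn(4, 5)
expected = A @ B + out                 # beta == 1.0 case, i.e. the add_to_output path
gemm_add_inplace_model(A, B, out)
torch.testing.assert_close(out, expected)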
333
641
 
334
642
 
335
643
  @torch.library.custom_op(
@@ -338,46 +646,90 @@ def gemm_add_inplace(
338
646
  device_types="cuda",
339
647
  # We have to split out alpha and alpha_tensor since torch.library requires
340
648
  # each argument to have a fixed type
341
- # schema="(Tensor A, Tensor B, Tensor(a2!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
649
+ # schema="(Tensor A, Tensor B, Tensor(a2!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, Tensor? cu_seqlens_m=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
342
650
  )
343
651
  def gemm_add_inplace_op(
652
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
344
653
  A: Tensor,
345
- B: Tensor,
346
- out: Tensor,
654
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
655
+ out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
347
656
  alpha: float = 1.0,
348
657
  beta: float = 1.0,
349
658
  alpha_tensor: Optional[Tensor] = None,
350
659
  beta_tensor: Optional[Tensor] = None,
660
+ cu_seqlens_m: Optional[Tensor] = None,
661
+ cu_seqlens_k: Optional[Tensor] = None,
662
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
663
+ batch_idx_permute: Optional[Tensor] = None, # (L,) permutation of batch indices for scheduler
351
664
  dynamic_scheduler: bool = False,
352
665
  tuned: bool = True,
353
666
  ) -> None:
354
667
  fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
355
668
  alpha = alpha_tensor if alpha_tensor is not None else alpha
356
669
  beta = beta_tensor if beta_tensor is not None else beta
357
- # Use C as both input bias and output
358
- fn(A, B, out, out, alpha=alpha, beta=beta, dynamic_scheduler=dynamic_scheduler)
670
+ add_to_output = isinstance(beta, float) and beta == 1.0 and cu_seqlens_m is None
671
+ # Use out as both input bias and output
672
+ fn(
673
+ A,
674
+ B,
675
+ out,
676
+ out if not add_to_output else None,
677
+ alpha=alpha,
678
+ beta=beta,
679
+ cu_seqlens_m=cu_seqlens_m,
680
+ cu_seqlens_k=cu_seqlens_k,
681
+ A_idx=A_idx,
682
+ batch_idx_permute=batch_idx_permute,
683
+ add_to_output=add_to_output,
684
+ dynamic_scheduler=dynamic_scheduler,
685
+ )
359
686
 
360
687
 
361
688
  def gemm_act(
362
- A: Tensor,
363
- B: Tensor,
364
- C: Optional[Tensor] = None,
689
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
690
+ B: Tensor, # (K, N) or (L, K, N)
691
+ C: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
692
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
365
693
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
366
- preact_out: Optional[Tensor] = None,
367
- postact_out: Optional[Tensor] = None,
694
+ preact_out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
695
+ postact_out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
368
696
  out_dtype: Optional[torch.dtype] = None,
369
697
  postact_dtype: Optional[torch.dtype] = None,
698
+ cu_seqlens_m: Optional[Tensor] = None,
699
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
370
700
  store_preact: bool = True,
701
+ dynamic_scheduler: bool = False,
371
702
  tuned: bool = True,
372
703
  ) -> Tuple[Optional[Tensor], Tensor]:
373
704
  """GEMM with activation and optional output tensors."""
374
705
  out_dtype = A.dtype if out_dtype is None else out_dtype
375
706
  postact_dtype = A.dtype if postact_dtype is None else postact_dtype
707
+ varlen_m = cu_seqlens_m is not None
708
+ # Determine output shape based on gather_A
709
+ if varlen_m:
710
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
711
+ out_shape = (total_m, B.shape[-1])
712
+ elif A.ndim == 2:
713
+ out_shape = (A.shape[0], B.shape[-1])
714
+ else:
715
+ out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
376
716
  if preact_out is None and store_preact:
377
- preact_out = torch.empty((A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
717
+ preact_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
378
718
  if postact_out is None:
379
- postact_out = torch.empty((A.shape[0], B.shape[1]), dtype=postact_dtype, device=A.device)
380
- gemm_act_out(A, B, preact_out, postact_out, C, activation, tuned)
719
+ postact_out = torch.empty(out_shape, dtype=postact_dtype, device=A.device)
720
+ gemm_act_out(
721
+ A,
722
+ B,
723
+ preact_out,
724
+ postact_out,
725
+ C,
726
+ bias,
727
+ activation,
728
+ cu_seqlens_m,
729
+ A_idx,
730
+ dynamic_scheduler,
731
+ tuned,
732
+ )
381
733
  return preact_out, postact_out
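gemm_act fuses the activation into the GEMM epilogue and optionally keeps the pre-activation for the backward pass. A minimal usage sketch (shapes invented; requires a CUDA device the kernels support):

import torch
from quack.gemm_interface import gemm_act

A = torch.randn(256, 128, device="cuda", dtype=torch.bfloat16)
B = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
preact, postact = gemm_act(A, B, activation="relu", store_preact=True)
# postact should match torch.relu(preact); with store_preact=False, preact is None.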
382
734
 
383
735
 
@@ -385,58 +737,81 @@ def gemm_act(
385
737
  "quack::gemm_act_out",
386
738
  mutates_args=("preact_out", "postact_out"),
387
739
  device_types="cuda",
388
- schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, str? activation=None, bool tuned=True) -> ()",
740
+ schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
389
741
  )
390
742
  def gemm_act_out(
391
- A: Tensor,
392
- B: Tensor,
393
- preact_out: Optional[Tensor],
394
- postact_out: Tensor,
395
- C: Optional[Tensor] = None,
743
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
744
+ B: Tensor, # (K, N) or (L, K, N)
745
+ preact_out: Optional[Tensor], # (M, N) or (L, M, N) or (total_M, N) if varlen_m
746
+ postact_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
747
+ C: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
748
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
396
749
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
750
+ cu_seqlens_m: Optional[Tensor] = None,
751
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
752
+ dynamic_scheduler: bool = False,
397
753
  tuned: bool = True,
398
754
  ) -> None:
399
755
  """GEMM with activation and pre-allocated output tensors."""
400
756
  fn = gemm_act_tuned if tuned else partial(gemm_act_tuned.fn, config=None)
401
- fn(A, B, preact_out, postact_out, C, activation)
757
+ fn(A, B, preact_out, postact_out, C, bias, activation, cu_seqlens_m, A_idx, dynamic_scheduler)
402
758
 
403
759
 
404
760
  def gemm_act_ref(
405
- A: Tensor,
406
- B: Tensor,
407
- C: Optional[Tensor] = None,
761
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
762
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
763
+ C: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
764
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
408
765
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
766
+ cu_seqlens_m: Optional[Tensor] = None,
767
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
409
768
  out_dtype: Optional[torch.dtype] = None,
410
769
  postact_dtype: Optional[torch.dtype] = None,
411
770
  store_preact: bool = True,
412
771
  ) -> Tuple[Optional[Tensor], Tensor]:
413
772
  out_dtype = A.dtype if out_dtype is None else out_dtype
414
773
  postact_dtype = A.dtype if postact_dtype is None else postact_dtype
415
- out = torch.mm(A, B) if C is None else C + torch.mm(A, B)
774
+ if C is None:
775
+ out = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
776
+ else:
777
+ out = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
416
778
  postact = act_to_pytorch_fn_map[activation](out).to(postact_dtype)
417
779
  return out.to(out_dtype) if store_preact else None, postact
418
780
 
419
781
 
420
782
  def gemm_dact(
421
- A: Tensor,
422
- B: Tensor,
423
- PreAct: Tensor,
783
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
784
+ B: Tensor, # (K, N) or (L, K, N)
785
+ PreAct: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
424
786
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
425
- dx_out: Optional[Tensor] = None,
426
- postact_out: Optional[Tensor] = None,
787
+ dx_out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
788
+ postact_out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
427
789
  out_dtype: Optional[torch.dtype] = None,
428
790
  postact_dtype: Optional[torch.dtype] = None,
791
+ cu_seqlens_m: Optional[Tensor] = None,
792
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
429
793
  dynamic_scheduler: bool = True,
430
794
  tuned: bool = True,
431
795
  ) -> Tuple[Tensor, Tensor]:
432
796
  """GEMM with activation gradient and optional output tensors."""
433
797
  out_dtype = A.dtype if out_dtype is None else out_dtype
434
798
  postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
799
+ varlen_m = cu_seqlens_m is not None
800
+ # Determine output shape based on gather_A
801
+ if varlen_m:
802
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
803
+ out_shape = (total_m, B.shape[-1])
804
+ elif A.ndim == 2:
805
+ out_shape = (A.shape[0], B.shape[-1])
806
+ else:
807
+ out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
435
808
  if dx_out is None:
436
- dx_out = torch.empty((A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
809
+ dx_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
437
810
  if postact_out is None:
438
- postact_out = torch.empty((A.shape[0], B.shape[1]), dtype=postact_dtype, device=A.device)
439
- gemm_dact_out(A, B, PreAct, dx_out, postact_out, activation, dynamic_scheduler, tuned)
811
+ postact_out = torch.empty(out_shape, dtype=postact_dtype, device=A.device)
812
+ gemm_dact_out(
813
+ A, B, PreAct, dx_out, postact_out, activation, cu_seqlens_m, A_idx, dynamic_scheduler, tuned
814
+ )
440
815
  return dx_out, postact_out
441
816
 
442
817
 
@@ -444,35 +819,39 @@ def gemm_dact(
444
819
  "quack::gemm_dact_out",
445
820
  mutates_args=("dx_out", "postact_out"),
446
821
  device_types="cuda",
447
- schema="(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, bool dynamic_scheduler=True, bool tuned=True) -> ()",
822
+ schema="(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> ()",
448
823
  )
449
824
  def gemm_dact_out(
450
- A: Tensor,
451
- B: Tensor,
452
- PreAct: Tensor,
453
- dx_out: Tensor,
454
- postact_out: Tensor,
825
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
826
+ B: Tensor, # (K, N) or (L, K, N)
827
+ PreAct: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
828
+ dx_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
829
+ postact_out: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
455
830
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
831
+ cu_seqlens_m: Optional[Tensor] = None,
832
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
456
833
  dynamic_scheduler: bool = True,
457
834
  tuned: bool = True,
458
835
  ) -> None:
459
836
  """GEMM with activation gradient and pre-allocated output tensors."""
460
837
  fn = gemm_dact_tuned if tuned else partial(gemm_dact_tuned.fn, config=None)
461
- fn(A, B, PreAct, dx_out, postact_out, activation, dynamic_scheduler)
838
+ fn(A, B, PreAct, dx_out, postact_out, activation, cu_seqlens_m, A_idx, dynamic_scheduler)
462
839
 
463
840
 
464
841
  def gemm_dact_ref(
465
- A: Tensor,
466
- B: Tensor,
467
- PreAct: Tensor,
842
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
843
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
844
+ PreAct: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
468
845
  activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
846
+ cu_seqlens_m: Optional[Tensor] = None,
847
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
469
848
  out_dtype: Optional[torch.dtype] = None,
470
849
  postact_dtype: Optional[torch.dtype] = None,
471
850
  ) -> Tuple[Tensor, Tensor]:
472
851
  """Reference implementation for GEMM with activation gradient."""
473
852
  out_dtype = A.dtype if out_dtype is None else out_dtype
474
853
  postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
475
- dout = torch.mm(A, B).to(out_dtype)
854
+ dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
476
855
  postact = act_to_pytorch_fn_map[activation](PreAct)
477
856
  # Compute gradient using autograd
478
857
  if activation is None:
@@ -487,10 +866,13 @@ def gemm_dact_ref(
487
866
 
488
867
 
489
868
  def gemm_gated_ref(
490
- A: Tensor,
491
- B: Tensor,
492
- C: Optional[Tensor] = None,
869
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
870
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
871
+ C: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
872
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
493
873
  activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"] = "swiglu",
874
+ cu_seqlens_m: Optional[Tensor] = None,
875
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
494
876
  out_dtype: Optional[torch.dtype] = None,
495
877
  postact_dtype: Optional[torch.dtype] = None,
496
878
  store_preact: bool = True,
@@ -499,8 +881,8 @@ def gemm_gated_ref(
499
881
 
500
882
  Args:
501
883
  A: (M, K) - input tensor
502
- B: (K, 2*N) - weight tensor with gate and up projections
503
- C: (M, 2*N) - optional bias tensor
884
+ B: (K, N) - weight tensor with gate and up projections
885
+ C: (M, N) - optional bias tensor
504
886
  activation: Type of gated activation
505
887
  out_dtype: Output dtype for preact
506
888
  postact_dtype: Output dtype for postact
@@ -508,24 +890,29 @@ def gemm_gated_ref(
508
890
 
509
891
  Returns:
510
892
  (preact, postact) where:
511
- - preact: (M, 2*N) pre-activation (if store_preact=True, else None)
512
- - postact: (M, N) post-activation output
893
+ - preact: (M, N) pre-activation (if store_preact=True, else None)
894
+ - postact: (M, N // 2) post-activation output
513
895
  """
514
896
  out_dtype = A.dtype if out_dtype is None else out_dtype
515
897
  postact_dtype = A.dtype if postact_dtype is None else postact_dtype
516
- preact = torch.mm(A, B) if C is None else C + torch.mm(A, B)
898
+ if C is None:
899
+ preact = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
900
+ else:
901
+ preact = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
517
902
  # Split preact into gate and up projections
518
- gate = preact[..., ::2] # (M, N)
519
- up = preact[..., 1::2] # (M, N)
903
+ gate = preact[..., ::2] # (M, N//2)
904
+ up = preact[..., 1::2] # (M, N//2)
520
905
  postact = gated_to_pytorch_fn_map[activation](gate, up)
521
906
  return preact.to(out_dtype) if store_preact else None, postact.to(postact_dtype)
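gemm_gated_ref treats the output's last dimension as interleaved (gate, up) pairs: even columns are the gate, odd columns the up projection, so an (M, N) pre-activation yields an (M, N // 2) post-activation. A toy sketch of the split for the swiglu case, assuming the usual silu(gate) * up definition:

import torch
import torch.nn.functional as F

preact = torch.randn(4, 8)                      # (M, N), N = 2 * N_half
gate, up = preact[..., ::2], preact[..., 1::2]  # each (M, N // 2)
postact = F.silu(gate) * up                     # swiglu, shape (4, 4)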
522
907
 
523
908
 
524
909
  def gemm_dgated_ref(
525
- A: Tensor,
526
- B: Tensor,
527
- PreAct: Tensor,
910
+ A: Tensor, # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
911
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
912
+ PreAct: Tensor, # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
528
913
  activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"],
914
+ cu_seqlens_m: Optional[Tensor] = None,
915
+ A_idx: Optional[Tensor] = None, # (total_M,) if gather_A with varlen_m
529
916
  out_dtype: Optional[torch.dtype] = None,
530
917
  postact_dtype: Optional[torch.dtype] = None,
531
918
  ) -> Tuple[Tensor, Tensor]:
@@ -546,18 +933,100 @@ def gemm_dgated_ref(
546
933
  """
547
934
  out_dtype = A.dtype if out_dtype is None else out_dtype
548
935
  postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
549
- dout = torch.mm(A, B).to(out_dtype)
936
+ dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
550
937
  # Split PreAct into gate and up projections
551
938
  gate = PreAct[..., ::2] # (M, N)
552
939
  up = PreAct[..., 1::2] # (M, N)
553
- postact = gated_to_pytorch_fn_map[activation](gate, up)
554
940
  # Use autograd to compute gradients w.r.t. gate and up
941
+ gate_requires_grad, up_requires_grad = gate.requires_grad, up.requires_grad
942
+ gate.requires_grad_(True)
943
+ up.requires_grad_(True)
944
+ postact = gated_to_pytorch_fn_map[activation](gate, up)
555
945
  dgate, dup = torch.autograd.grad(postact, [gate, up], dout, create_graph=False)
946
+ gate.requires_grad_(gate_requires_grad)
947
+ up.requires_grad_(up_requires_grad)
556
948
  # Interleave gradients back
557
949
  dx = torch.stack([dgate, dup], dim=-1).reshape(PreAct.shape)
558
950
  return dx.to(out_dtype), postact.to(postact_dtype)
559
951
 
560
952
 
953
+ @torch.library.custom_op(
954
+ "quack::gemm_symmetric_out",
955
+ mutates_args=("out",),
956
+ device_types="cuda",
957
+ schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()",
958
+ )
959
+ def gemm_symmetric_out(
960
+ A: Tensor, # (M, K) or (L, M, K)
961
+ B: Tensor, # (K, M) or (L, K, M)
962
+ out: Tensor, # (M, M) or (L, M, M)
963
+ C: Optional[Tensor] = None, # (M, M) or (L, M, M)
964
+ dynamic_scheduler: bool = False,
965
+ alpha: float = 1.0,
966
+ beta: float = 1.0,
967
+ ) -> None:
968
+ """GEMM with guaranteed symmetric output."""
969
+ if A.ndim == 2:
970
+ A = A.unsqueeze(0) # (1, M, K)
971
+ B = B.mT # (M, K) or (L, M, K)
972
+ if B.ndim == 2:
973
+ B = B.unsqueeze(0) # (1, M, K)
974
+ if C is not None and C.ndim == 2:
975
+ C = C.unsqueeze(0) # (1, M, M)
976
+ if out.ndim == 2:
977
+ out = out.unsqueeze(0)
978
+ else:
979
+ out = out
980
+ tile_count_semaphore = (
981
+ torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
982
+ )
983
+ gemm_symmetric_sm90_sm100(
984
+ A,
985
+ B,
986
+ out if out is not None else None,
987
+ C if C is not None else None,
988
+ tile_count_semaphore,
989
+ tile_M=128,
990
+ tile_N=256,
991
+ cluster_M=2,
992
+ cluster_N=1,
993
+ pingpong=False,
994
+ persistent=True,
995
+ max_swizzle_size=8,
996
+ alpha=alpha,
997
+ beta=beta,
998
+ )
999
+
1000
+
1001
+ def gemm_symmetric(
1002
+ A: Tensor, # (M, K) or (L, M, K)
1003
+ B: Tensor, # (K, M) or (L, K, M)
1004
+ C: Optional[Tensor] = None, # (M, M) or (L, M, M)
1005
+ out: Optional[Tensor] = None, # (M, M) or (L, M, M)
1006
+ out_dtype: Optional[torch.dtype] = None,
1007
+ dynamic_scheduler: bool = False,
1008
+ alpha: float | Tensor = 1.0,
1009
+ beta: float | Tensor = 1.0,
1010
+ ) -> Tuple[Optional[Tensor], Tensor]:
1011
+ """GEMM with symmetric output."""
1012
+ out_dtype = A.dtype if out_dtype is None else out_dtype
1013
+ # Determine output shape based on gather_A
1014
+ if A.ndim == 2:
1015
+ out_shape = (A.shape[0], B.shape[-1])
1016
+ else:
1017
+ out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
1018
+ if out is None:
1019
+ out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
1020
+
1021
+ alpha_val = alpha if isinstance(alpha, float) else 1.0
1022
+ beta_val = beta if isinstance(beta, float) else 1.0
1023
+
1024
+ gemm_symmetric_out(
1025
+ A, B, out, C, dynamic_scheduler=dynamic_scheduler, alpha=alpha_val, beta=beta_val
1026
+ )
1027
+ return out
1028
+
1029
+
561
1030
  # TODO: this is not quite right, do we need to register gemm_add not gemm_add_out?
562
1031
  # try:
563
1032
  # from torch._inductor.fx_passes.reinplace import InplaceableOp
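gemm_symmetric and gemm_symmetric_out are new in 0.2.3 and wrap quack.gemm_symmetric with fixed tile and cluster sizes. A hedged usage sketch for the common Gram-matrix case; the symmetry guarantee itself is the kernel's documented contract, not something enforced in this wrapper:

import torch
from quack.gemm_interface import gemm_symmetric

A = torch.randn(1024, 256, device="cuda", dtype=torch.bfloat16)
out = gemm_symmetric(A, A.mT)            # (1024, 1024)
torch.testing.assert_close(out, out.mT)  # "guaranteed symmetric output"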