mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/gemm/triton/grouped_gemm.py
@@ -0,0 +1,1132 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import functools
+import warnings
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+from triton.runtime import driver  # @manual
+
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    TMA_AVAILABLE = True
+except ImportError:
+    TMA_AVAILABLE = False
+    pass
+
+
+def _grouped_gemm_set_block_size_hook(nargs):
+    BLOCK_M = nargs["BLOCK_SIZE_M"]
+    BLOCK_N = nargs["BLOCK_SIZE_N"]
+    BLOCK_K = nargs["BLOCK_SIZE_K"]
+    if nargs["USE_TMA_LOAD"]:
+        nargs["a_desc_ptr"].block_shape = [BLOCK_M, BLOCK_K]
+        nargs["b_desc_ptr"].block_shape = [BLOCK_N, BLOCK_K]
+
+
+_NV_CONFIGS = [
+    triton.Config(
+        {
+            "BLOCK_SIZE_M": block_size_m,
+            "BLOCK_SIZE_N": block_size_n,
+            "BLOCK_SIZE_K": block_size_k,
+            "NUM_CONSUMER_GROUPS": 1,
+        },
+        num_stages=num_stages,
+        num_warps=num_warps,
+        num_ctas=num_ctas,
+        pre_hook=_grouped_gemm_set_block_size_hook,
+    )
+    for block_size_m in [64, 128]
+    for block_size_n in [64, 128, 256]
+    for block_size_k in [64, 128, 256]
+    for num_stages in [3, 4]
+    for num_warps in [4, 8]
+    for num_ctas in [1]
+]
+
+if TMA_AVAILABLE:
+    _NV_WS_CONFIGS = [
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": block_size_m,
+                "BLOCK_SIZE_N": block_size_n,
+                "BLOCK_SIZE_K": block_size_k,
+                "NUM_CONSUMER_GROUPS": 1,
+                "USE_TMA_STORE": use_tma_store,
+            },
+            num_stages=num_stages,
+            num_warps=num_warps,
+            num_ctas=num_ctas,
+            pre_hook=_grouped_gemm_set_block_size_hook,
+        )
+        for block_size_m in [64, 128, 256]
+        for block_size_n in [64, 128, 256]
+        for block_size_k in [64, 128, 256]
+        for num_stages in [2, 3, 4]
+        for num_warps in [4, 8, 16]
+        for num_ctas in [1]
+        for use_tma_store in [False]
+    ]
+else:
+    _NV_WS_CONFIGS = _NV_CONFIGS
+
+
+_AMD_CONFIGS = [
+    triton.Config(
+        {
+            "BLOCK_SIZE_M": block_size_m,
+            "BLOCK_SIZE_N": block_size_n,
+            "BLOCK_SIZE_K": block_size_k,
+            "waves_per_eu": waves_per_cu,
+            "matrix_instr_nonkdim": matrix_instr_nonkdim,
+            "NUM_CONSUMER_GROUPS": 1,
+        },
+        num_stages=num_stages,
+        num_warps=num_warps,
+    )
+    for block_size_m in [32, 64, 128]
+    for block_size_n in [32, 64, 128, 256]
+    for block_size_k in [128, 256]
+    for num_stages in [1, 2]
+    for num_warps, waves_per_cu in [(4, 1), (8, 2), (16, 4)]
+    for matrix_instr_nonkdim in [16]
+]
+
+
+def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
+    device = torch.cuda.current_device()
+    # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
+    if dtsize is None:
+        dtsize = named_args["c_ptr"].element_size()
+    if dtype is None:
+        dtype = named_args["c_ptr"].dtype
+
+    pruned_configs = []
+    for config in configs:
+        kw = config.kwargs
+        (
+            BLOCK_M,
+            BLOCK_N,
+            BLOCK_K,
+            num_stages,
+            use_tma_load_on_scales,
+        ) = (
+            kw["BLOCK_SIZE_M"],
+            kw["BLOCK_SIZE_N"],
+            kw["BLOCK_SIZE_K"],
+            config.num_stages,
+            kw.get("USE_TMA_LOAD_ON_SCALES", False),
+        )
+        G, M, N = (
+            named_args["G"],
+            named_args["M_BUCKET"],
+            named_args["N"],
+        )
+
+        # 1. make sure we have enough smem
+        max_shared_memory = driver.active.utils.get_device_properties(device)[
+            "max_shared_mem"
+        ]
+        if torch.version.hip:
+            required_shared_memory = BLOCK_N * BLOCK_K * num_stages * dtsize
+        else:
+            required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
+        if required_shared_memory > max_shared_memory:
+            continue
+
+        M_PER_GROUP = M // G
+        MIN_M_TILES = 32 if torch.version.hip else 64
+        # 2. make sure we don't load M tiles that are too big
+        if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
+            continue
+        # 3. make sure we don't load M tiles that are too small
+        if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
+            continue
+
+        num_sm = driver.active.utils.get_device_properties(device)[
+            "multiprocessor_count"
+        ]
+        N_TILES = (N + BLOCK_N - 1) // BLOCK_N
+        MIN_N_TILES = 32 if torch.version.hip else 64
+        # 4. make sure we don't load N tiles that are too big
+        if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
+            continue
+        # 5. make sure we don't load N tiles that are too small
+        if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
+            continue
+        if dtsize >= 2:
+            if use_tma_load_on_scales:
+                continue
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def early_config_prune_ws(configs, named_args, dtsize=None, dtype=None, **kwargs):
+    device = torch.cuda.current_device()
+    # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
+    if dtsize is None:
+        dtsize = named_args["c_ptr"].element_size()
+    if dtype is None:
+        dtype = named_args["c_ptr"].dtype
+
+    pruned_configs = []
+    for config in configs:
+        kw = config.kwargs
+        (
+            BLOCK_M,
+            BLOCK_N,
+            BLOCK_K,
+            num_stages,
+            use_tma_load_on_scales,
+        ) = (
+            kw["BLOCK_SIZE_M"],
+            kw["BLOCK_SIZE_N"],
+            kw["BLOCK_SIZE_K"],
+            config.num_stages,
+            kw.get("USE_TMA_LOAD_ON_SCALES", False),
+        )
+        G, M, N = (
+            named_args["G"],
+            named_args["M_BUCKET"],
+            named_args["N"],
+        )
+
+        # 1. make sure we have enough smem
+        max_shared_memory = driver.active.utils.get_device_properties(device)[
+            "max_shared_mem"
+        ]
+        if torch.version.hip:
+            required_shared_memory = BLOCK_N * BLOCK_K * num_stages * dtsize
+        else:
+            required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
+        if required_shared_memory > max_shared_memory:
+            continue
+
+        M_PER_GROUP = M // G
+        MIN_M_TILES = 32 if torch.version.hip else 64
+        # 2. make sure we don't load M tiles that are too big
+        if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
+            continue
+        # 3. make sure we don't load M tiles that are too small
+        if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
+            continue
+
+        num_sm = driver.active.utils.get_device_properties(device)[
+            "multiprocessor_count"
+        ]
+        N_TILES = (N + BLOCK_N - 1) // BLOCK_N
+        MIN_N_TILES = 32 if torch.version.hip else 64
+        # 4. make sure we don't load N tiles that are too big
+        if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
+            continue
+        # 5. make sure we don't load N tiles that are too small
+        if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
+            continue
+
+        if dtsize >= 2:
+            if use_tma_load_on_scales:
+                continue
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+@triton.autotune(
+    configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
+    key=["G", "M_BUCKET", "N", "K"],
+    prune_configs_by={"early_config_prune": early_config_prune},
+    restore_value=["c_ptr"],  # restore for scatter_add fusion
+)
+@triton.jit
+def _mslk_grouped_gemm(
+    a_desc_ptr,
+    b_desc_ptr,
+    c_ptr,
+    scatter_add_indices,
+    m_sizes,
+    bias_ptr,
+    token_weights_ptr,
+    # problem sizes
+    G: tl.constexpr,
+    M_BUCKET,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    FUSE_SCATTER_ADD: tl.constexpr,
+    USE_TMA_LOAD: tl.constexpr,
+    USE_TMA_STORE: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_TOKEN_WEIGHTS: tl.constexpr,
+    # tile sizes
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    NUM_CONSUMER_GROUPS: tl.constexpr,
+) -> None:
+    tl.static_assert(
+        not (FUSE_SCATTER_ADD and USE_TMA_STORE),
+        "Cannot fuse scatter add with TMA store!",
+    )
+
+    tidx = tl.program_id(0)
+
+    M_end_offset = 0
+    M_end_offset = M_end_offset.to(tl.int64)  # pyre-ignore
+    iterated_tiles = 0
+    for g in tl.range(G):
+        # Move across groups
+        m_size = tl.load(m_sizes + g)
+
+        if m_size > 0:
+            M_start_offset = M_end_offset
+            M_end_offset = M_start_offset + m_size
+            N_start_offset = g.to(tl.int64) * N
+            n_size = N
+
+            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
+            num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
+            num_tiles = num_m_tiles * num_n_tiles
+
+            if USE_TMA_STORE:
+                c_desc_ptr = tl.make_tensor_descriptor(
+                    c_ptr + M_start_offset * N,
+                    shape=[m_size, n_size],
+                    # pyre-ignore
+                    strides=[n_size, 1],
+                    block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+                )
+
+            # Move across tiles
+            while tidx >= iterated_tiles and tidx < iterated_tiles + num_tiles:
+                gidx = tidx - iterated_tiles
+                # Split M first and N second.
+                tile_m_idx = gidx % num_m_tiles
+                tile_n_idx = gidx // num_m_tiles
+
+                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+                if USE_TMA_LOAD:
+                    tl.static_assert(K % BLOCK_SIZE_K == 0)
+                    m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    for k_offset in range(0, K, BLOCK_SIZE_K):
+                        a = a_desc_ptr.load([m_offset, k_offset])
+                        b = b_desc_ptr.load([n_offset, k_offset])
+                        if USE_FAST_ACCUM:
+                            accumulator = tl.dot(a, b.T, accumulator)
+                        else:
+                            accumulator += tl.dot(a, b.T)
+                else:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    offs_k = tl.arange(0, BLOCK_SIZE_K)
+                    a_ptrs = (
+                        a_desc_ptr
+                        + (M_start_offset + offs_am[:, None]) * K
+                        + offs_k[None, :]
+                    )
+                    b_ptrs = (
+                        b_desc_ptr
+                        + (N_start_offset + offs_bn[:, None]) * K
+                        + offs_k[None, :]
+                    )
+                    for k_offset in range(0, K, BLOCK_SIZE_K):
+                        updated_k_offset = k_offset + offs_k
+                        updated_k_offset_mask = updated_k_offset[None, :] < K  # type: ignore[16]
+                        a = tl.load(
+                            a_ptrs,
+                            mask=((offs_am[:, None] < m_size) & updated_k_offset_mask),
+                            other=0.0,
+                        )
+                        b = tl.load(
+                            b_ptrs,
+                            mask=((offs_bn[:, None] < n_size) & updated_k_offset_mask),
+                            other=0.0,
+                        )
+                        accumulator += tl.dot(a, b.T)
+                        a_ptrs += BLOCK_SIZE_K
+                        b_ptrs += BLOCK_SIZE_K
+
+                if HAS_BIAS:
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    bias_ptrs = bias_ptr + g.to(tl.int64) * N + offs_bn
+                    bias = tl.load(bias_ptrs, mask=(offs_bn < n_size), other=0.0).to(
+                        accumulator.dtype
+                    )
+                    accumulator = accumulator + bias[None, :]
+
+                if HAS_TOKEN_WEIGHTS:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    tw_ptrs = token_weights_ptr + M_start_offset + offs_am
+                    tw = tl.load(tw_ptrs, mask=(offs_am < m_size), other=1.0).to(
+                        accumulator.dtype
+                    )
+                    accumulator = accumulator * tw[:, None]
+
+                if USE_TMA_STORE:
+                    m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    # pyre-ignore
+                    c_desc_ptr.store(
+                        [m_offset, n_offset], accumulator.to(c_ptr.dtype.element_ty)
+                    )
+                elif FUSE_SCATTER_ADD:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    mask = offs_am < m_size
+                    m_offsets = tl.load(
+                        scatter_add_indices + M_start_offset + offs_am,
+                        mask=mask,
+                        cache_modifier=".ca",
+                    )
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    c = accumulator.to(c_ptr.dtype.element_ty)
+                    tl.atomic_add(
+                        c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                        c,
+                        mask=mask[:, None] and offs_bn[None, :] < n_size,
+                        sem="relaxed",
+                    )
+                else:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    c = accumulator.to(c_ptr.dtype.element_ty)
+                    tl.store(
+                        c_ptr
+                        + (M_start_offset + offs_am[:, None]) * N
+                        + offs_bn[None, :],
+                        c,
+                        mask=offs_am[:, None] < m_size and offs_bn[None, :] < n_size,
+                    )
+                tidx += NUM_SMS
+
+            iterated_tiles += num_tiles
+
+
+# TODO(shikaili): Too much code duplication. Need to refactor.
+@triton.autotune(
+    configs=_NV_WS_CONFIGS,
+    key=["G", "M_BUCKET", "N", "K"],
+    prune_configs_by={"early_config_prune": early_config_prune_ws},
+    restore_value=["c_ptr"],  # restore for scatter_add fusion
+)
+@triton.jit
+def _mslk_grouped_gemm_ws(
+    a_desc_ptr,
+    b_desc_ptr,
+    c_ptr,
+    scatter_add_indices,
+    m_sizes,
+    bias_ptr,
+    token_weights_ptr,
+    # problem sizes
+    G: tl.constexpr,
+    M_BUCKET: tl.constexpr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    FUSE_SCATTER_ADD: tl.constexpr,
+    USE_TMA_LOAD: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_TOKEN_WEIGHTS: tl.constexpr,
+    # tile sizes
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    NUM_CONSUMER_GROUPS: tl.constexpr,
+    USE_TMA_STORE: tl.constexpr,
+) -> None:
+    tl.static_assert(USE_TMA_LOAD, "Always use TMA load with warp specialization!")
+    tl.static_assert(
+        not (FUSE_SCATTER_ADD and USE_TMA_STORE),
+        "Cannot fuse scatter add with TMA store!",
+    )
+
+    tidx = tl.program_id(0)
+
+    M_end_offset = 0
+    M_end_offset = M_end_offset.to(tl.int64)  # pyre-ignore
+    iterated_tiles = 0
+    for g in tl.range(G):
+        # Move across groups
+        m_size = tl.load(m_sizes + g, cache_modifier=".ca")
+
+        if m_size > 0:
+            M_start_offset = M_end_offset
+            M_end_offset = M_start_offset + m_size
+            N_start_offset = g.to(tl.int64) * N
+
+            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
+            tl.static_assert(N % BLOCK_SIZE_N == 0, f"{N=} {BLOCK_SIZE_N=}")
+            NUM_N_TILES: tl.constexpr = N // BLOCK_SIZE_N
+            num_tiles = num_m_tiles * NUM_N_TILES
+
+            if USE_TMA_STORE:
+                c_desc_ptr = tl.make_tensor_descriptor(
+                    c_ptr + M_start_offset * N,
+                    shape=[m_size, N],
+                    # pyre-ignore
+                    strides=[N, 1],
+                    block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+                )
+
+            # Move across tiles
+            next_iterated_tiles = iterated_tiles + num_tiles
+            if (tidx >= iterated_tiles) and (tidx < next_iterated_tiles):
+                for i in range(tidx, next_iterated_tiles, NUM_SMS):
+                    gidx = i - iterated_tiles
+                    # Split M first and N second.
+                    tile_m_idx = gidx % num_m_tiles
+                    tile_n_idx = gidx // num_m_tiles
+
+                    accumulator = tl.zeros(
+                        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
+                    )
+                    tl.static_assert(K % BLOCK_SIZE_K == 0)
+                    m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    for k_offset in range(0, K, BLOCK_SIZE_K):
+                        a = a_desc_ptr.load([m_offset, k_offset])
+                        b = b_desc_ptr.load([n_offset, k_offset])
+                        if USE_FAST_ACCUM:
+                            accumulator = tl.dot(a, b.T, accumulator)
+                        else:
+                            accumulator += tl.dot(a, b.T)
+
+                    if HAS_BIAS:
+                        offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                        bias_ptrs = bias_ptr + g.to(tl.int64) * N + offs_bn
+                        bias = tl.load(bias_ptrs).to(accumulator.dtype)
+                        accumulator = accumulator + bias[None, :]
+
+                    if HAS_TOKEN_WEIGHTS:
+                        offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                        tw_ptrs = token_weights_ptr + M_start_offset + offs_am
+                        tw = tl.load(tw_ptrs, mask=(offs_am < m_size), other=1.0).to(
+                            accumulator.dtype
+                        )
+                        accumulator = accumulator * tw[:, None]
+
+                    if USE_TMA_STORE:
+                        m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                        n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                        # pyre-ignore
+                        c_desc_ptr.store(
+                            [m_offset, n_offset],
+                            accumulator.to(c_ptr.dtype.element_ty),
+                        )
+                    elif FUSE_SCATTER_ADD:
+                        offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                        mask = offs_am < m_size
+                        m_offsets = tl.load(
+                            scatter_add_indices + M_start_offset + offs_am,
+                            mask=mask,
+                            cache_modifier=".ca",
+                        )
+                        offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                        c = accumulator.to(c_ptr.dtype.element_ty)
+                        tl.atomic_add(
+                            c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                            c,
+                            mask=mask[:, None],
+                            sem="relaxed",
+                        )
+                    else:
+                        offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                        offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                        c = accumulator.to(c_ptr.dtype.element_ty)
+                        tl.store(
+                            c_ptr
+                            + (M_start_offset + offs_am[:, None]) * N
+                            + offs_bn[None, :],
+                            c,
+                            mask=offs_am[:, None] < m_size,
+                            cache_modifier=".cs",
+                        )
+                    tidx += NUM_SMS
+
+            iterated_tiles += num_tiles
+
+
+TT_FP8_DTYPE = tl.float8e4b8 if torch.version.hip else tl.float8e4nv
+
+
+# TODO(shikaili): clean up redundant 'b_scale_desc_ptr' argument.
+@triton.autotune(
+    configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
+    key=["G", "M_BUCKET", "N", "K"],
+    prune_configs_by={
+        "early_config_prune": functools.partial(
+            early_config_prune, dtype=TT_FP8_DTYPE, dtsize=1
+        )
+    },
+    restore_value=["c_ptr"],  # restore for scatter_add fusion
+)
+@triton.jit
+def _mslk_grouped_gemm_fp8_rowwise(
+    a_desc_ptr,
+    a_scale_ptr,
+    b_desc_ptr,
+    b_scale_ptr,
+    b_scale_desc_ptr,
+    c_ptr,
+    scatter_add_indices,
+    m_sizes,
+    # problem sizes
+    G: tl.constexpr,
+    M_BUCKET,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    FUSE_SCATTER_ADD: tl.constexpr,
+    USE_TMA_LOAD: tl.constexpr,
+    USE_TMA_STORE: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
+    # tile sizes
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    NUM_CONSUMER_GROUPS: tl.constexpr,
+) -> None:
+    tl.static_assert(
+        not (FUSE_SCATTER_ADD and USE_TMA_STORE),
+        "Cannot fuse scatter add with TMA store!",
+    )
+
+    tidx = tl.program_id(0)
+
+    M_end_offset = 0
+    M_end_offset = M_end_offset.to(tl.int64)  # pyre-ignore
+    iterated_tiles = 0
+    for g in tl.range(G):
+        # Move across groups
+        m_size = tl.load(m_sizes + g)
+
+        if m_size > 0:
+            M_start_offset = M_end_offset
+            M_end_offset = M_start_offset + m_size
+            N_start_offset = g.to(tl.int64) * N
+            n_size = N
+
+            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
+            num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
+            num_tiles = num_m_tiles * num_n_tiles
+
+            if USE_TMA_STORE:
+                c_desc_ptr = tl.make_tensor_descriptor(
+                    c_ptr + M_start_offset * N,
+                    shape=[m_size, n_size],
+                    # pyre-ignore
+                    strides=[n_size, 1],
+                    block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+                )
+
+            # Move across tiles
+            while tidx >= iterated_tiles and tidx < iterated_tiles + num_tiles:
+                gidx = tidx - iterated_tiles
+                # Split M first and N second.
+                tile_m_idx = gidx % num_m_tiles
+                tile_n_idx = gidx // num_m_tiles
+
+                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+                tl.static_assert(K % BLOCK_SIZE_K == 0)
+                if USE_TMA_LOAD:
+                    m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    for k_offset in range(0, K, BLOCK_SIZE_K):
+                        a = a_desc_ptr.load([m_offset, k_offset])
+                        b = b_desc_ptr.load([n_offset, k_offset])
+                        if USE_FAST_ACCUM:
+                            accumulator = tl.dot(a, b.T, accumulator)
+                        else:
+                            accumulator += tl.dot(a, b.T)
+                else:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    offs_k = tl.arange(0, BLOCK_SIZE_K)
+                    a_ptrs = (
+                        a_desc_ptr
+                        + (M_start_offset + offs_am[:, None]) * K
+                        + offs_k[None, :]
+                    )
+                    b_ptrs = (
+                        b_desc_ptr
+                        + (N_start_offset + offs_bn[:, None]) * K
+                        + offs_k[None, :]
+                    )
+                    for _ in range(0, K, BLOCK_SIZE_K):
+                        a = tl.load(a_ptrs, mask=offs_am[:, None] < m_size)
+                        b = tl.load(b_ptrs, mask=offs_bn[:, None] < n_size)
+                        accumulator += tl.dot(a, b.T)
+                        a_ptrs += BLOCK_SIZE_K
+                        b_ptrs += BLOCK_SIZE_K
+
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                )
+                b_scale = tl.load(
+                    b_scale_ptr + N_start_offset + offs_bn[None, :],
+                    mask=offs_bn[None, :] < n_size,
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale
+
+                if USE_TMA_STORE:
+                    m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    # pyre-ignore
+                    c_desc_ptr.store([m_offset, n_offset], c.to(c_ptr.dtype.element_ty))
+                elif FUSE_SCATTER_ADD:
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    mask = offs_am < m_size
+                    m_offsets = tl.load(
+                        scatter_add_indices + M_start_offset + offs_am,
+                        mask=mask,
+                        cache_modifier=".ca",
+                    )
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    tl.atomic_add(
+                        c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                        c.to(c_ptr.dtype.element_ty),
+                        mask=mask[:, None] and offs_bn[None, :] < n_size,
+                        sem="relaxed",
+                    )
+                else:
+                    tl.store(
+                        c_ptr
+                        + (M_start_offset + offs_am[:, None]) * N
+                        + offs_bn[None, :],
+                        c,
+                        mask=offs_am[:, None] < m_size and offs_bn[None, :] < n_size,
+                    )
+                tidx += NUM_SMS
+
+            iterated_tiles += num_tiles
+
+
+# TODO(shikaili): Too much code duplication. Need to refactor.
+@triton.autotune(
+    configs=_NV_WS_CONFIGS,
+    key=["G", "M_BUCKET", "N", "K"],
+    prune_configs_by={
+        "early_config_prune": functools.partial(
+            early_config_prune_ws, dtype=TT_FP8_DTYPE, dtsize=1
+        )
+    },
+    restore_value=["c_ptr"],  # restore for scatter_add fusion
+)
+@triton.jit
+def _mslk_grouped_gemm_fp8_rowwise_ws(
+    a_desc_ptr,
+    a_scale_ptr,
+    b_desc_ptr,
+    b_scale_ptr,
+    c_ptr,
+    scatter_add_indices,
+    m_sizes,
+    # problem sizes
+    G: tl.constexpr,
+    M_BUCKET: tl.constexpr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    FUSE_SCATTER_ADD: tl.constexpr,
+    USE_TMA_LOAD: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
+    # tile sizes
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    NUM_CONSUMER_GROUPS: tl.constexpr,
+    USE_TMA_STORE: tl.constexpr,
+) -> None:
+    tl.static_assert(USE_TMA_LOAD, "Always use TMA load with warp specialization!")
+    tl.static_assert(
+        not (FUSE_SCATTER_ADD and USE_TMA_STORE),
+        "Cannot fuse scatter add with TMA store!",
+    )
+
+    tidx = tl.program_id(0)
+
+    M_end_offset = 0
+    M_end_offset = M_end_offset.to(tl.int64)  # pyre-ignore
+    iterated_tiles = 0
+    for g in tl.range(G):
+        # Move across groups
+        m_size = tl.load(m_sizes + g, cache_modifier=".ca")
+
+        if m_size > 0:
+            M_start_offset = M_end_offset
+            M_end_offset = M_start_offset + m_size
+            N_start_offset = g.to(tl.int64) * N
+
+            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
+            tl.static_assert(N % BLOCK_SIZE_N == 0)
+            NUM_N_TILES: tl.constexpr = N // BLOCK_SIZE_N
+            num_tiles = num_m_tiles * NUM_N_TILES
+
+            if USE_TMA_STORE:
+                c_desc_ptr = tl.make_tensor_descriptor(
+                    c_ptr + M_start_offset * N,
+                    shape=[m_size, N],
+                    # pyre-ignore
+                    strides=[N, 1],
+                    block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+                )
+
+            # Move across tiles
+            next_iterated_tiles = iterated_tiles + num_tiles
+            if (tidx >= iterated_tiles) and (tidx < next_iterated_tiles):
+                for i in range(tidx, next_iterated_tiles, NUM_SMS):
+                    gidx = i - iterated_tiles
+                    # Split M first and N second.
+                    tile_m_idx = gidx % num_m_tiles
+                    tile_n_idx = gidx // num_m_tiles
+
+                    accumulator = tl.zeros(
+                        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
+                    )
+                    tl.static_assert(K % BLOCK_SIZE_K == 0)
+
+                    m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                    n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                    for k_offset in range(0, K, BLOCK_SIZE_K):
+                        a = a_desc_ptr.load([m_offset, k_offset])
+                        b = b_desc_ptr.load([n_offset, k_offset])
+                        if USE_FAST_ACCUM:
+                            accumulator = tl.dot(a, b.T, accumulator)
+                        else:
+                            accumulator += tl.dot(a, b.T)
+
+                    offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                    offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                    a_scale = tl.load(
+                        a_scale_ptr + M_start_offset + offs_am[:, None],
+                        mask=offs_am[:, None] < m_size,
+                        cache_modifier=".ca",
+                    )
+                    b_scale = tl.load(
+                        b_scale_ptr + N_start_offset + offs_bn[None, :],
+                        cache_modifier=".ca",
+                    )
+                    c = accumulator.to(tl.float32) * a_scale * b_scale
+
+                    if USE_TMA_STORE:
+                        m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                        n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                        # pyre-ignore
+                        c_desc_ptr.store(
+                            [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
+                        )
+                    elif FUSE_SCATTER_ADD:
+                        offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                        mask = offs_am < m_size
+                        m_offsets = tl.load(
+                            scatter_add_indices + M_start_offset + offs_am,
+                            mask=mask,
+                            cache_modifier=".ca",
+                        )
+                        offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                        tl.atomic_add(
+                            c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                            c,
+                            mask=mask[:, None],
+                            sem="relaxed",
+                        )
+                    else:
+                        offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                        offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                        tl.store(
+                            c_ptr
+                            + (M_start_offset + offs_am[:, None]) * N
+                            + offs_bn[None, :],
+                            c,
+                            mask=offs_am[:, None] < m_size,
+                            cache_modifier=".cs",
+                        )
+                    tidx += NUM_SMS
+
+            iterated_tiles += num_tiles
+
+
+warnings.simplefilter("once")
+
+
+def _grouped_gemm(
+    *,
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    x_scale: Optional[torch.Tensor],
+    w_scale: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    token_weights: Optional[torch.Tensor],
+    use_fast_accum: bool,
+    use_warp_specialization: bool,
+    output_tensor: Optional[torch.Tensor],
+    scatter_add_indices: Optional[torch.Tensor],
+) -> torch.Tensor:
+    USE_TMA_LOAD = not torch.version.hip and TMA_AVAILABLE
+    USE_TMA_STORE = False
+
+    # TODO(shikaili): Check the readiness of WS on ROCm side in Meta's Triton.
+    if use_warp_specialization and torch.version.hip:
+        warnings.warn(
+            "Warp specialization is disabled as it is not supported on ROCm.",
+            stacklevel=2,
+        )
+        use_warp_specialization = False
+
+    if use_warp_specialization:
+        assert TMA_AVAILABLE, "TMA is not available"
+        USE_TMA_STORE = True  # Tuning decision
+
+    G = m_sizes.shape[0]
+
+    assert x.is_contiguous()
+    assert w.is_contiguous()
+    assert m_sizes.is_contiguous()
+
+    M, K = x.shape
+    N = w.shape[0] // G
+    assert K == w.shape[1]
+
+    if K % 8 != 0 or N % 8 != 0:
+        use_warp_specialization = False
+        USE_TMA_LOAD = False
+        USE_TMA_STORE = False
+        warnings.warn(
+            f"TMA load and warp specialization are disabled since K or N is not a multiple of 8: {K=}, {N=}.",
+            stacklevel=2,
+        )
+        assert x_scale is None, (
+            f"Quantisation is not supported yet when K or N is not a multiple of 8: {K=}, {N=}."
+        )
+
+        assert output_tensor is None, (
+            f"Fused scatter add has large rounding error when K or N is not a multiple of 8: {K=}, {N=}."
+        )
+
+    HAS_BIAS = bias is not None
+    if HAS_BIAS:
+        assert bias is not None  # for type checker
+        assert bias.is_contiguous(), "Bias must be contiguous"
+        assert len(bias.shape) == 2, f"Bias must be 2D, got shape {bias.shape}"
+        assert bias.shape[0] == G, f"Bias dim 0 must match G={G}, got {bias.shape[0]}"
+        assert bias.shape[1] == N, f"Bias dim 1 must match N={N}, got {bias.shape[1]}"
+
+    HAS_TOKEN_WEIGHTS = token_weights is not None
+    if HAS_TOKEN_WEIGHTS:
+        assert token_weights is not None  # for type checker
+        assert token_weights.is_contiguous(), "token_weights must be contiguous"
+        assert len(token_weights.shape) == 1, (
+            f"token_weights must be 1D, got shape {token_weights.shape}"
+        )
+        assert token_weights.shape[0] == M, (
+            f"token_weights dim 0 must match M={M}, got {token_weights.shape[0]}"
+        )
+
+    if output_tensor is None:
+        FUSE_SCATTER_ADD = False
+        assert scatter_add_indices is None
+        y = torch.empty((M, N), device=x.device, dtype=torch.bfloat16)
+    else:
+        FUSE_SCATTER_ADD = True
+        assert scatter_add_indices is not None
+        assert scatter_add_indices.is_contiguous()
+        assert scatter_add_indices.shape == (M,)
+        y = output_tensor
+    if M == 0 or N == 0:
+        return y
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    # A dummy block value that will be overwritten in the pre_hook when we have the real block size
+    dummy_block = [1, 1]
+
+    if USE_TMA_LOAD:
+        # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+        # argument, expected `List[int]` but got `Size`
+        desc_x = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+        # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+        # argument, expected `List[int]` but got `Size`
+        desc_w = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    else:
+        desc_x = x
+        desc_w = w
+
+    if USE_TMA_STORE:
+
+        def alloc_fn(size: int, alignment: int, stream: Optional[int]):
+            return torch.empty(size, device="cuda", dtype=torch.int8)
+
+        triton.set_allocator(alloc_fn)
+
+    def grid(META):
+        return (NUM_SMS,)
+
+    M_BUCKET_CAP = 16384
+    M_BUCKET = min(triton.next_power_of_2(M), M_BUCKET_CAP)
+    if x_scale is not None and w_scale is not None:
+        assert x_scale.is_contiguous()
+        assert w_scale.is_contiguous()
+        fn = (
+            _mslk_grouped_gemm_fp8_rowwise_ws
+            if use_warp_specialization
+            else _mslk_grouped_gemm_fp8_rowwise
+        )
+        if use_warp_specialization:
+            args = (
+                desc_x,
+                x_scale,
+                desc_w,
+                w_scale,
+                y,
+                scatter_add_indices,
+                m_sizes,
+                G,
+                M_BUCKET,
+                N,
+                K,
+                NUM_SMS,
+                FUSE_SCATTER_ADD,
+                USE_TMA_LOAD,
+                use_fast_accum,
+            )
+        else:
+            args = (
+                desc_x,
+                x_scale,
+                desc_w,
+                w_scale,
+                w_scale,  # b_scale_desc_ptr (unused, just passed for API compatibility)
+                y,
+                scatter_add_indices,
+                m_sizes,
+                G,
+                M_BUCKET,
+                N,
+                K,
+                NUM_SMS,
+                FUSE_SCATTER_ADD,
+                USE_TMA_LOAD,
+                USE_TMA_STORE,
+                use_fast_accum,
+            )
+        fn[grid](*args)
+    else:
+        assert x_scale is None
+        assert w_scale is None
+        fn = _mslk_grouped_gemm_ws if use_warp_specialization else _mslk_grouped_gemm
+        args = (
+            desc_x,
+            desc_w,
+            y,
+            scatter_add_indices,
+            m_sizes,
+            bias if HAS_BIAS else None,
+            token_weights if HAS_TOKEN_WEIGHTS else None,
+            G,
+            M_BUCKET,
+            N,
+            K,
+            NUM_SMS,
+            FUSE_SCATTER_ADD,
+            USE_TMA_LOAD,
+        )
+        if use_warp_specialization:
+            args += (use_fast_accum, HAS_BIAS, HAS_TOKEN_WEIGHTS)
+        else:
+            args += (USE_TMA_STORE, use_fast_accum, HAS_BIAS, HAS_TOKEN_WEIGHTS)
+        fn[grid](*args)
+
+    return y
+
+
+def grouped_gemm(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    token_weights: Optional[torch.Tensor] = None,
+    use_fast_accum: bool = True,
+    *,
+    _use_warp_specialization: bool = True,
+    _output_tensor: Optional[torch.Tensor] = None,
+    _scatter_add_indices: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    Grouped GEMM with optional bias addition and per-token weight scaling.
+
+    Performs: output = (x @ w.T + bias) * token_weights
+    where operations are grouped by experts.
+
+    Args:
+        x: Input tensor [M, K] where M is the total number of tokens across all experts
+        w: Weight tensor [G * N, K] where G is the number of experts
+        m_sizes: Tensor [G] indicating the number of tokens per expert
+        bias: Optional bias tensor [G, N], one bias vector per expert
+        token_weights: Optional per-token scaling weights [M] (e.g., router weights)
+        use_fast_accum: Enable fast accumulation for better performance
+        _use_warp_specialization: Flag for warp specialization
+        _output_tensor: Optional pre-allocated output tensor for scatter-add
+        _scatter_add_indices: Optional indices for the scatter-add operation
+
+    Returns:
+        Output tensor [M, N]
+    """
+    return _grouped_gemm(
+        x=x,
+        w=w,
+        m_sizes=m_sizes,
+        x_scale=None,
+        w_scale=None,
+        bias=bias,
+        token_weights=token_weights,
+        use_fast_accum=use_fast_accum,
+        use_warp_specialization=_use_warp_specialization,
+        output_tensor=_output_tensor,
+        scatter_add_indices=_scatter_add_indices,
+    )
+
+
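The contract documented in the `grouped_gemm` docstring above amounts to a per-expert loop over row blocks of `x`. The eager PyTorch sketch below illustrates that documented contract only; the helper name `grouped_gemm_reference` is ours and is not part of the package. It can serve as a correctness oracle when experimenting with the Triton kernels:

import torch

def grouped_gemm_reference(x, w, m_sizes, bias=None, token_weights=None):
    # Eager-mode illustration of the documented semantics:
    # output = (x @ w.T + bias) * token_weights, computed independently per expert.
    # x: [M, K], w: [G * N, K], m_sizes: [G], bias: [G, N], token_weights: [M].
    G = m_sizes.shape[0]
    M = x.shape[0]
    N = w.shape[0] // G
    out = torch.zeros((M, N), device=x.device, dtype=torch.bfloat16)
    start = 0
    for g in range(G):
        m = int(m_sizes[g])
        if m == 0:
            continue
        x_g = x[start : start + m]          # rows of x routed to expert g
        w_g = w[g * N : (g + 1) * N]        # [N, K] weight slice for expert g
        acc = x_g.float() @ w_g.float().T   # accumulate in FP32, as the kernels do
        if bias is not None:
            acc = acc + bias[g].float()
        if token_weights is not None:
            acc = acc * token_weights[start : start + m].float()[:, None]
        out[start : start + m] = acc.to(out.dtype)
        start += m
    return out

In a MoE setting, `m_sizes` would typically hold per-expert token counts for an input that has already been grouped by expert id; producing that grouping is outside the scope of this file.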
+def grouped_gemm_fp8_rowwise(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    x_scale: torch.Tensor,
+    w_scale: torch.Tensor,
+    use_fast_accum: bool = True,
+    *,
+    _use_warp_specialization: bool = True,
+    _output_tensor: Optional[torch.Tensor] = None,
+    _scatter_add_indices: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return _grouped_gemm(
+        x=x,
+        w=w,
+        m_sizes=m_sizes,
+        x_scale=x_scale,
+        w_scale=w_scale,
+        bias=None,
+        token_weights=None,
+        use_fast_accum=use_fast_accum,
+        use_warp_specialization=_use_warp_specialization,
+        output_tensor=_output_tensor,
+        scatter_add_indices=_scatter_add_indices,
+    )
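`grouped_gemm_fp8_rowwise` follows the same grouping, but takes FP8 operands with row-wise scales: the kernels above index `x_scale` per row of `x` (shape `[M]`) and `w_scale` per row of `w` (shape `[G * N]`), and apply `a_scale * b_scale` to the FP32 accumulator before the store. Below is a dequantize-and-matmul sketch of that contract, assuming `float8_e4m3fn` inputs and the scale shapes just described; the helper name is ours, not part of the package:

import torch

def grouped_gemm_fp8_rowwise_reference(x_fp8, w_fp8, m_sizes, x_scale, w_scale):
    # x_fp8: [M, K] float8, x_scale: [M]; w_fp8: [G * N, K] float8, w_scale: [G * N].
    G = m_sizes.shape[0]
    M = x_fp8.shape[0]
    N = w_fp8.shape[0] // G
    out = torch.zeros((M, N), device=x_fp8.device, dtype=torch.bfloat16)
    start = 0
    for g in range(G):
        m = int(m_sizes[g])
        if m == 0:
            continue
        x_g = x_fp8[start : start + m].float()   # upcast stands in for the FP8 tensor-core path
        w_g = w_fp8[g * N : (g + 1) * N].float()
        acc = x_g @ w_g.T                        # FP32 accumulator, as in the kernels
        # Row-wise scales: one per token row of x, one per output column (row of w).
        acc = acc * x_scale[start : start + m, None] * w_scale[g * N : (g + 1) * N][None, :]
        out[start : start + m] = acc.to(out.dtype)
        start += m
    return out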