PyPI - mslk-cuda-nightly - Versions diffs - 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

mslk/__init__.py +56 -0
mslk/attention/__init__.py +7 -0
mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
mslk/attention/flash_attn/__init__.py +22 -0
mslk/attention/flash_attn/ampere_helpers.py +104 -0
mslk/attention/flash_attn/barrier.py +72 -0
mslk/attention/flash_attn/benchmark.py +269 -0
mslk/attention/flash_attn/blackwell_helpers.py +754 -0
mslk/attention/flash_attn/block_info.py +109 -0
mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
mslk/attention/flash_attn/block_sparsity.py +219 -0
mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
mslk/attention/flash_attn/copy_utils.py +341 -0
mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
mslk/attention/flash_attn/fast_math.py +22 -0
mslk/attention/flash_attn/flash_bwd.py +1262 -0
mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
mslk/attention/flash_attn/flash_fwd.py +2471 -0
mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
mslk/attention/flash_attn/hopper_helpers.py +102 -0
mslk/attention/flash_attn/interface.py +1771 -0
mslk/attention/flash_attn/mask.py +610 -0
mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
mslk/attention/flash_attn/named_barrier.py +32 -0
mslk/attention/flash_attn/pack_gqa.py +165 -0
mslk/attention/flash_attn/paged_kv.py +176 -0
mslk/attention/flash_attn/pipeline.py +273 -0
mslk/attention/flash_attn/seqlen_info.py +139 -0
mslk/attention/flash_attn/softmax.py +583 -0
mslk/attention/flash_attn/testing.py +424 -0
mslk/attention/flash_attn/tile_scheduler.py +720 -0
mslk/attention/flash_attn/utils.py +860 -0
mslk/attention/fmha/__init__.py +967 -0
mslk/attention/fmha/_triton/__init__.py +6 -0
mslk/attention/fmha/_triton/available.py +50 -0
mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
mslk/attention/fmha/attn_bias.py +2186 -0
mslk/attention/fmha/attn_bias_utils.py +536 -0
mslk/attention/fmha/ck.py +508 -0
mslk/attention/fmha/ck_decoder.py +141 -0
mslk/attention/fmha/ck_splitk.py +204 -0
mslk/attention/fmha/common.py +598 -0
mslk/attention/fmha/cutlass.py +461 -0
mslk/attention/fmha/cutlass_blackwell.py +560 -0
mslk/attention/fmha/dispatch.py +224 -0
mslk/attention/fmha/flash.py +862 -0
mslk/attention/fmha/flash3.py +858 -0
mslk/attention/fmha/flash_mtia.py +245 -0
mslk/attention/fmha/merge_training.py +192 -0
mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
mslk/attention/fmha/torch_attention_compat.py +154 -0
mslk/attention/fmha/tree_attention.py +718 -0
mslk/attention/fmha/triton_splitk.py +1378 -0
mslk/attention/fmha/unbind.py +130 -0
mslk/attention/fmha/utils/__init__.py +6 -0
mslk/attention/fmha/utils/bench.py +74 -0
mslk/attention/fmha/utils/cpp_lib.py +148 -0
mslk/attention/fmha/utils/op_common.py +65 -0
mslk/attention/gqa_attn_splitk/__init__.py +11 -0
mslk/bench/comm/__init__.py +7 -0
mslk/bench/comm/comm_bench.py +255 -0
mslk/bench/common/__init__.py +5 -0
mslk/bench/common/utils.py +148 -0
mslk/bench/conv/__init__.py +7 -0
mslk/bench/conv/conv_bench.py +551 -0
mslk/bench/conv/conv_ops.py +213 -0
mslk/bench/gemm/__init__.py +7 -0
mslk/bench/gemm/gemm_bench.py +859 -0
mslk/bench/gemm/gemm_ops.py +3342 -0
mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
mslk/bench/moe/__init__.py +7 -0
mslk/bench/moe/gather_scatter_bench.py +356 -0
mslk/bench/quantize/quantize_bench.py +345 -0
mslk/bench/quantize/quantize_ops.py +266 -0
mslk/comm/__init__.py +11 -0
mslk/conv/__init__.py +11 -0
mslk/gemm/__init__.py +18 -0
mslk/gemm/triton/__init__.py +7 -0
mslk/gemm/triton/fp8_gemm.py +2702 -0
mslk/gemm/triton/grouped_gemm.py +1132 -0
mslk/gemm/triton/matmul_perf_model.py +237 -0
mslk/gemm/triton/utils.py +128 -0
mslk/kv_cache/__init__.py +11 -0
mslk/moe/__init__.py +26 -0
mslk/moe/activation.py +291 -0
mslk/moe/gather_scatter.py +739 -0
mslk/moe/layers.py +1240 -0
mslk/moe/shuffling.py +421 -0
mslk/mslk.so +0 -0
mslk/quantize/__init__.py +11 -0
mslk/quantize/shuffle.py +306 -0
mslk/quantize/triton/__init__.py +7 -0
mslk/quantize/triton/fp4_quantize.py +5942 -0
mslk/quantize/triton/fp8_quantize.py +1902 -0
mslk/testing/__init__.py +7 -0
mslk/testing/attributes.py +60 -0
mslk/testing/rocm.py +91 -0
mslk/utils/__init__.py +7 -0
mslk/utils/torch/__init__.py +7 -0
mslk/utils/torch/library.py +150 -0
mslk/utils/triton/__init__.py +7 -0
mslk/utils/triton/fp8_utils.py +72 -0
mslk/utils/triton/utils.py +128 -0
mslk/version.py +11 -0
mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0

mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py ADDED Viewed

@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+from typing import Callable, Dict, List
+import click
+import pandas as pd
+import torch
+import triton  # @manual
+from mslk.gemm.triton.grouped_gemm import grouped_gemm
+def triton_fused_bench(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    bias: torch.Tensor,
+    token_weights: torch.Tensor,
+) -> Callable[[], torch.Tensor]:
+    """Factory for Triton fused grouped_gemm + bias + token_weights."""
+    def run() -> torch.Tensor:
+        return grouped_gemm(x, w, m_sizes, bias=bias, token_weights=token_weights)
+    return run
+@torch.compile(mode="reduce-overhead")
+def _torch_bmm_bias_scale(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    bias: torch.Tensor,
+    token_weights: torch.Tensor,
+    G: int,
+    M_per_group: int,
+) -> torch.Tensor:
+    """Compiled torch baseline: bmm + bias + scale."""
+    N = w.shape[0] // G
+    K = w.shape[1]
+    x_3d = x.view(G, M_per_group, K)
+    w_3d = w.view(G, N, K)
+    out = torch.bmm(x_3d, w_3d.transpose(-1, -2))
+    out = out + bias.unsqueeze(1)
+    out = out * token_weights.view(G, M_per_group, 1)
+    return out.view(-1, N)
+def torch_baseline_bench(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    bias: torch.Tensor,
+    token_weights: torch.Tensor,
+    G: int,
+    M_per_group: int,
+) -> Callable[[], torch.Tensor]:
+    """Factory for torch.compile'd batched matmul baseline."""
+    def run() -> torch.Tensor:
+        return _torch_bmm_bias_scale(x, w, bias, token_weights, G, M_per_group)
+    return run
+def triton_gemm_torch_bias_scale_bench(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    bias: torch.Tensor,
+    token_weights: torch.Tensor,
+    G: int,
+    M_per_group: int,
+) -> Callable[[], torch.Tensor]:
+    """Factory for Triton grouped_gemm + torch bias + torch token_weights."""
+    def run() -> torch.Tensor:
+        out = grouped_gemm(x, w, m_sizes)
+        out_3d = out.view(G, M_per_group, -1)
+        out_3d = out_3d + bias.unsqueeze(1)
+        out_3d = out_3d * token_weights.view(G, M_per_group, 1)
+        return out_3d.view(-1, out.shape[-1])
+    return run
+@click.command()
+@click.option("--warmup", type=int, default=25, help="Warmup iterations")
+@click.option("--rep", type=int, default=25, help="Benchmark repetitions")
+def bench(warmup: int, rep: int) -> None:
+    """Benchmark grouped_gemm_bias_scale vs torch baseline."""
+    device = torch.accelerator.current_accelerator()
+    dtype = torch.bfloat16
+    # G: Number of experts/groups in the MoE layer
+    # M: Total number of tokens across all groups
+    # N: Output dimension (hidden size of expert output)
+    # K: Input dimension (hidden size of expert input)
+    configs = [
+        {"G": 4, "M": 512, "N": 256, "K": 256, "name": "Small"},
+        {"G": 16, "M": 4096, "N": 512, "K": 512, "name": "Medium"},
+        {"G": 64, "M": 16384, "N": 512, "K": 512, "name": "Large"},
+    ]
+    # Print configuration table
+    config_df = pd.DataFrame(configs).rename(
+        columns={
+            "name": "Config",
+            "G": "G (experts)",
+            "M": "M (tokens)",
+            "N": "N (out_dim)",
+            "K": "K (in_dim)",
+        }
+    )[["Config", "G (experts)", "M (tokens)", "N (out_dim)", "K (in_dim)"]]
+    print("\nBenchmark Configurations:")
+    print(config_df.to_string(index=False))
+    print()
+    results: List[Dict[str, str]] = []
+    for idx, cfg in enumerate(configs):
+        G: int = cfg["G"]  # pyre-ignore[9]
+        M: int = cfg["M"]  # pyre-ignore[9]
+        N: int = cfg["N"]  # pyre-ignore[9]
+        K: int = cfg["K"]  # pyre-ignore[9]
+        name: str = cfg["name"]  # pyre-ignore[9]
+        M_per_group = M // G
+        print(f"Processing config {idx + 1}/{len(configs)}: {name}...")
+        # Create tensors
+        x = torch.randn(M, K, dtype=dtype, device=device)
+        w = torch.randn(G * N, K, dtype=dtype, device=device)
+        bias = torch.randn(G, N, dtype=dtype, device=device)
+        token_weights = torch.rand(M, dtype=dtype, device=device) + 0.5
+        m_sizes = torch.full((G,), M_per_group, dtype=torch.int32, device=device)
+        # Create benchmark functions
+        triton_fn = triton_fused_bench(x, w, m_sizes, bias, token_weights)
+        triton_torch_fn = triton_gemm_torch_bias_scale_bench(
+            x, w, m_sizes, bias, token_weights, G, M_per_group
+        )
+        torch_fn = torch_baseline_bench(x, w, bias, token_weights, G, M_per_group)
+        # Warmup torch.compile
+        for _ in range(3):
+            torch_fn()
+        torch.cuda.synchronize()
+        # Benchmark
+        fused_ms = triton.testing.do_bench(triton_fn, warmup=warmup, rep=rep)
+        triton_torch_ms = triton.testing.do_bench(
+            triton_torch_fn, warmup=warmup, rep=rep
+        )
+        torch_ms = triton.testing.do_bench(torch_fn, warmup=warmup, rep=rep)
+        results.append(
+            {
+                "Config": name,
+                "fused (ms)": f"{fused_ms:.3f}",
+                "triton+torch (ms)": f"{triton_torch_ms:.3f}",
+                "torch (ms)": f"{torch_ms:.3f}",
+                "Speedup vs torch": f"{torch_ms / fused_ms:.2f}x",
+                "Speedup vs triton+torch": f"{triton_torch_ms / fused_ms:.2f}x",
+            }
+        )
+    print("\nBenchmark Results:")
+    print(pd.DataFrame(results).to_string(index=False))
+    print()
+# Script entry point: invoke the click command when run directly.
+if __name__ == "__main__":
+    bench()

mslk/bench/moe/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict

mslk/bench/moe/gather_scatter_bench.py ADDED Viewed

@@ -0,0 +1,356 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import functools
+import itertools
+from typing import Optional
+import click
+import torch
+import triton  # noqa: F401
+from mslk.moe import (
+    combine_shuffling,
+    gather_scale_dense_tokens,
+    gather_scale_quant_dense_tokens,
+    scatter_add_dense_tokens,
+    split_shuffling,
+)
+from triton.testing import do_bench, do_bench_cudagraph
+# Handles to the custom ops. Each one stays None unless it is resolved below,
+# so a None handle means that op was not looked up in this environment.
+index_shuffling = None
+gather_along_first_dim = None
+scatter_add_along_first_dim = None
+if torch.cuda.is_available():
+    index_shuffling = torch.ops.mslk.index_shuffling  # noqa F401
+    # The gather/scatter ops are only looked up when torch.version.hip is unset.
+    if not torch.version.hip:
+        # SM90 support
+        gather_along_first_dim = torch.ops.mslk.gather_along_first_dim  # noqa F401
+        scatter_add_along_first_dim = torch.ops.mslk.scatter_add_along_first_dim  # noqa F401
+# Passed as `device=` for every tensor the benchmarks in this module allocate.
+_ACCELERATOR_TAG = torch.accelerator.current_accelerator()
+def bench_gather_along_first_dim(M: int, N: int, K: int) -> None:
+    src = torch.randn([M, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    if M == N:
+        indices = torch.randperm(N, device=_ACCELERATOR_TAG, dtype=torch.int32)
+    else:
+        indices = torch.randint(0, M, [N], device=_ACCELERATOR_TAG, dtype=torch.int32)
+    def fn():
+        return torch.ops.mslk.gather_along_first_dim(src, indices)
+    def ref_fn():
+        return torch.index_select(src, 0, indices)
+    # Load src, store dst. x2.
+    data_size_in_gigabytes = N * K * 2 * 2 / 1e9
+    time_in_us = triton.testing.do_bench(fn) * 1e3
+    time_in_second = time_in_us / 1e6
+    gigabytes_per_second = data_size_in_gigabytes / time_in_second
+    ref_time_in_us = triton.testing.do_bench(ref_fn) * 1e3
+    ref_time_in_second = ref_time_in_us / 1e6
+    ref_gigabytes_per_second = data_size_in_gigabytes / ref_time_in_second
+    print(
+        f"Benchmark gather_along_first_dim: {M=:5d}, {N=:5d}, {K=:5d}, "
+        f"MSLK time: {time_in_us:10.3f} us. Bandwidth: {gigabytes_per_second:10.3f} GB/s, "
+        f"Torch time: {ref_time_in_us:10.3f} us. Bandwidth: {ref_gigabytes_per_second:10.3f} GB/s"
+    )
+def bench_scatter_add_along_first_dim_(op, M: int, N: int, K: int) -> None:
+    src = torch.randn([M, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    dst = torch.randn([N, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    if M == N:
+        indices_1d = torch.randperm(N, device=_ACCELERATOR_TAG, dtype=torch.int64)
+    else:
+        indices_1d = torch.randint(
+            0, N, [M], device=_ACCELERATOR_TAG, dtype=torch.int64
+        )
+    indices_2d = indices_1d.to(torch.int64).unsqueeze(1).expand(-1, K)
+    test_dst = dst.clone()
+    ref_dst = dst.clone()
+    def fn():
+        op(test_dst, src, indices_1d)
+    def ref_fn():
+        ref_dst.scatter_add_(0, indices_2d, src)
+    # Load src, load dst, store dst. x3.
+    data_size_in_gigabytes = N * K * 2 * 3 / 1e9
+    time_in_us = triton.testing.do_bench(fn) * 1e3
+    time_in_second = time_in_us / 1e6
+    gigabytes_per_second = data_size_in_gigabytes / time_in_second
+    ref_time_in_us = triton.testing.do_bench(ref_fn) * 1e3
+    ref_time_in_second = ref_time_in_us / 1e6
+    ref_gigabytes_per_second = data_size_in_gigabytes / ref_time_in_second
+    print(
+        f"Benchmark {op.__name__}: {M=:5d}, {N=:5d}, {K=:5d}, "
+        f"MSLK time: {time_in_us:10.3f} us. Bandwidth: {gigabytes_per_second:10.3f} GB/s, "
+        f"Torch time: {ref_time_in_us:10.3f} us. Bandwidth: {ref_gigabytes_per_second:10.3f} GB/s"
+    )
+# Pre-bind the op under test so both benchmarks take just (M, N, K).
+# NOTE: scatter_add_along_first_dim may still be None at this point; the
+# resulting partial must only be called after checking the op is available.
+bench_scatter_add_along_first_dim = functools.partial(
+    bench_scatter_add_along_first_dim_, scatter_add_along_first_dim
+)
+bench_scatter_add_dense_tokens = functools.partial(
+    bench_scatter_add_along_first_dim_, scatter_add_dense_tokens
+)
+def bench_gather_scale_dense_tokens(E: int, T: int, D: int, quantize: bool):
+    x = torch.randn((T, D), dtype=torch.bfloat16, device=_ACCELERATOR_TAG).abs()
+    expert_indices = torch.randint(0, E, (T,), device=_ACCELERATOR_TAG)
+    token_indices = torch.randperm(T, device=_ACCELERATOR_TAG)
+    scores = torch.rand((E, T), dtype=torch.bfloat16, device=_ACCELERATOR_TAG)
+    def torch_fn():
+        shuffled_x = torch.index_select(x, dim=0, index=token_indices)
+        shuffled_scores = torch.index_select(scores, dim=1, index=token_indices)
+        shuffled_selected_scores = torch.gather(
+            shuffled_scores, dim=0, index=expert_indices.view(1, T)
+        )
+        ref_output = shuffled_x * shuffled_selected_scores.view(-1, 1)
+        return ref_output
+    torch_fn()
+    scores_TE = scores.transpose(0, 1).contiguous()
+    mslk_fn = gather_scale_quant_dense_tokens if quantize else gather_scale_dense_tokens
+    def triton_fn():
+        test_output = mslk_fn(x, token_indices, expert_indices, scores_TE)
+        return test_output
+    triton_fn()
+    # Run benchmark
+    if quantize:
+        data_size_in_gigabytes = T * D * 3 / 1e9
+    else:
+        data_size_in_gigabytes = T * D * 4 / 1e9
+    mslk_time = do_bench(triton_fn, rep=1000) * 1e3
+    mslk_bw = data_size_in_gigabytes / (mslk_time / 1e6)
+    torch_time = do_bench(torch_fn, rep=1000) * 1e3
+    torch_bw = data_size_in_gigabytes / (torch_time / 1e6)
+    print(
+        f"Benchmark gather_scale_dense_tokens({quantize=}), {E=:3d}, {T=:5d}, {D=:5d}, "
+        f"MSLK time: {mslk_time:10.3f} us. Bandwidth: {mslk_bw:10.3f} GB/s, "
+        f"Torch time: {torch_time:10.3f} us. Bandwidth: {torch_bw:10.3f} GB/s"
+    )
+def bench_topk_index_shuffling(T: int, E: int, K: int) -> None:
+    torch.manual_seed(0)
+    num_rotating_buffers = min(max(2, triton.cdiv(1024 * 1024 * 1024, T * E * 2)), 1000)
+    scores_list: list[torch.Tensor] = [
+        torch.randn(T, E, device=_ACCELERATOR_TAG, dtype=torch.bfloat16)
+        for i in range(num_rotating_buffers)
+    ]
+    def fn() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        for scores in scores_list:
+            index_shuffling(scores, top_k=K)
+    def ref_fn() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        for scores in scores_list:
+            _, selected_expert_indices = torch.topk(scores, K, dim=1)
+            expert_indices, _ = torch.sort(
+                selected_expert_indices.flatten(), dim=0, stable=True
+            )
+            _ = (
+                expert_indices[:, None]
+                == torch.arange(E, device=expert_indices.device)[None, :]
+            ).sum(dim=0)
+    mslk_time = do_bench_cudagraph(fn) * 1e3 / num_rotating_buffers
+    torch_time = do_bench_cudagraph(ref_fn) * 1e3 / num_rotating_buffers
+    print(
+        f"Benchmark index_shuffling, num_tokens={T:4}, num_experts={E:4}, top_k={K:4}, "
+        f"mslk_time={mslk_time:7.3f}us, torch_time={torch_time:7.3f}us"
+    )
+def bench_combine_or_split_shuffling(
+    T: int,
+    D: int,
+    E: int,
+    EP: bool,
+    is_padded: bool,
+    is_balanced: bool,
+    is_combine_shuffling: bool,
+):
+    torch.manual_seed(0)
+    assert E % EP == 0
+    if is_padded:
+        # graph. allgather
+        input_num_tokens: int = EP * T
+        input_num_experts: int = E
+        output_num_experts: int = E // EP
+        start_expert_index: int = 1
+        end_expert_index: int = 1 + output_num_experts
+    else:
+        # eager. all2all
+        input_num_tokens: int = T
+        input_num_experts: int = E // EP
+        output_num_experts: int = E // EP
+        start_expert_index: int = 0
+        end_expert_index: int = output_num_experts
+    tokens = torch.randn(
+        input_num_tokens, D, device=_ACCELERATOR_TAG, dtype=torch.bfloat16
+    )
+    if input_num_tokens < (EP * input_num_experts) != 0:
+        return
+    input_num_tokens_per_expert: int = input_num_tokens // (EP * input_num_experts)
+    token_counts: torch.Tensor = (
+        torch.ones(
+            [EP, input_num_experts],
+            dtype=torch.int32,
+            device=_ACCELERATOR_TAG,
+        )
+        * input_num_tokens_per_expert
+    )
+    if not is_balanced:
+        for i in range(EP):
+            token_counts[i, start_expert_index] -= input_num_tokens_per_expert
+            token_counts[i, end_expert_index - 1] += input_num_tokens_per_expert
+    assert token_counts.sum().item() == input_num_tokens
+    num_rotating_buffers = triton.cdiv(1024 * 1024 * 1024, tokens.numel() * 2)
+    token_list: list[torch.Tensor] = [
+        tokens.clone() for _ in range(num_rotating_buffers)
+    ]
+    token_count_list: list[torch.Tensor] = [
+        token_counts.clone() for _ in range(num_rotating_buffers)
+    ]
+    def fn() -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        for tokens, token_counts in zip(token_list, token_count_list):
+            if is_combine_shuffling:
+                combine_shuffling(
+                    tokens,
+                    token_counts,
+                    expert_start=start_expert_index,
+                    expert_end=end_expert_index,
+                    is_balanced=is_balanced,
+                )
+            else:
+                split_shuffling(
+                    tokens,
+                    token_counts,
+                    expert_start=start_expert_index,
+                    expert_end=end_expert_index,
+                    is_balanced=is_balanced,
+                )
+    fn()
+    output_num_tokens = 0
+    for per_rank_counts in token_counts.tolist():
+        for expert_index, per_expert_counts in enumerate(per_rank_counts):
+            if expert_index >= start_expert_index and expert_index < end_expert_index:
+                output_num_tokens += per_expert_counts
+    mem_bytes = output_num_tokens * D * 2 * 2
+    mslk_time = do_bench_cudagraph(fn) * 1e3 / num_rotating_buffers
+    mslk_bw = mem_bytes * 1e-9 / (mslk_time * 1e-6)
+    print(
+        f"Benchmark {'combine_shuffling' if is_combine_shuffling else 'split_shuffling'}, "
+        f"num_tokens={T:4}, dim={D:4}, num_experts={E:4}, expert_parallelism={EP:4}, output_num_tokens={output_num_tokens:4}, "
+        f"{is_balanced=}, {is_padded=}, "
+        f"mslk_time={mslk_time:7.3f}us, mslk_bw={mslk_bw:8.3f}GBytes/s."
+    )
+@click.command()
+@click.option(
+    "--kernels",
+    default=None,
+    help="Comma separated list of kernels to benchmark. Defaults to all kernels.",
+)
+def main(kernels: Optional[str]):
+    if kernels is not None:
+        kernels = kernels.split(",")
+    def should_bench_kernel(fn):
+        return (fn is not None) and (kernels is None or fn.__name__ in kernels)
+    Es = [16, 128]
+    Ts = [1, 128, 2048, 4096, 8192, 16384]
+    Ds = [5120]
+    # Gather/Scatter
+    if should_bench_kernel(gather_scale_dense_tokens):
+        for E, T, D in itertools.product(Es, Ts, Ds):
+            bench_gather_scale_dense_tokens(E, T, D, quantize=False)
+    if should_bench_kernel(gather_scale_quant_dense_tokens):
+        for E, T, D in itertools.product(Es, Ts, Ds):
+            bench_gather_scale_dense_tokens(E, T, D, quantize=True)
+    if should_bench_kernel(gather_along_first_dim):
+        for T, D in itertools.product(Ts, Ds):
+            bench_gather_along_first_dim(T, T, D)
+    if should_bench_kernel(scatter_add_along_first_dim):
+        for T, D in itertools.product(Ts, Ds):
+            bench_scatter_add_along_first_dim(T, T, D)
+    if should_bench_kernel(scatter_add_dense_tokens):
+        for T, D in itertools.product(Ts, Ds):
+            bench_scatter_add_dense_tokens(T, T, D)
+    Ks = [1, 2, 4]
+    Es = [16, 32, 128, 320]
+    # Shuffling
+    if should_bench_kernel(index_shuffling):
+        for T, E, K in itertools.product(Ts, Es, Ks):
+            bench_topk_index_shuffling(T, E, K)
+    EPs = [2, 16]
+    Ts = [32, 128, 2048, 4096, 8192, 16384]
+    padded = [True, False]
+    balanced = [True, False]
+    if should_bench_kernel(combine_shuffling):
+        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
+            bench_combine_or_split_shuffling(
+                T, D, E, EP, p, b, is_combine_shuffling=True
+            )
+    if should_bench_kernel(split_shuffling):
+        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
+            bench_combine_or_split_shuffling(
+                T, D, E, EP, p, b, is_combine_shuffling=False
+            )
+# Script entry point: invoke the click command when run directly.
+if __name__ == "__main__":
+    main()