PyPI - fbgemm-gpu-genai-nightly - Versions diffs - 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fbgemm-gpu-genai-nightly might be problematic. Click here for more details.

Files changed (127) hide show

fbgemm_gpu/__init__.py +186 -0
fbgemm_gpu/asmjit.so +0 -0
fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
fbgemm_gpu/config/__init__.py +9 -0
fbgemm_gpu/config/feature_list.py +88 -0
fbgemm_gpu/docs/__init__.py +18 -0
fbgemm_gpu/docs/common.py +9 -0
fbgemm_gpu/docs/examples.py +73 -0
fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
fbgemm_gpu/docs/quantize_ops.py +41 -0
fbgemm_gpu/docs/sparse_ops.py +616 -0
fbgemm_gpu/docs/target.genai.json.py +6 -0
fbgemm_gpu/enums.py +24 -0
fbgemm_gpu/experimental/example/__init__.py +29 -0
fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
fbgemm_gpu/experimental/example/utils.py +20 -0
fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
fbgemm_gpu/fbgemm.so +0 -0
fbgemm_gpu/metrics.py +160 -0
fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
fbgemm_gpu/quantize/__init__.py +43 -0
fbgemm_gpu/quantize/quantize_ops.py +64 -0
fbgemm_gpu/quantize_comm.py +315 -0
fbgemm_gpu/quantize_utils.py +246 -0
fbgemm_gpu/runtime_monitor.py +237 -0
fbgemm_gpu/sll/__init__.py +189 -0
fbgemm_gpu/sll/cpu/__init__.py +80 -0
fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
fbgemm_gpu/sll/meta/__init__.py +35 -0
fbgemm_gpu/sll/meta/meta_sll.py +337 -0
fbgemm_gpu/sll/triton/__init__.py +127 -0
fbgemm_gpu/sll/triton/common.py +38 -0
fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
fbgemm_gpu/sparse_ops.py +1455 -0
fbgemm_gpu/split_embedding_configs.py +452 -0
fbgemm_gpu/split_embedding_inference_converter.py +175 -0
fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
fbgemm_gpu/split_embedding_utils.py +29 -0
fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
fbgemm_gpu/tbe/__init__.py +6 -0
fbgemm_gpu/tbe/bench/__init__.py +55 -0
fbgemm_gpu/tbe/bench/bench_config.py +156 -0
fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
fbgemm_gpu/tbe/bench/reporter.py +35 -0
fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
fbgemm_gpu/tbe/bench/utils.py +48 -0
fbgemm_gpu/tbe/cache/__init__.py +11 -0
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
fbgemm_gpu/tbe/ssd/__init__.py +15 -0
fbgemm_gpu/tbe/ssd/common.py +46 -0
fbgemm_gpu/tbe/ssd/inference.py +586 -0
fbgemm_gpu/tbe/ssd/training.py +4908 -0
fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
fbgemm_gpu/tbe/stats/__init__.py +10 -0
fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
fbgemm_gpu/tbe/utils/__init__.py +13 -0
fbgemm_gpu/tbe/utils/common.py +42 -0
fbgemm_gpu/tbe/utils/offsets.py +65 -0
fbgemm_gpu/tbe/utils/quantize.py +251 -0
fbgemm_gpu/tbe/utils/requests.py +556 -0
fbgemm_gpu/tbe_input_multiplexer.py +108 -0
fbgemm_gpu/triton/__init__.py +22 -0
fbgemm_gpu/triton/common.py +77 -0
fbgemm_gpu/triton/jagged/__init__.py +8 -0
fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
fbgemm_gpu/triton/quantize.py +647 -0
fbgemm_gpu/triton/quantize_ref.py +286 -0
fbgemm_gpu/utils/__init__.py +11 -0
fbgemm_gpu/utils/filestore.py +211 -0
fbgemm_gpu/utils/loader.py +36 -0
fbgemm_gpu/utils/torch_library.py +132 -0
fbgemm_gpu/uvm.py +40 -0
fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
list_versions/__init__.py +12 -0
list_versions/cli_run.py +163 -0

fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py ADDED Viewed

@@ -0,0 +1,257 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import os
+import tempfile
+import uuid
+from functools import lru_cache
+from pprint import pprint
+import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
+from torch.distributed.launcher.api import elastic_launch, LaunchConfig
+@lru_cache(None)
+def get_symm_buffer(group):
+    inp = symm_mem.empty(
+        16 * 1024 * 1024, device="cuda", dtype=torch.bfloat16
+    )  # .normal_()
+    symm_mem.rendezvous(inp, group=group)
+    return inp, group.group_name
+def _setup(path: str) -> tuple[int, int]:
+    rank = int(os.environ["LOCAL_RANK"])
+    W = int(os.environ["WORLD_SIZE"])
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+    torch.ops.fbgemm.nccl_init(rank, W, os.path.join(path, "rdvz"))
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=f"file://{os.path.join(path, 'gloo_rdvz')}",
+        world_size=W,
+        rank=rank,
+    )
+    buffer = torch.ops.fbgemm.car_tensor()
+    barrier = torch.ops.fbgemm.car_tensor()
+    barrier.zero_()
+    buffer_handle = torch.ops.fbgemm.car_ipc_handle(buffer)
+    all_buffer_handles = [torch.empty_like(buffer_handle) for _ in range(W)]
+    torch.distributed.all_gather(all_buffer_handles, buffer_handle)
+    barrier_handle = torch.ops.fbgemm.car_ipc_handle(barrier)
+    all_barrier_handles = [torch.empty_like(barrier_handle) for _ in range(W)]
+    torch.distributed.all_gather(all_barrier_handles, barrier_handle)
+    torch.ops.fbgemm.car_init(
+        rank, W, barrier, all_barrier_handles, buffer, all_buffer_handles
+    )
+    torch.cuda.synchronize()
+    torch.distributed.barrier()
+    group = dist.group.WORLD
+    _ = get_symm_buffer(group)
+    return rank, W
+def symm_one_shot_allreduce(dst_tensor, src_tensor, bias=None, comm_idx=None):
+    # get_symm_buffer should be called for the first time during model init,
+    # and now return cached values. Make sure group is the same as during init
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    torch.ops.symm_mem.one_shot_all_reduce_copy_out(
+        symm_buffer, src_tensor, "sum", group_name, dst_tensor
+    )
+    if bias is not None:
+        dst_tensor.add_(bias)
+def symm_two_shot_allreduce(dst_tensor, src_tensor, bias=None, comm_idx=None):
+    # get_symm_buffer should be called for the first time during model init,
+    # and now return cached values. Make sure group is the same as during init
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    # car is also doing explicit copy
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    symm_buffer.copy_(src_tensor)
+    torch.ops.symm_mem.two_shot_all_reduce_out(
+        symm_buffer, "sum", group_name, dst_tensor
+    )
+    if bias is not None:
+        dst_tensor.add_(bias)
+def symm_reduce_scatter(dst_tensor, src_tensor, comm_idx=None):
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    symm_buffer.copy_(src_tensor)
+    torch.ops.symm_mem.reduce_scatter_out(symm_buffer, group_name, False, dst_tensor)
+def run_one_algo(fn, out, inp, num_iters, num_warmup_iters):
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    for _ in range(num_warmup_iters):
+        fn(out, inp)
+    start_event.record()
+    for _ in range(num_iters):
+        fn(out, inp)
+    end_event.record()
+    torch.cuda.synchronize()
+    time = start_event.elapsed_time(end_event) / num_iters
+    return time
+def run_benchmark(args, path):
+    rank, W = _setup(path)
+    if rank == 0:
+        print(f"Running benchmark with {W} ranks")
+    # benchmark_results = defaultdict(defaultdict)
+    benchmark_results = []
+    # with torch.profiler.profile() as p:
+    for N in torch.logspace(
+        args.min_size, args.max_size, steps=args.size_steps, base=2
+    ).tolist():
+        def round_up(a: int, b: int) -> int:
+            return ((a + b - 1) // b) * b
+        N_even_divisor = 8 * 64 if torch.version.hip else 8 * 32
+        N = round_up(int(N), N_even_divisor)
+        inp = torch.rand(N, dtype=torch.bfloat16, device="cuda")
+        results = {"N": N}
+        if args.op == "allreduce":
+            out = torch.full_like(inp, -1)
+            fns = (
+                torch.ops.fbgemm.one_shot_car_allreduce,
+                symm_one_shot_allreduce,
+                torch.ops.fbgemm.two_shot_car_allreduce,
+                symm_two_shot_allreduce,
+                torch.ops.fbgemm.nccl_allreduce,
+            )
+            labels = (
+                "fbgemm_1shot",
+                "symm_1shot",
+                "fbgemm_2shot",
+                "symm_2shot",
+                "nccl",
+            )
+            for fn, label in zip(fns, labels):
+                time = run_one_algo(
+                    fn,
+                    out,
+                    inp,
+                    args.num_iters,
+                    args.num_warmup_iters,
+                )
+                results[f"{label}_time"] = time
+                results[f"{label}_bwidth"] = (
+                    N * inp.element_size() / (time * 1e-3) / 1e9
+                )
+        else:
+            out = torch.full(
+                (inp.shape[0] // W,), -1, dtype=inp.dtype, device=inp.device
+            )
+            fns = (
+                torch.ops.fbgemm.car_reducescatter,
+                symm_reduce_scatter,
+                torch.ops.fbgemm.nccl_reducescatter,
+            )
+            labels = ("fbgemm_rs", "symm_rs", "nccl_rs")
+            for fn, label in zip(fns, labels):
+                time = run_one_algo(
+                    fn,
+                    out,
+                    inp,
+                    args.num_iters,
+                    args.num_warmup_iters,
+                )
+                results[f"{label}_time"] = time
+                results[f"{label}_bwidth"] = (
+                    N * inp.element_size() / (time * 1e-3) / 1e9
+                )
+        benchmark_results.append(results)
+    if rank == 0:
+        pprint(benchmark_results)
+        if args.export_csv:
+            csv_file = os.path.join(args.output_dir, "comm_ops_benchmark.csv")
+            # Export results to a CSV file.
+            df = pd.DataFrame(benchmark_results)
+            df.to_csv(csv_file, index=False)
+def main(args, path):
+    if args.export_csv:
+        os.makedirs(args.output_dir, exist_ok=True)
+        print("csv and images will be saved to " + args.output_dir)
+    lc = LaunchConfig(
+        min_nodes=1,
+        max_nodes=1,
+        nproc_per_node=args.num_ranks,
+        run_id=str(uuid.uuid4()),
+        rdzv_backend="c10d",
+        rdzv_endpoint="localhost:0",
+        max_restarts=0,
+        monitor_interval=1,
+    )
+    elastic_launch(lc, entrypoint=run_benchmark)(args, path)
+def invoke_main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir", default="/tmp", help="Directory to save plots and csvs to"
+    )
+    parser.add_argument(
+        "--export_csv",
+        action="store_true",
+        help="Export results to a CSV file.",
+    )
+    parser.add_argument("--num_ranks", type=int, default=8)
+    parser.add_argument("--num_iters", type=int, default=20)
+    parser.add_argument("--num_warmup_iters", type=int, default=10)
+    parser.add_argument(
+        "--min_size",
+        type=int,
+        default=10,
+        help="minimum size will be set to 2**min_size",
+    )
+    parser.add_argument(
+        "--max_size",
+        type=int,
+        default=24,
+        help="maximum size will be set to 2**max_size",
+    )
+    parser.add_argument(
+        "--size_steps", type=int, default=20, help="number of size steps to run"
+    )
+    parser.add_argument(
+        "--op",
+        type=str,
+        default="allreduce",
+        choices=["allreduce", "reduce_scatter"],
+        help="op to benchmark, allreduce or reduce_scatter",
+    )
+    args = parser.parse_args()
+    with tempfile.TemporaryDirectory() as path:
+        main(args, path)
+if __name__ == "__main__":
+    invoke_main()

fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py ADDED Viewed

@@ -0,0 +1,348 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import functools
+import itertools
+from typing import Optional
+import click
+import torch
+import triton  # noqa: F401
+from fbgemm_gpu.experimental.gen_ai.moe import (
+    combine_shuffling,
+    gather_along_first_dim,
+    gather_scale_dense_tokens,
+    gather_scale_quant_dense_tokens,
+    index_shuffling,
+    scatter_add_along_first_dim,
+    scatter_add_dense_tokens,
+    split_shuffling,
+)
+from triton.testing import do_bench, do_bench_cudagraph
+_ACCELERATOR_TAG = torch.accelerator.current_accelerator()  # resolved once at import time; passed as device= to the benchmark tensor constructors below
+def bench_gather_along_first_dim(M: int, N: int, K: int) -> None:
+    src = torch.randn([M, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    if M == N:
+        indices = torch.randperm(N, device=_ACCELERATOR_TAG, dtype=torch.int32)
+    else:
+        indices = torch.randint(0, M, [N], device=_ACCELERATOR_TAG, dtype=torch.int32)
+    def fn():
+        return gather_along_first_dim(src, indices)
+    def ref_fn():
+        return torch.index_select(src, 0, indices)
+    # Load src, store dst. x2.
+    data_size_in_gigabytes = N * K * 2 * 2 / 1e9
+    time_in_us = triton.testing.do_bench(fn) * 1e3
+    time_in_second = time_in_us / 1e6
+    gigabytes_per_second = data_size_in_gigabytes / time_in_second
+    ref_time_in_us = triton.testing.do_bench(ref_fn) * 1e3
+    ref_time_in_second = ref_time_in_us / 1e6
+    ref_gigabytes_per_second = data_size_in_gigabytes / ref_time_in_second
+    print(
+        f"Benchmark gather_along_first_dim: {M=:5d}, {N=:5d}, {K=:5d}, "
+        f"FBGEMM time: {time_in_us:10.3f} us. Bandwidth: {gigabytes_per_second:10.3f} GB/s, "
+        f"Torch time: {ref_time_in_us:10.3f} us. Bandwidth: {ref_gigabytes_per_second:10.3f} GB/s"
+    )
+def bench_scatter_add_along_first_dim_(op, M: int, N: int, K: int) -> None:
+    src = torch.randn([M, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    dst = torch.randn([N, K], device=_ACCELERATOR_TAG, dtype=torch.bfloat16).abs()
+    if M == N:
+        indices_1d = torch.randperm(N, device=_ACCELERATOR_TAG, dtype=torch.int64)
+    else:
+        indices_1d = torch.randint(
+            0, N, [M], device=_ACCELERATOR_TAG, dtype=torch.int64
+        )
+    indices_2d = indices_1d.to(torch.int64).unsqueeze(1).expand(-1, K)
+    test_dst = dst.clone()
+    ref_dst = dst.clone()
+    def fn():
+        op(test_dst, src, indices_1d)
+    def ref_fn():
+        ref_dst.scatter_add_(0, indices_2d, src)
+    # Load src, load dst, store dst. x3.
+    data_size_in_gigabytes = N * K * 2 * 3 / 1e9
+    time_in_us = triton.testing.do_bench(fn) * 1e3
+    time_in_second = time_in_us / 1e6
+    gigabytes_per_second = data_size_in_gigabytes / time_in_second
+    ref_time_in_us = triton.testing.do_bench(ref_fn) * 1e3
+    ref_time_in_second = ref_time_in_us / 1e6
+    ref_gigabytes_per_second = data_size_in_gigabytes / ref_time_in_second
+    print(
+        f"Benchmark {op.__name__}: {M=:5d}, {N=:5d}, {K=:5d}, "
+        f"FBGEMM time: {time_in_us:10.3f} us. Bandwidth: {gigabytes_per_second:10.3f} GB/s, "
+        f"Torch time: {ref_time_in_us:10.3f} us. Bandwidth: {ref_gigabytes_per_second:10.3f} GB/s"
+    )
+bench_scatter_add_along_first_dim = functools.partial(  # shared scatter-add harness bound to scatter_add_along_first_dim
+    bench_scatter_add_along_first_dim_, scatter_add_along_first_dim
+)
+bench_scatter_add_dense_tokens = functools.partial(  # same harness bound to scatter_add_dense_tokens
+    bench_scatter_add_along_first_dim_, scatter_add_dense_tokens
+)
+def bench_gather_scale_dense_tokens(E: int, T: int, D: int, quantize: bool):
+    x = torch.randn((T, D), dtype=torch.bfloat16, device=_ACCELERATOR_TAG).abs()
+    expert_indices = torch.randint(0, E, (T,), device=_ACCELERATOR_TAG)
+    token_indices = torch.randperm(T, device=_ACCELERATOR_TAG)
+    scores = torch.rand((E, T), dtype=torch.bfloat16, device=_ACCELERATOR_TAG)
+    def torch_fn():
+        shuffled_x = torch.index_select(x, dim=0, index=token_indices)
+        shuffled_scores = torch.index_select(scores, dim=1, index=token_indices)
+        shuffled_selected_scores = torch.gather(
+            shuffled_scores, dim=0, index=expert_indices.view(1, T)
+        )
+        ref_output = shuffled_x * shuffled_selected_scores.view(-1, 1)
+        return ref_output
+    torch_fn()
+    scores_TE = scores.transpose(0, 1).contiguous()
+    fbgemm_fn = (
+        gather_scale_quant_dense_tokens if quantize else gather_scale_dense_tokens
+    )
+    def triton_fn():
+        test_output = fbgemm_fn(x, token_indices, expert_indices, scores_TE)
+        return test_output
+    triton_fn()
+    # Run benchmark
+    if quantize:
+        data_size_in_gigabytes = T * D * 3 / 1e9
+    else:
+        data_size_in_gigabytes = T * D * 4 / 1e9
+    fbgemm_time = do_bench(triton_fn, rep=1000) * 1e3
+    fbgemm_bw = data_size_in_gigabytes / (fbgemm_time / 1e6)
+    torch_time = do_bench(torch_fn, rep=1000) * 1e3
+    torch_bw = data_size_in_gigabytes / (torch_time / 1e6)
+    print(
+        f"Benchmark gather_scale_dense_tokens({quantize=}), {E=:3d}, {T=:5d}, {D=:5d}, "
+        f"FBGEMM time: {fbgemm_time:10.3f} us. Bandwidth: {fbgemm_bw:10.3f} GB/s, "
+        f"Torch time: {torch_time:10.3f} us. Bandwidth: {torch_bw:10.3f} GB/s"
+    )
+def bench_topk_index_shuffling(T: int, E: int, K: int) -> None:
+    torch.manual_seed(0)
+    num_rotating_buffers = min(max(2, triton.cdiv(1024 * 1024 * 1024, T * E * 2)), 1000)
+    scores_list: list[torch.Tensor] = [
+        torch.randn(T, E, device=_ACCELERATOR_TAG, dtype=torch.bfloat16)
+        for i in range(num_rotating_buffers)
+    ]
+    def fn() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        for scores in scores_list:
+            index_shuffling(scores, top_k=K)
+    def ref_fn() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        for scores in scores_list:
+            _, selected_expert_indices = torch.topk(scores, K, dim=1)
+            expert_indices, _ = torch.sort(
+                selected_expert_indices.flatten(), dim=0, stable=True
+            )
+            _ = (
+                expert_indices[:, None]
+                == torch.arange(E, device=expert_indices.device)[None, :]
+            ).sum(dim=0)
+    fbgemm_time = do_bench_cudagraph(fn) * 1e3 / num_rotating_buffers
+    torch_time = do_bench_cudagraph(ref_fn) * 1e3 / num_rotating_buffers
+    print(
+        f"Benchmark index_shuffling, num_tokens={T:4}, num_experts={E:4}, top_k={K:4}, "
+        f"fbgemm_time={fbgemm_time:7.3f}us, torch_time={torch_time:7.3f}us"
+    )
+def bench_combine_or_split_shuffling(
+    T: int,
+    D: int,
+    E: int,
+    EP: bool,
+    is_padded: bool,
+    is_balanced: bool,
+    is_combine_shuffling: bool,
+):
+    torch.manual_seed(0)
+    assert E % EP == 0
+    if is_padded:
+        # graph. allgather
+        input_num_tokens: int = EP * T
+        input_num_experts: int = E
+        output_num_experts: int = E // EP
+        start_expert_index: int = 1
+        end_expert_index: int = 1 + output_num_experts
+    else:
+        # eager. all2all
+        input_num_tokens: int = T
+        input_num_experts: int = E // EP
+        output_num_experts: int = E // EP
+        start_expert_index: int = 0
+        end_expert_index: int = output_num_experts
+    tokens = torch.randn(
+        input_num_tokens, D, device=_ACCELERATOR_TAG, dtype=torch.bfloat16
+    )
+    if input_num_tokens < (EP * input_num_experts) != 0:
+        return
+    input_num_tokens_per_expert: int = input_num_tokens // (EP * input_num_experts)
+    token_counts: torch.Tensor = (
+        torch.ones(
+            [EP, input_num_experts],
+            dtype=torch.int32,
+            device=_ACCELERATOR_TAG,
+        )
+        * input_num_tokens_per_expert
+    )
+    if not is_balanced:
+        for i in range(EP):
+            token_counts[i, start_expert_index] -= input_num_tokens_per_expert
+            token_counts[i, end_expert_index - 1] += input_num_tokens_per_expert
+    assert token_counts.sum().item() == input_num_tokens
+    num_rotating_buffers = triton.cdiv(1024 * 1024 * 1024, tokens.numel() * 2)
+    token_list: list[torch.Tensor] = [
+        tokens.clone() for _ in range(num_rotating_buffers)
+    ]
+    token_count_list: list[torch.Tensor] = [
+        token_counts.clone() for _ in range(num_rotating_buffers)
+    ]
+    def fn() -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        for tokens, token_counts in zip(token_list, token_count_list):
+            if is_combine_shuffling:
+                combine_shuffling(
+                    tokens,
+                    token_counts,
+                    expert_start=start_expert_index,
+                    expert_end=end_expert_index,
+                    is_balanced=is_balanced,
+                )
+            else:
+                split_shuffling(
+                    tokens,
+                    token_counts,
+                    expert_start=start_expert_index,
+                    expert_end=end_expert_index,
+                    is_balanced=is_balanced,
+                )
+    fn()
+    output_num_tokens = 0
+    for per_rank_counts in token_counts.tolist():
+        for expert_index, per_expert_counts in enumerate(per_rank_counts):
+            if expert_index >= start_expert_index and expert_index < end_expert_index:
+                output_num_tokens += per_expert_counts
+    mem_bytes = output_num_tokens * D * 2 * 2
+    fbgemm_time = do_bench_cudagraph(fn) * 1e3 / num_rotating_buffers
+    fbgemm_bw = mem_bytes * 1e-9 / (fbgemm_time * 1e-6)
+    print(
+        f"Benchmark {'combine_shuffling' if is_combine_shuffling else 'split_shuffling'}, "
+        f"num_tokens={T:4}, dim={D:4}, num_experts={E:4}, expert_parallelism={EP:4}, output_num_tokens={output_num_tokens:4}, "
+        f"{is_balanced=}, {is_padded=}, "
+        f"fbgemm_time={fbgemm_time:7.3f}us, fbgemm_bw={fbgemm_bw:8.3f}GBytes/s."
+    )
+@click.command()
+@click.option(
+    "--kernels",
+    default=None,
+    help="Comma separated list of kernels to benchmark. Defaults to all kernels.",
+)
+def main(kernels: Optional[str]):
+    if kernels is not None:
+        kernels = kernels.split(",")
+    def should_bench_kernel(fn):
+        return (fn is not None) and (kernels is None or fn.__name__ in kernels)
+    Es = [16, 128]
+    Ts = [1, 128, 2048, 4096, 8192, 16384]
+    Ds = [5120]
+    # Gather/Scatter
+    if should_bench_kernel(gather_scale_dense_tokens):
+        for E, T, D in itertools.product(Es, Ts, Ds):
+            bench_gather_scale_dense_tokens(E, T, D, quantize=False)
+    if should_bench_kernel(gather_scale_quant_dense_tokens):
+        for E, T, D in itertools.product(Es, Ts, Ds):
+            bench_gather_scale_dense_tokens(E, T, D, quantize=True)
+    if should_bench_kernel(gather_along_first_dim):
+        for T, D in itertools.product(Ts, Ds):
+            bench_gather_along_first_dim(T, T, D)
+    if should_bench_kernel(scatter_add_along_first_dim):
+        for T, D in itertools.product(Ts, Ds):
+            bench_scatter_add_along_first_dim(T, T, D)
+    if should_bench_kernel(scatter_add_dense_tokens):
+        for T, D in itertools.product(Ts, Ds):
+            bench_scatter_add_dense_tokens(T, T, D)
+    Ks = [1, 2, 4]
+    Es = [16, 32, 128, 320]
+    # Shuffling
+    if should_bench_kernel(index_shuffling):
+        for T, E, K in itertools.product(Ts, Es, Ks):
+            bench_topk_index_shuffling(T, E, K)
+    EPs = [2, 16]
+    Ts = [32, 128, 2048, 4096, 8192, 16384]
+    padded = [True, False]
+    balanced = [True, False]
+    if should_bench_kernel(combine_shuffling):
+        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
+            bench_combine_or_split_shuffling(
+                T, D, E, EP, p, b, is_combine_shuffling=True
+            )
+    if should_bench_kernel(split_shuffling):
+        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
+            bench_combine_or_split_shuffling(
+                T, D, E, EP, p, b, is_combine_shuffling=False
+            )
+if __name__ == "__main__":
+    main()