fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.
Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py
@@ -0,0 +1,1192 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+
9
+ import functools
10
+ import inspect
11
+ import warnings
12
+
13
+ from typing import Optional
14
+
15
+ import torch
16
+
17
+ import triton
18
+ import triton.language as tl
19
+
20
+ from fbgemm_gpu.experimental.gemm.triton_gemm import utils
21
+ from triton.runtime import driver # @manual
22
+
23
+
24
+ _NV_CONFIGS = [
25
+ triton.Config(
26
+ {
27
+ "BLOCK_SIZE_M": block_size_m,
28
+ "BLOCK_SIZE_N": block_size_n,
29
+ "BLOCK_SIZE_K": block_size_k,
30
+ "NUM_CONSUMER_GROUPS": 1,
31
+ },
32
+ num_stages=num_stages,
33
+ num_warps=num_warps,
34
+ num_ctas=num_ctas,
35
+ )
36
+ for block_size_m in [64, 128]
37
+ for block_size_n in [64, 128, 256]
38
+ for block_size_k in [64, 128, 256]
39
+ for num_stages in [3, 4]
40
+ for num_warps in [4, 8]
41
+ for num_ctas in [1]
42
+ ]
43
+
44
+ _HAS_WS_SUPPORT = None
45
+
46
+
47
+ def _check_ws_support():
48
+ if not hasattr(tl, "async_task"):
49
+ return False
50
+ config_signature = inspect.signature(triton.Config).parameters
51
+ if (
52
+ "num_consumer_groups" not in config_signature
53
+ or "num_buffers_warp_spec" not in config_signature
54
+ ):
55
+ return False
56
+ if not utils.HAS_TMA_DESC:
57
+ return False
58
+ return True
59
+
60
+
61
+ def _set_ws_support():
62
+ global _HAS_WS_SUPPORT
63
+ if _HAS_WS_SUPPORT is None:
64
+ _HAS_WS_SUPPORT = _check_ws_support()
65
+
66
+
67
+ _set_ws_support()
68
+
69
+ if _HAS_WS_SUPPORT:
70
+ _NV_WS_CONFIGS = [
71
+ triton.Config(
72
+ {
73
+ "BLOCK_SIZE_M": block_size_m,
74
+ "BLOCK_SIZE_N": block_size_n,
75
+ "BLOCK_SIZE_K": block_size_k,
76
+ "NUM_CONSUMER_GROUPS": max(1, num_consumer_groups),
77
+ "USE_TMA_LOAD_ON_SCALES": use_tma_load_on_scales,
78
+ "USE_TMA_STORE": use_tma_store,
79
+ },
80
+ num_stages=num_stages,
81
+ num_warps=num_warps,
82
+ num_ctas=num_ctas,
83
+ num_consumer_groups=num_consumer_groups,
84
+ num_buffers_warp_spec=num_stages,
85
+ )
86
+ for block_size_m in [64, 128, 256]
87
+ for block_size_n in [64, 128, 256]
88
+ for block_size_k in [64, 128, 256]
89
+ for num_stages in [2, 3, 4]
90
+ for num_warps in [4, 8, 16]
91
+ # TODO(shikaili): Resolve LLVM error.
92
+ for num_ctas in [1]
93
+ for num_consumer_groups in [0, 2]
94
+ for use_tma_load_on_scales in [True, False]
95
+ # TODO(shikaili): Resolve compatibility with ws.
96
+ for use_tma_store in [False]
97
+ ]
98
+ else:
99
+ _NV_WS_CONFIGS = _NV_CONFIGS
100
+
101
+
102
+ _AMD_CONFIGS = [
103
+ triton.Config(
104
+ {
105
+ "BLOCK_SIZE_M": block_size_m,
106
+ "BLOCK_SIZE_N": block_size_n,
107
+ "BLOCK_SIZE_K": block_size_k,
108
+ "waves_per_eu": waves_per_cu,
109
+ "matrix_instr_nonkdim": matrix_instr_nonkdim,
110
+ "NUM_CONSUMER_GROUPS": 1,
111
+ },
112
+ num_stages=num_stages,
113
+ num_warps=num_warps,
114
+ )
115
+ for block_size_m in [32, 64, 128]
116
+ for block_size_n in [32, 64, 128, 256]
117
+ for block_size_k in [128, 256]
118
+ for num_stages in [1, 2]
119
+ for num_warps, waves_per_cu in [(4, 1), (8, 2), (16, 4)]
120
+ for matrix_instr_nonkdim in [16]
121
+ ]
122
+
123
+
124
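+ # Prune autotuning configs that exceed shared memory or are a poor fit for the
+ # problem shape (tile sizes vs. rows per group, tile count vs. SM count).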
+ def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
125
+ device = torch.cuda.current_device()
126
+ # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
127
+ if dtsize is None:
128
+ dtsize = named_args["c_ptr"].element_size()
129
+ if dtype is None:
130
+ dtype = named_args["c_ptr"].dtype
131
+
132
+ pruned_configs = []
133
+ for config in configs:
134
+ kw = config.kwargs
135
+ (
136
+ BLOCK_M,
137
+ BLOCK_N,
138
+ BLOCK_K,
139
+ num_stages,
140
+ use_tma_load_on_scales,
141
+ ) = (
142
+ kw["BLOCK_SIZE_M"],
143
+ kw["BLOCK_SIZE_N"],
144
+ kw["BLOCK_SIZE_K"],
145
+ config.num_stages,
146
+ kw.get("USE_TMA_LOAD_ON_SCALES", False),
147
+ )
148
+ G, M, N = (
149
+ named_args["G"],
150
+ named_args["M_BUCKET"],
151
+ named_args["N"],
152
+ )
153
+
154
+ # 1. make sure we have enough smem
155
+ max_shared_memory = driver.active.utils.get_device_properties(device)[
156
+ "max_shared_mem"
157
+ ]
158
+ if torch.version.hip:
159
+ required_shared_memory = BLOCK_N * BLOCK_K * num_stages * dtsize
160
+ else:
161
+ required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
162
+ if required_shared_memory > max_shared_memory:
163
+ continue
164
+
165
+ M_PER_GROUP = M // G
166
+ MIN_M_TILES = 32 if torch.version.hip else 64
167
+ # 2. make sure we don't load M tiles that are too big
168
+ if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
169
+ continue
170
+ # 3. make sure we don't load N tiles that are too small
171
+ if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
172
+ continue
173
+
174
+ num_sm = driver.active.utils.get_device_properties(device)[
175
+ "multiprocessor_count"
176
+ ]
177
+ N_TILES = (N + BLOCK_N - 1) // BLOCK_N
178
+ MIN_N_TILES = 32 if torch.version.hip else 64
179
+ # 4. make sure we don't load N tiles that are too big
180
+ if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
181
+ continue
182
+ # 5. make sure we don't load N tiles that are too small
183
+ if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
184
+ continue
185
+ if dtsize >= 2:
186
+ if use_tma_load_on_scales:
187
+ continue
188
+ pruned_configs.append(config)
189
+
190
+ return pruned_configs
191
+
192
+
193
+ def early_config_prune_ws(configs, named_args, dtsize=None, dtype=None, **kwargs):
194
+ device = torch.cuda.current_device()
195
+ # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
196
+ if dtsize is None:
197
+ dtsize = named_args["c_ptr"].element_size()
198
+ if dtype is None:
199
+ dtype = named_args["c_ptr"].dtype
200
+
201
+ pruned_configs = []
202
+ for config in configs:
203
+ kw = config.kwargs
204
+ (
205
+ BLOCK_M,
206
+ BLOCK_N,
207
+ BLOCK_K,
208
+ num_stages,
209
+ num_warps,
210
+ num_consumer_groups,
211
+ use_tma_load_on_scales,
212
+ ) = (
213
+ kw["BLOCK_SIZE_M"],
214
+ kw["BLOCK_SIZE_N"],
215
+ kw["BLOCK_SIZE_K"],
216
+ config.num_stages,
217
+ config.num_warps,
218
+ config.num_consumer_groups,
219
+ kw.get("USE_TMA_LOAD_ON_SCALES", False),
220
+ )
221
+ G, M, N = (
222
+ named_args["G"],
223
+ named_args["M_BUCKET"],
224
+ named_args["N"],
225
+ )
226
+
227
+ # 1. make sure we have enough smem
228
+ max_shared_memory = driver.active.utils.get_device_properties(device)[
229
+ "max_shared_mem"
230
+ ]
231
+ if torch.version.hip:
232
+ required_shared_memory = BLOCK_N * BLOCK_K * num_stages * dtsize
233
+ else:
234
+ required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
235
+ if required_shared_memory > max_shared_memory:
236
+ continue
237
+
238
+ use_warp_specialization = num_consumer_groups >= 1
239
+
240
+ M_PER_GROUP = M // G
241
+ MIN_M_TILES = 32 if torch.version.hip else 64
242
+ # 2. make sure we don't load M tiles that are too big
243
+ if (
244
+ not use_warp_specialization
245
+ and BLOCK_M > MIN_M_TILES
246
+ and BLOCK_M > (M_PER_GROUP * 2)
247
+ ):
248
+ continue
249
+ # 3. make sure we don't load N tiles that are too small
250
+ if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
251
+ continue
252
+
253
+ num_sm = driver.active.utils.get_device_properties(device)[
254
+ "multiprocessor_count"
255
+ ]
256
+ N_TILES = (N + BLOCK_N - 1) // BLOCK_N
257
+ MIN_N_TILES = 32 if torch.version.hip else 64
258
+ # 4. make sure we don't load N tiles that are too big
259
+ if (
260
+ not use_warp_specialization
261
+ and BLOCK_N > MIN_N_TILES
262
+ and M * N_TILES < num_sm
263
+ ):
264
+ continue
265
+ # 5. make sure we don't load N tiles that are too small
266
+ if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
267
+ continue
268
+
269
+ # 6. make sure we can partition for ws
270
+ if use_warp_specialization:
271
+ if num_warps != 4:
272
+ continue
273
+
274
+ # "tritongpu-warp-spec-data-partition"
275
+ m_slice = BLOCK_M // num_consumer_groups
276
+ n_slice = BLOCK_N // num_consumer_groups
277
+ if m_slice < 64 and n_slice < 256:
278
+ continue
279
+
280
+ if dtsize >= 2:
281
+ if use_tma_load_on_scales:
282
+ continue
283
+ pruned_configs.append(config)
284
+
285
+ return pruned_configs
286
+
287
+
288
+ @triton.autotune(
289
+ configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
290
+ key=["G", "M_BUCKET", "N", "K"],
291
+ prune_configs_by={"early_config_prune": early_config_prune},
292
+ restore_value=["c_ptr"], # restore for scatter_add fusion
293
+ )
294
+ @triton.jit
295
+ def _fbgemm_grouped_gemm(
296
+ a_desc_ptr,
297
+ b_desc_ptr,
298
+ c_ptr,
299
+ scatter_add_indices,
300
+ m_sizes,
301
+ # problem sizes
302
+ G: tl.constexpr,
303
+ M_BUCKET,
304
+ N: tl.constexpr,
305
+ K: tl.constexpr,
306
+ NUM_SMS: tl.constexpr,
307
+ FUSE_SCATTER_ADD: tl.constexpr,
308
+ USE_TMA_LOAD: tl.constexpr,
309
+ USE_TMA_STORE: tl.constexpr,
310
+ USE_FAST_ACCUM: tl.constexpr,
311
+ # tile sizes
312
+ BLOCK_SIZE_M: tl.constexpr,
313
+ BLOCK_SIZE_N: tl.constexpr,
314
+ BLOCK_SIZE_K: tl.constexpr,
315
+ NUM_CONSUMER_GROUPS: tl.constexpr,
316
+ ) -> None:
317
+ tl.static_assert(
318
+ not (FUSE_SCATTER_ADD and USE_TMA_STORE),
319
+ "Cannot fuse scatter add with TMA store!",
320
+ )
321
+
322
+ tidx = tl.program_id(0)
323
+
324
+ dtype: tl.dtype = c_ptr.dtype.element_ty
325
+
326
+ M_end_offset = 0
327
+ M_end_offset = M_end_offset.to(tl.int64) # pyre-ignore
328
+ iterated_tiles = 0
329
+ for g in tl.range(G):
330
+ # Move across groups
331
+ m_size = tl.load(m_sizes + g)
332
+
333
+ if m_size > 0:
334
+ M_start_offset = M_end_offset
335
+ M_end_offset = M_start_offset + m_size
336
+ N_start_offset = g.to(tl.int64) * N
337
+ n_size = N
338
+
339
+ num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
340
+ num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
341
+ num_tiles = num_m_tiles * num_n_tiles
342
+
343
+ if USE_TMA_STORE:
344
+ c_desc_ptr = tl.make_tensor_descriptor(
345
+ c_ptr + M_start_offset * N,
346
+ shape=[m_size, n_size],
347
+ # pyre-ignore
348
+ strides=[n_size, 1],
349
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
350
+ )
351
+
352
+ # Move across tiles
353
+ while tidx >= iterated_tiles and tidx < iterated_tiles + num_tiles:
354
+ gidx = tidx - iterated_tiles
355
+ # Split M first and N second.
356
+ tile_m_idx = gidx % num_m_tiles
357
+ tile_n_idx = gidx // num_m_tiles
358
+
359
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
360
+
361
+ if USE_TMA_LOAD:
362
+ tl.static_assert(K % BLOCK_SIZE_K == 0)
363
+ m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
364
+ n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
365
+ for k_offset in range(0, K, BLOCK_SIZE_K):
366
+ a = tl._experimental_descriptor_load(
367
+ a_desc_ptr,
368
+ [m_offset, k_offset],
369
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
370
+ dtype,
371
+ )
372
+ b = tl._experimental_descriptor_load(
373
+ b_desc_ptr,
374
+ [n_offset, k_offset],
375
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
376
+ dtype,
377
+ )
378
+ if USE_FAST_ACCUM:
379
+ accumulator = tl.dot(a, b.T, accumulator)
380
+ else:
381
+ accumulator += tl.dot(a, b.T)
382
+ else:
383
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
384
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
385
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
386
+ a_ptrs = (
387
+ a_desc_ptr
388
+ + (M_start_offset + offs_am[:, None]) * K
389
+ + offs_k[None, :]
390
+ )
391
+ b_ptrs = (
392
+ b_desc_ptr
393
+ + (N_start_offset + offs_bn[:, None]) * K
394
+ + offs_k[None, :]
395
+ )
396
+ for k_offset in range(0, K, BLOCK_SIZE_K):
397
+ updated_k_offset = k_offset + offs_k
398
+ updated_k_offset_mask = updated_k_offset[None, :] < K # type: ignore[16]
399
+ a = tl.load(
400
+ a_ptrs,
401
+ mask=((offs_am[:, None] < m_size) & updated_k_offset_mask),
402
+ other=0.0,
403
+ )
404
+ b = tl.load(
405
+ b_ptrs,
406
+ mask=((offs_bn[:, None] < n_size) & updated_k_offset_mask),
407
+ other=0.0,
408
+ )
409
+ accumulator += tl.dot(a, b.T)
410
+ a_ptrs += BLOCK_SIZE_K
411
+ b_ptrs += BLOCK_SIZE_K
412
+
413
+ if USE_TMA_STORE:
414
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
415
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
416
+ # pyre-ignore
417
+ c_desc_ptr.store(
418
+ [m_offset, n_offset], accumulator.to(c_ptr.dtype.element_ty)
419
+ )
420
+ elif FUSE_SCATTER_ADD:
421
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
422
+ mask = offs_am < m_size
423
+ m_offsets = tl.load(
424
+ scatter_add_indices + M_start_offset + offs_am,
425
+ mask=mask,
426
+ cache_modifier=".ca",
427
+ )
428
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
429
+ c = accumulator.to(c_ptr.dtype.element_ty)
430
+ tl.atomic_add(
431
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
432
+ c,
433
+ mask=mask[:, None] and offs_bn[None, :] < n_size,
434
+ sem="relaxed",
435
+ )
436
+ else:
437
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
438
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
439
+ c = accumulator.to(c_ptr.dtype.element_ty)
440
+ tl.store(
441
+ c_ptr
442
+ + (M_start_offset + offs_am[:, None]) * N
443
+ + offs_bn[None, :],
444
+ c,
445
+ mask=offs_am[:, None] < m_size and offs_bn[None, :] < n_size,
446
+ )
447
+ tidx += NUM_SMS
448
+
449
+ iterated_tiles += num_tiles
450
+
451
+
452
+ # TODO(shikaili): Too much code duplication. Need to refactor.
453
+ @triton.autotune(
454
+ configs=_NV_WS_CONFIGS,
455
+ key=["G", "M_BUCKET", "N", "K"],
456
+ prune_configs_by={"early_config_prune": early_config_prune_ws},
457
+ restore_value=["c_ptr"], # restore for scatter_add fusion
458
+ )
459
+ @triton.jit
460
+ def _fbgemm_grouped_gemm_ws(
461
+ a_desc_ptr,
462
+ b_desc_ptr,
463
+ c_ptr,
464
+ scatter_add_indices,
465
+ m_sizes,
466
+ # problem sizes
467
+ G: tl.constexpr,
468
+ M_BUCKET: tl.constexpr,
469
+ N: tl.constexpr,
470
+ K: tl.constexpr,
471
+ NUM_SMS: tl.constexpr,
472
+ FUSE_SCATTER_ADD: tl.constexpr,
473
+ USE_TMA_LOAD: tl.constexpr,
474
+ USE_FAST_ACCUM: tl.constexpr,
475
+ # tile sizes
476
+ BLOCK_SIZE_M: tl.constexpr,
477
+ BLOCK_SIZE_N: tl.constexpr,
478
+ BLOCK_SIZE_K: tl.constexpr,
479
+ NUM_CONSUMER_GROUPS: tl.constexpr,
480
+ USE_TMA_LOAD_ON_SCALES: tl.constexpr,
481
+ USE_TMA_STORE: tl.constexpr,
482
+ ) -> None:
483
+ tl.static_assert(USE_TMA_LOAD, "Always use TMA load with warp specialization!")
484
+ tl.static_assert(not USE_TMA_LOAD_ON_SCALES, "Not supported!")
485
+ tl.static_assert(
486
+ not (FUSE_SCATTER_ADD and USE_TMA_STORE),
487
+ "Cannot fuse scatter add with TMA store!",
488
+ )
489
+
490
+ tidx = tl.program_id(0)
491
+
492
+ dtype: tl.dtype = c_ptr.dtype.element_ty
493
+
494
+ M_end_offset = 0
495
+ M_end_offset = M_end_offset.to(tl.int64) # pyre-ignore
496
+ iterated_tiles = 0
497
+ for g in tl.range(G):
498
+ # Move across groups
499
+ m_size = tl.load(m_sizes + g, cache_modifier=".ca")
500
+
501
+ if m_size > 0:
502
+ M_start_offset = M_end_offset
503
+ M_end_offset = M_start_offset + m_size
504
+ N_start_offset = g.to(tl.int64) * N
505
+
506
+ num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
507
+ tl.static_assert(N % BLOCK_SIZE_N == 0, f"{N=} {BLOCK_SIZE_N=}")
508
+ NUM_N_TILES: tl.constexpr = N // BLOCK_SIZE_N
509
+ num_tiles = num_m_tiles * NUM_N_TILES
510
+
511
+ if USE_TMA_STORE:
512
+ c_desc_ptr = tl.make_tensor_descriptor(
513
+ c_ptr + M_start_offset * N,
514
+ shape=[m_size, N],
515
+ # pyre-ignore
516
+ strides=[N, 1],
517
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
518
+ )
519
+
520
+ # Move across tiles
521
+ next_iterated_tiles = iterated_tiles + num_tiles
522
+ if (tidx >= iterated_tiles) and (tidx < next_iterated_tiles):
523
+ for i in range(tidx, next_iterated_tiles, NUM_SMS):
524
+ gidx = i - iterated_tiles
525
+ # Split M first and N second.
526
+ tile_m_idx = gidx % num_m_tiles
527
+ tile_n_idx = gidx // num_m_tiles
528
+
529
+ accumulator = tl.zeros(
530
+ (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
531
+ )
532
+ tl.static_assert(K % BLOCK_SIZE_K == 0)
533
+ m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
534
+ n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
535
+ for k_offset in range(0, K, BLOCK_SIZE_K):
536
+ a = tl._experimental_descriptor_load(
537
+ a_desc_ptr,
538
+ [m_offset, k_offset],
539
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
540
+ dtype,
541
+ )
542
+ b = tl._experimental_descriptor_load(
543
+ b_desc_ptr,
544
+ [n_offset, k_offset],
545
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
546
+ dtype,
547
+ )
548
+ if USE_FAST_ACCUM:
549
+ accumulator = tl.dot(a, b.T, accumulator)
550
+ else:
551
+ accumulator += tl.dot(a, b.T)
552
+
553
+ if USE_TMA_STORE:
554
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
555
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
556
+ # pyre-ignore
557
+ c_desc_ptr.store(
558
+ [m_offset, n_offset],
559
+ accumulator.to(c_ptr.dtype.element_ty),
560
+ )
561
+ elif FUSE_SCATTER_ADD:
562
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
563
+ mask = offs_am < m_size
564
+ m_offsets = tl.load(
565
+ scatter_add_indices + M_start_offset + offs_am,
566
+ mask=mask,
567
+ cache_modifier=".ca",
568
+ )
569
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
570
+ c = accumulator.to(c_ptr.dtype.element_ty)
571
+ tl.atomic_add(
572
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
573
+ c,
574
+ mask=mask[:, None],
575
+ sem="relaxed",
576
+ )
577
+ else:
578
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
579
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
580
+ c = accumulator.to(c_ptr.dtype.element_ty)
581
+ tl.store(
582
+ c_ptr
583
+ + (M_start_offset + offs_am[:, None]) * N
584
+ + offs_bn[None, :],
585
+ c,
586
+ mask=offs_am[:, None] < m_size,
587
+ cache_modifier=".cs",
588
+ )
589
+ tidx += NUM_SMS
590
+
591
+ iterated_tiles += num_tiles
592
+
593
+
594
+ TT_FP8_DTYPE = tl.float8e4b8 if torch.version.hip else tl.float8e4nv
595
+
596
+
597
+ # TODO(shikaili): clean up redundant 'b_scale_desc_ptr' argument.
598
+ @triton.autotune(
599
+ configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
600
+ key=["G", "M_BUCKET", "N", "K"],
601
+ prune_configs_by={
602
+ "early_config_prune": functools.partial(
603
+ early_config_prune, dtype=TT_FP8_DTYPE, dtsize=1
604
+ )
605
+ },
606
+ restore_value=["c_ptr"], # restore for scatter_add fusion
607
+ )
608
+ @triton.jit
609
+ def _fbgemm_grouped_gemm_fp8_rowwise(
610
+ a_desc_ptr,
611
+ a_scale_ptr,
612
+ b_desc_ptr,
613
+ b_scale_ptr,
614
+ b_scale_desc_ptr,
615
+ c_ptr,
616
+ scatter_add_indices,
617
+ m_sizes,
618
+ # problem sizes
619
+ G: tl.constexpr,
620
+ M_BUCKET,
621
+ N: tl.constexpr,
622
+ K: tl.constexpr,
623
+ NUM_SMS: tl.constexpr,
624
+ FUSE_SCATTER_ADD: tl.constexpr,
625
+ USE_TMA_LOAD: tl.constexpr,
626
+ USE_TMA_STORE: tl.constexpr,
627
+ USE_FAST_ACCUM: tl.constexpr,
628
+ # tile sizes
629
+ BLOCK_SIZE_M: tl.constexpr,
630
+ BLOCK_SIZE_N: tl.constexpr,
631
+ BLOCK_SIZE_K: tl.constexpr,
632
+ NUM_CONSUMER_GROUPS: tl.constexpr,
633
+ ) -> None:
634
+ tl.static_assert(
635
+ not (FUSE_SCATTER_ADD and USE_TMA_STORE),
636
+ "Cannot fuse scatter add with TMA store!",
637
+ )
638
+
639
+ tidx = tl.program_id(0)
640
+
641
+ dtype = TT_FP8_DTYPE
642
+
643
+ M_end_offset = 0
644
+ M_end_offset = M_end_offset.to(tl.int64) # pyre-ignore
645
+ iterated_tiles = 0
646
+ for g in tl.range(G):
647
+ # Move across groups
648
+ m_size = tl.load(m_sizes + g)
649
+
650
+ if m_size > 0:
651
+ M_start_offset = M_end_offset
652
+ M_end_offset = M_start_offset + m_size
653
+ N_start_offset = g.to(tl.int64) * N
654
+ n_size = N
655
+
656
+ num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
657
+ num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
658
+ num_tiles = num_m_tiles * num_n_tiles
659
+
660
+ if USE_TMA_STORE:
661
+ c_desc_ptr = tl.make_tensor_descriptor(
662
+ c_ptr + M_start_offset * N,
663
+ shape=[m_size, n_size],
664
+ # pyre-ignore
665
+ strides=[n_size, 1],
666
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
667
+ )
668
+
669
+ # Move across tiles
670
+ while tidx >= iterated_tiles and tidx < iterated_tiles + num_tiles:
671
+ gidx = tidx - iterated_tiles
672
+ # Split M first and N second.
673
+ tile_m_idx = gidx % num_m_tiles
674
+ tile_n_idx = gidx // num_m_tiles
675
+
676
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
677
+ tl.static_assert(K % BLOCK_SIZE_K == 0)
678
+ if USE_TMA_LOAD:
679
+ m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
680
+ n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
681
+ for k_offset in range(0, K, BLOCK_SIZE_K):
682
+ a = tl._experimental_descriptor_load(
683
+ a_desc_ptr,
684
+ [m_offset, k_offset],
685
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
686
+ dtype,
687
+ )
688
+ b = tl._experimental_descriptor_load(
689
+ b_desc_ptr,
690
+ [n_offset, k_offset],
691
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
692
+ dtype,
693
+ )
694
+ if USE_FAST_ACCUM:
695
+ accumulator = tl.dot(a, b.T, accumulator)
696
+ else:
697
+ accumulator += tl.dot(a, b.T)
698
+ else:
699
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
700
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
701
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
702
+ a_ptrs = (
703
+ a_desc_ptr
704
+ + (M_start_offset + offs_am[:, None]) * K
705
+ + offs_k[None, :]
706
+ )
707
+ b_ptrs = (
708
+ b_desc_ptr
709
+ + (N_start_offset + offs_bn[:, None]) * K
710
+ + offs_k[None, :]
711
+ )
712
+ for k_offset in range(0, K, BLOCK_SIZE_K):
713
+ a = tl.load(a_ptrs, mask=offs_am[:, None] < m_size)
714
+ b = tl.load(b_ptrs, mask=offs_bn[:, None] < n_size)
715
+ accumulator += tl.dot(a, b.T)
716
+ a_ptrs += BLOCK_SIZE_K
717
+ b_ptrs += BLOCK_SIZE_K
718
+
719
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
720
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
721
+ a_scale = tl.load(
722
+ a_scale_ptr + M_start_offset + offs_am[:, None],
723
+ mask=offs_am[:, None] < m_size,
724
+ )
725
+ b_scale = tl.load(
726
+ b_scale_ptr + N_start_offset + offs_bn[None, :],
727
+ mask=offs_bn[None, :] < n_size,
728
+ )
729
+ c = accumulator.to(tl.float32) * a_scale * b_scale
730
+
731
+ if USE_TMA_STORE:
732
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
733
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
734
+ # pyre-ignore
735
+ c_desc_ptr.store([m_offset, n_offset], c.to(c_ptr.dtype.element_ty))
736
+ elif FUSE_SCATTER_ADD:
737
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
738
+ mask = offs_am < m_size
739
+ m_offsets = tl.load(
740
+ scatter_add_indices + M_start_offset + offs_am,
741
+ mask=mask,
742
+ cache_modifier=".ca",
743
+ )
744
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
745
+ tl.atomic_add(
746
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
747
+ c.to(c_ptr.dtype.element_ty),
748
+ mask=mask[:, None] and offs_bn[None, :] < n_size,
749
+ sem="relaxed",
750
+ )
751
+ else:
752
+ tl.store(
753
+ c_ptr
754
+ + (M_start_offset + offs_am[:, None]) * N
755
+ + offs_bn[None, :],
756
+ c,
757
+ mask=offs_am[:, None] < m_size and offs_bn[None, :] < n_size,
758
+ )
759
+ tidx += NUM_SMS
760
+
761
+ iterated_tiles += num_tiles
762
+
763
+
764
+ # TODO(shikaili): Too much code duplication. Need to refactor.
765
+ @triton.autotune(
766
+ configs=_NV_WS_CONFIGS,
767
+ key=["G", "M_BUCKET", "N", "K"],
768
+ prune_configs_by={
769
+ "early_config_prune": functools.partial(
770
+ early_config_prune_ws, dtype=TT_FP8_DTYPE, dtsize=1
771
+ )
772
+ },
773
+ restore_value=["c_ptr"], # restore for scatter_add fusion
774
+ )
775
+ @triton.jit
776
+ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
777
+ a_desc_ptr,
778
+ a_scale_ptr,
779
+ b_desc_ptr,
780
+ b_scale_ptr,
781
+ b_scale_desc_ptr,
782
+ c_ptr,
783
+ scatter_add_indices,
784
+ m_sizes,
785
+ # problem sizes
786
+ G: tl.constexpr,
787
+ M_BUCKET: tl.constexpr,
788
+ N: tl.constexpr,
789
+ K: tl.constexpr,
790
+ NUM_SMS: tl.constexpr,
791
+ FUSE_SCATTER_ADD: tl.constexpr,
792
+ USE_TMA_LOAD: tl.constexpr,
793
+ USE_FAST_ACCUM: tl.constexpr,
794
+ # tile sizes
795
+ BLOCK_SIZE_M: tl.constexpr,
796
+ BLOCK_SIZE_N: tl.constexpr,
797
+ BLOCK_SIZE_K: tl.constexpr,
798
+ NUM_CONSUMER_GROUPS: tl.constexpr,
799
+ USE_TMA_LOAD_ON_SCALES: tl.constexpr,
800
+ USE_TMA_STORE: tl.constexpr,
801
+ ) -> None:
802
+ tl.static_assert(USE_TMA_LOAD, "Always use TMA load with warp specialization!")
803
+ tl.static_assert(
804
+ not (FUSE_SCATTER_ADD and USE_TMA_STORE),
805
+ "Cannot fuse scatter add with TMA store!",
806
+ )
807
+
808
+ tidx = tl.program_id(0)
809
+
810
+ dtype = TT_FP8_DTYPE
811
+
812
+ M_end_offset = 0
813
+ M_end_offset = M_end_offset.to(tl.int64) # pyre-ignore
814
+ iterated_tiles = 0
815
+ for g in tl.range(G):
816
+ # Move across groups
817
+ m_size = tl.load(m_sizes + g, cache_modifier=".ca")
818
+
819
+ if m_size > 0:
820
+ M_start_offset = M_end_offset
821
+ M_end_offset = M_start_offset + m_size
822
+ N_start_offset = g.to(tl.int64) * N
823
+
824
+ num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
825
+ tl.static_assert(N % BLOCK_SIZE_N == 0)
826
+ NUM_N_TILES: tl.constexpr = N // BLOCK_SIZE_N
827
+ num_tiles = num_m_tiles * NUM_N_TILES
828
+
829
+ if USE_TMA_STORE:
830
+ c_desc_ptr = tl.make_tensor_descriptor(
831
+ c_ptr + M_start_offset * N,
832
+ shape=[m_size, N],
833
+ # pyre-ignore
834
+ strides=[N, 1],
835
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
836
+ )
837
+
838
+ # Move across tiles
839
+ next_iterated_tiles = iterated_tiles + num_tiles
840
+ if (tidx >= iterated_tiles) and (tidx < next_iterated_tiles):
841
+ for i in range(tidx, next_iterated_tiles, NUM_SMS):
842
+ gidx = i - iterated_tiles
843
+ # Split M first and N second.
844
+ tile_m_idx = gidx % num_m_tiles
845
+ tile_n_idx = gidx // num_m_tiles
846
+
847
+ accumulator = tl.zeros(
848
+ (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
849
+ )
850
+ tl.static_assert(K % BLOCK_SIZE_K == 0)
851
+
852
+ m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
853
+ n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
854
+ for k_offset in range(0, K, BLOCK_SIZE_K):
855
+ a = tl._experimental_descriptor_load(
856
+ a_desc_ptr,
857
+ [m_offset, k_offset],
858
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
859
+ dtype,
860
+ )
861
+ b = tl._experimental_descriptor_load(
862
+ b_desc_ptr,
863
+ [n_offset, k_offset],
864
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
865
+ dtype,
866
+ )
867
+ if USE_FAST_ACCUM:
868
+ accumulator = tl.dot(a, b.T, accumulator)
869
+ else:
870
+ accumulator += tl.dot(a, b.T)
871
+
872
+ if USE_TMA_LOAD_ON_SCALES:
873
+ b_scale = tl._experimental_descriptor_load(
874
+ b_scale_desc_ptr,
875
+ [n_offset],
876
+ [BLOCK_SIZE_N],
877
+ tl.float32,
878
+ )
879
+
880
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
881
+ a_scale = tl.load(
882
+ a_scale_ptr + M_start_offset + offs_am[:, None],
883
+ mask=offs_am[:, None] < m_size,
884
+ cache_modifier=".ca",
885
+ )
886
+ c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
887
+ else:
888
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
889
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
890
+ a_scale = tl.load(
891
+ a_scale_ptr + M_start_offset + offs_am[:, None],
892
+ mask=offs_am[:, None] < m_size,
893
+ cache_modifier=".ca",
894
+ )
895
+ b_scale = tl.load(
896
+ b_scale_ptr + N_start_offset + offs_bn[None, :],
897
+ cache_modifier=".ca",
898
+ )
899
+ c = accumulator.to(tl.float32) * a_scale * b_scale
900
+
901
+ if USE_TMA_STORE:
902
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
903
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
904
+ # pyre-ignore
905
+ c_desc_ptr.store(
906
+ [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
907
+ )
908
+ elif FUSE_SCATTER_ADD:
909
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
910
+ mask = offs_am < m_size
911
+ m_offsets = tl.load(
912
+ scatter_add_indices + M_start_offset + offs_am,
913
+ mask=mask,
914
+ cache_modifier=".ca",
915
+ )
916
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
917
+ tl.atomic_add(
918
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
919
+ c,
920
+ mask=mask[:, None],
921
+ sem="relaxed",
922
+ )
923
+ else:
924
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
925
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
926
+ tl.store(
927
+ c_ptr
928
+ + (M_start_offset + offs_am[:, None]) * N
929
+ + offs_bn[None, :],
930
+ c,
931
+ mask=offs_am[:, None] < m_size,
932
+ cache_modifier=".cs",
933
+ )
934
+ tidx += NUM_SMS
935
+
936
+ iterated_tiles += num_tiles
937
+
938
+
939
+ warnings.simplefilter("once")
940
+
941
+
942
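+ # Shared driver for the kernels above. Expects contiguous x of shape (M, K),
+ # w of shape (G * N, K) holding one (N, K) weight block per group, and m_sizes
+ # of length G giving how many rows of x belong to each group. Returns an
+ # (M, N) bf16 tensor, or scatter-adds into output_tensor when it is provided
+ # together with scatter_add_indices.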
+ def _grouped_gemm(
943
+ *,
944
+ x: torch.Tensor,
945
+ w: torch.Tensor,
946
+ m_sizes: torch.Tensor,
947
+ x_scale: Optional[torch.Tensor],
948
+ w_scale: Optional[torch.Tensor],
949
+ use_fast_accum: bool,
950
+ use_warp_specialization: bool,
951
+ output_tensor: Optional[torch.Tensor],
952
+ scatter_add_indices: Optional[torch.Tensor],
953
+ ) -> torch.Tensor:
954
+
955
+ USE_TMA_LOAD = not torch.version.hip
956
+ USE_TMA_STORE = False
957
+
958
+ if USE_TMA_LOAD and not utils.HAS_TMA_DESC:
959
+ USE_TMA_LOAD = False
960
+ warnings.warn(
961
+ "TMA load is disabled as there is no TMA descriptor support!", stacklevel=2
962
+ )
963
+
964
+ if USE_TMA_STORE and not utils.HAS_TMA_DESC:
965
+ USE_TMA_STORE = False
966
+ warnings.warn(
967
+ "TMA store is disabled as there is no TMA descriptor support!", stacklevel=2
968
+ )
969
+
970
+ # TODO(shikaili): Check the readiness of WS on ROCm side in Meta's Triton.
971
+ if use_warp_specialization and torch.version.hip:
972
+ warnings.warn(
973
+ "Warp specialization is disabled as it is not supported on ROCm.",
974
+ stacklevel=2,
975
+ )
976
+ use_warp_specialization = False
977
+
978
+ if use_warp_specialization and not _HAS_WS_SUPPORT:
979
+ warnings.warn(
980
+ "Warp specialization is disabled as the Triton build in current environment doesn't have such support. Please build from https://github.com/facebookexperimental/triton/tree/ws-3.2.x to enable it for best performance on Nvidia's SM90 GPUs.",
981
+ stacklevel=2,
982
+ )
983
+ use_warp_specialization = False
984
+
985
+ if use_warp_specialization:
986
+ assert utils.HAS_TMA_DESC
987
+ USE_TMA_STORE = True # Tuning decision
988
+
989
+ G = m_sizes.shape[0]
990
+
991
+ assert x.is_contiguous()
992
+ assert w.is_contiguous()
993
+ assert m_sizes.is_contiguous()
994
+
995
+ M, K = x.shape
996
+ N = w.shape[0] // G
997
+ assert K == w.shape[1]
998
+
999
+ if K % 8 != 0 or N % 8 != 0:
1000
+ use_warp_specialization = False
1001
+ USE_TMA_LOAD = False
1002
+ USE_TMA_STORE = False
1003
+ warnings.warn(
1004
+ f"TMA load and warp specialization are disabled since K or N is not a multiple of 8: {K=}, {N=}.",
1005
+ stacklevel=2,
1006
+ )
1007
+ assert (
1008
+ x_scale is None
1009
+ ), f"Quantisation is not supported yet when K or N is not a multiple of 8: {K=}, {N=}."
1010
+
1011
+ assert (
1012
+ output_tensor is None
1013
+ ), f"Fused scatter add has large rounding error when K or N is not a multiple of 8: {K=}, {N=}."
1014
+
1015
+ if output_tensor is None:
1016
+ FUSE_SCATTER_ADD = False
1017
+ assert scatter_add_indices is None
1018
+ y = torch.empty((M, N), device=x.device, dtype=torch.bfloat16)
1019
+ else:
1020
+ FUSE_SCATTER_ADD = True
1021
+ assert scatter_add_indices is not None
1022
+ assert scatter_add_indices.is_contiguous()
1023
+ assert scatter_add_indices.shape == (M,)
1024
+ y = output_tensor
1025
+ if M == 0 or N == 0:
1026
+ return y
1027
+
1028
+ NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
1029
+
1030
+ desc_helper = None
1031
+ desc_x = x
1032
+ desc_w = w
1033
+ desc_ws = w_scale
1034
+
1035
+ if USE_TMA_LOAD:
1036
+ desc_helper = utils.TmaAutoTuneHelper()
1037
+ desc_helper.init_tma_descriptor("x")
1038
+ desc_helper.init_tma_descriptor("w")
1039
+ desc_x = desc_helper.get_tma_descriptor_kernel_param("x")
1040
+ desc_w = desc_helper.get_tma_descriptor_kernel_param("w")
1041
+ if use_warp_specialization and w_scale is not None:
1042
+ desc_helper.init_tma_descriptor("ws")
1043
+ desc_ws = desc_helper.get_tma_descriptor_kernel_param("ws")
1044
+
1045
+ if USE_TMA_STORE:
1046
+
1047
+ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
1048
+ return torch.empty(size, device="cuda", dtype=torch.int8)
1049
+
1050
+ triton.set_allocator(alloc_fn)
1051
+
1052
+ def grid(META):
1053
+ if USE_TMA_LOAD:
1054
+ nonlocal desc_helper # noqa: F824
1055
+ desc_helper.fill_2d_tma_descriptor(
1056
+ "x",
1057
+ x.data_ptr(),
1058
+ M,
1059
+ K,
1060
+ META["BLOCK_SIZE_M"] // META["NUM_CONSUMER_GROUPS"],
1061
+ META["BLOCK_SIZE_K"],
1062
+ x.element_size(),
1063
+ )
1064
+
1065
+ desc_helper.fill_2d_tma_descriptor(
1066
+ "w",
1067
+ w.data_ptr(),
1068
+ N * G,
1069
+ K,
1070
+ META["BLOCK_SIZE_N"],
1071
+ META["BLOCK_SIZE_K"],
1072
+ w.element_size(),
1073
+ )
1074
+
1075
+ if META.get("USE_TMA_LOAD_ON_SCALES", False):
1076
+ desc_helper.fill_1d_tma_descriptor(
1077
+ "ws",
1078
+ w_scale.data_ptr(),
1079
+ N * G,
1080
+ META["BLOCK_SIZE_N"],
1081
+ w_scale.element_size(),
1082
+ )
1083
+
1084
+ return (NUM_SMS,)
1085
+
1086
+ M_BUCKET_CAP = 16384
1087
+ M_BUCKET = min(triton.next_power_of_2(M), M_BUCKET_CAP)
1088
+ if x_scale is not None and w_scale is not None:
1089
+ assert x_scale.is_contiguous()
1090
+ assert w_scale.is_contiguous()
1091
+ fn = (
1092
+ _fbgemm_grouped_gemm_fp8_rowwise_ws
1093
+ if use_warp_specialization
1094
+ else _fbgemm_grouped_gemm_fp8_rowwise
1095
+ )
1096
+ args = (
1097
+ desc_x,
1098
+ x_scale,
1099
+ desc_w,
1100
+ w_scale,
1101
+ desc_ws,
1102
+ y,
1103
+ scatter_add_indices,
1104
+ m_sizes,
1105
+ G,
1106
+ M_BUCKET,
1107
+ N,
1108
+ K,
1109
+ NUM_SMS,
1110
+ FUSE_SCATTER_ADD,
1111
+ USE_TMA_LOAD,
1112
+ )
1113
+ if use_warp_specialization:
1114
+ args += (use_fast_accum,)
1115
+ else:
1116
+ args += (USE_TMA_STORE, use_fast_accum)
1117
+ fn[grid](*args)
1118
+ else:
1119
+ assert x_scale is None
1120
+ assert w_scale is None
1121
+ fn = (
1122
+ _fbgemm_grouped_gemm_ws if use_warp_specialization else _fbgemm_grouped_gemm
1123
+ )
1124
+ args = (
1125
+ desc_x,
1126
+ desc_w,
1127
+ y,
1128
+ scatter_add_indices,
1129
+ m_sizes,
1130
+ G,
1131
+ M_BUCKET,
1132
+ N,
1133
+ K,
1134
+ NUM_SMS,
1135
+ FUSE_SCATTER_ADD,
1136
+ USE_TMA_LOAD,
1137
+ )
1138
+ if use_warp_specialization:
1139
+ args += (use_fast_accum,)
1140
+ else:
1141
+ args += (USE_TMA_STORE, use_fast_accum)
1142
+ fn[grid](*args)
1143
+
1144
+ return y
1145
+
1146
+
1147
+ def grouped_gemm(
1148
+ x: torch.Tensor,
1149
+ w: torch.Tensor,
1150
+ m_sizes: torch.Tensor,
1151
+ use_fast_accum: bool = True,
1152
+ *,
1153
+ _use_warp_specialization: bool = True,
1154
+ _output_tensor: Optional[torch.Tensor] = None,
1155
+ _scatter_add_indices: Optional[torch.Tensor] = None,
1156
+ ) -> torch.Tensor:
1157
+ return _grouped_gemm(
1158
+ x=x,
1159
+ w=w,
1160
+ m_sizes=m_sizes,
1161
+ x_scale=None,
1162
+ w_scale=None,
1163
+ use_fast_accum=use_fast_accum,
1164
+ use_warp_specialization=_use_warp_specialization,
1165
+ output_tensor=_output_tensor,
1166
+ scatter_add_indices=_scatter_add_indices,
1167
+ )
1168
+
1169
+
1170
+ def grouped_gemm_fp8_rowwise(
1171
+ x: torch.Tensor,
1172
+ w: torch.Tensor,
1173
+ m_sizes: torch.Tensor,
1174
+ x_scale: torch.Tensor,
1175
+ w_scale: torch.Tensor,
1176
+ use_fast_accum: bool = True,
1177
+ *,
1178
+ _use_warp_specialization: bool = True,
1179
+ _output_tensor: Optional[torch.Tensor] = None,
1180
+ _scatter_add_indices: Optional[torch.Tensor] = None,
1181
+ ) -> torch.Tensor:
1182
+ return _grouped_gemm(
1183
+ x=x,
1184
+ w=w,
1185
+ m_sizes=m_sizes,
1186
+ x_scale=x_scale,
1187
+ w_scale=w_scale,
1188
+ use_fast_accum=use_fast_accum,
1189
+ use_warp_specialization=_use_warp_specialization,
1190
+ output_tensor=_output_tensor,
1191
+ scatter_add_indices=_scatter_add_indices,
1192
+ )
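
For orientation, below is a minimal usage sketch of the two public entry points in this file, grouped_gemm and grouped_gemm_fp8_rowwise. The shapes follow the asserts in _grouped_gemm: x is (M, K), w stacks the G weight blocks as (G * N, K), m_sizes has length G, and the output is (M, N) bfloat16. The specific dtypes chosen for m_sizes and the fp8 tensors, and the all-ones scales, are illustrative assumptions rather than documented requirements; a CUDA device with a compatible Triton build is assumed, and torch.float8_e4m3fn is used to match the kernel's NVIDIA fp8 type (tl.float8e4nv), with ROCm using a different fp8 format.

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
    grouped_gemm,
    grouped_gemm_fp8_rowwise,
)

G, N, K = 4, 256, 512  # groups, per-group output columns, shared inner dimension
# Rows of x assigned to each group; empty groups (m_size == 0) are skipped by the kernel.
m_sizes = torch.tensor([8, 0, 32, 24], device="cuda", dtype=torch.int32)
M = int(m_sizes.sum())

# BF16 path: x is (M, K), w stacks the G weight matrices row-wise as (G * N, K).
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
w = torch.randn(G * N, K, device="cuda", dtype=torch.bfloat16)
y = grouped_gemm(x, w, m_sizes)  # (M, N) bfloat16

# FP8 rowwise path: one scale per row of x (length M) and per row of w (length G * N).
xq = x.to(torch.float8_e4m3fn)
wq = w.to(torch.float8_e4m3fn)
x_scale = torch.ones(M, device="cuda", dtype=torch.float32)
w_scale = torch.ones(G * N, device="cuda", dtype=torch.float32)
y_fp8 = grouped_gemm_fp8_rowwise(xq, wq, m_sizes, x_scale, w_scale)  # (M, N) bfloat16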