fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +118 -23
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
- fbgemm_gpu/config/feature_list.py +7 -1
- fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
- fbgemm_gpu/docs/sparse_ops.py +142 -1
- fbgemm_gpu/docs/target.default.json.py +6 -0
- fbgemm_gpu/enums.py +3 -4
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/fbgemm_gpu_config.so +0 -0
- fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
- fbgemm_gpu/fbgemm_gpu_py.so +0 -0
- fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
- fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
- fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
- fbgemm_gpu/quantize/__init__.py +2 -0
- fbgemm_gpu/quantize/quantize_ops.py +1 -0
- fbgemm_gpu/quantize_comm.py +29 -12
- fbgemm_gpu/quantize_utils.py +88 -8
- fbgemm_gpu/runtime_monitor.py +9 -5
- fbgemm_gpu/sll/__init__.py +3 -0
- fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
- fbgemm_gpu/sll/triton/__init__.py +0 -10
- fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
- fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
- fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
- fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
- fbgemm_gpu/sparse_ops.py +244 -76
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
- fbgemm_gpu/split_embedding_configs.py +287 -3
- fbgemm_gpu/split_embedding_inference_converter.py +7 -6
- fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
- fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
- fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
- fbgemm_gpu/tbe/bench/__init__.py +13 -2
- fbgemm_gpu/tbe/bench/bench_config.py +37 -9
- fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
- fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
- fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
- fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
- fbgemm_gpu/tbe/bench/utils.py +129 -5
- fbgemm_gpu/tbe/cache/__init__.py +1 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
- fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
- fbgemm_gpu/tbe/ssd/common.py +27 -0
- fbgemm_gpu/tbe/ssd/inference.py +15 -15
- fbgemm_gpu/tbe/ssd/training.py +2930 -195
- fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
- fbgemm_gpu/tbe/stats/__init__.py +10 -0
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
- fbgemm_gpu/tbe/utils/offsets.py +6 -6
- fbgemm_gpu/tbe/utils/quantize.py +8 -8
- fbgemm_gpu/tbe/utils/requests.py +53 -28
- fbgemm_gpu/tbe_input_multiplexer.py +16 -7
- fbgemm_gpu/triton/common.py +0 -1
- fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
- fbgemm_gpu/triton/quantize.py +14 -9
- fbgemm_gpu/utils/filestore.py +56 -5
- fbgemm_gpu/utils/torch_library.py +2 -2
- fbgemm_gpu/utils/writeback_util.py +124 -0
- fbgemm_gpu/uvm.py +3 -0
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
- fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
- list_versions/cli_run.py +161 -0
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py
CHANGED
@@ -31,15 +31,18 @@ except Exception:
 
 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
 
 
 def generate_vbe_metadata(
     offsets: Tensor,
-    batch_size_per_feature_per_rank: Optional[
+    batch_size_per_feature_per_rank: Optional[list[list[int]]],
     pooling_mode: PoolingMode,
     feature_dims_cpu: Tensor,
     device: torch.device,
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
 ) -> invokers.lookup_args.VBEMetadata:
     """
     Generate VBE metadata based on batch_size_per_feature_per_rank.
@@ -133,6 +136,8 @@ def generate_vbe_metadata(
             max_B_feature_rank=max_B_feature_rank,
             # pyre-ignore
             output_size=output_size,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
     else:
         vbe_metadata = invokers.lookup_args.VBEMetadata(
@@ -142,5 +147,43 @@ def generate_vbe_metadata(
             max_B=-1,
             max_B_feature_rank=-1,
             output_size=-1,
+            vbe_output=None,
+            vbe_output_offsets=None,
         )
     return vbe_metadata
+
+
+def check_allocated_vbe_output(
+    output_dtype: int,
+    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
+) -> None:
+    assert (
+        batch_size_per_feature_per_rank is not None
+    ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
+    assert (
+        vbe_output is not None
+    ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
+    assert (
+        vbe_output_offsets is not None
+    ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
+    num_features = len(batch_size_per_feature_per_rank)
+    num_ranks = len(batch_size_per_feature_per_rank[0])
+    assert vbe_output_offsets.shape == torch.Size(
+        [num_ranks, num_features]
+    ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
+    assert (
+        vbe_output.dim() == 1
+    ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
+    assert (
+        vbe_output_offsets.device == vbe_output.device
+    ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
+    _output_dtype = sparse_type_int_to_dtype(output_dtype)
+    assert (
+        vbe_output.dtype == _output_dtype
+    ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
+    assert (
+        vbe_output_offsets.is_contiguous()
+    ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
+    assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
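
The two new optional arguments thread a caller-allocated, flat VBE output buffer (vbe_output) plus per-(rank, feature) offsets (vbe_output_offsets) through the VBE metadata, and check_allocated_vbe_output is the guard that validates such a buffer. A minimal sketch of the shapes it expects, assuming 2 features, 2 ranks, an FP32 output dtype, and an illustrative embedding dimension (none of these concrete sizes come from the diff):

import torch

# Hypothetical VBE batch sizes: batch_size_per_feature_per_rank[feature][rank]
batch_size_per_feature_per_rank = [[2, 3], [2, 3]]  # 2 features x 2 ranks
D = 8  # illustrative embedding dimension

# vbe_output must be a 1-D, contiguous tensor whose dtype matches the TBE output dtype
total_rows = sum(sum(per_rank) for per_rank in batch_size_per_feature_per_rank)
vbe_output = torch.empty(total_rows * D, dtype=torch.float32)

# vbe_output_offsets must be contiguous, on the same device as vbe_output,
# and shaped [num_ranks, num_features] (one entry per (rank, feature) slice)
vbe_output_offsets = torch.zeros(2, 2, dtype=torch.int64)

# check_allocated_vbe_output(output_dtype, batch_size_per_feature_per_rank,
#                            vbe_output, vbe_output_offsets)
# would then pass its shape/dtype/device/contiguity asserts for an FP32 output_dtype code.
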
fbgemm_gpu/tbe/bench/__init__.py
CHANGED
@@ -12,15 +12,19 @@ import torch
 from .bench_config import ( # noqa F401
     TBEBenchmarkingConfig,
     TBEBenchmarkingConfigLoader,
+    TBEBenchmarkingHelperText,
 )
 from .bench_runs import ( # noqa F401
     bench_warmup,
     benchmark_cpu_requests,
+    benchmark_cpu_requests_mp,
     benchmark_pipelined_requests,
     benchmark_requests,
     benchmark_requests_refer,
+    benchmark_requests_with_spec,
     benchmark_vbe,
 )
+from .benchmark_click_interface import TbeBenchClickInterface  # noqa F401
 from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader  # noqa F401
 from .eval_compression import ( # noqa F401
     benchmark_eval_compression,
@@ -28,13 +32,20 @@ from .eval_compression import ( # noqa F401
 )
 from .reporter import BenchmarkReporter  # noqa F401
 from .tbe_data_config import TBEDataConfig  # noqa F401
-from .tbe_data_config_loader import
+from .tbe_data_config_loader import (  # noqa F401
+    TBEDataConfigHelperText,
+    TBEDataConfigLoader,
+)
 from .tbe_data_config_param_models import ( # noqa F401
     BatchParams,
     IndicesParams,
     PoolingParams,
 )
-from .utils import
+from .utils import (  # noqa F401
+    check_oom,
+    fill_random_scale_bias,
+    generate_merged_output_and_offsets,
+)
 
 try:
     torch.ops.load_library(
fbgemm_gpu/tbe/bench/bench_config.py
CHANGED
@@ -9,7 +9,8 @@
 
 import dataclasses
 import json
-from
+from enum import Enum
+from typing import Any, Optional
 
 import click
 
@@ -28,10 +29,12 @@ class TBEBenchmarkingConfig:
     export_trace: bool
     # The path for exporting the trace
     trace_url: Optional[str]
+    # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
+    upload_perf_data: bool
 
     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]):
         return cls(**data)
 
     @classmethod
@@ -39,7 +42,7 @@ class TBEBenchmarkingConfig:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))
 
-    def dict(self) ->
+    def dict(self) -> dict[str, Any]:
         return dataclasses.asdict(self)
 
     def json(self, format: bool = False) -> str:
@@ -56,6 +59,23 @@ class TBEBenchmarkingConfig:
         return self
 
 
+@dataclasses.dataclass(frozen=True)
+class TBEBenchmarkingHelperText(Enum):
+    BENCH_ITERATIONS = "Number of benchmark iterations to run"
+    BENCH_NUM_REQUESTS = "Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used"
+    BENCH_WARMUP_ITERATIONS = (
+        "Number of warmup iterations to run before making measurements"
+    )
+    BENCH_FLUSH_GPU_CACHE_SIZE = (
+        "Amount of memory to use for flushing the GPU cache after each iteration (MB)"
+    )
+    BENCH_EXPORT_TRACE = (
+        "If set, trace will be exported to the path specified in trace url"
+    )
+    BENCH_TRACE_URL = "The path for exporting the trace"
+    BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
+
+
 class TBEBenchmarkingConfigLoader:
     @classmethod
     # pyre-ignore [2]
@@ -65,38 +85,44 @@ class TBEBenchmarkingConfigLoader:
                 "--bench-iterations",
                 type=int,
                 default=100,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value,
             ),
             click.option(
                 "--bench-num-requests",
                 type=int,
                 default=-1,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_NUM_REQUESTS.value,
             ),
             click.option(
                 "--bench-warmup-iterations",
                 type=int,
                 default=0,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value,
             ),
             click.option(
                 "--bench-flush-gpu-cache-size",
                 type=int,
                 default=0,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
             ),
             click.option(
                 "--bench-export-trace",
                 is_flag=True,
                 default=False,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_EXPORT_TRACE.value,
             ),
             click.option(
                 "--bench-trace-url",
                 type=str,
                 required=False,
                 default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
+            ),
+            click.option(
+                "--upload-perf-data",
+                is_flag=True,
+                default=False,
+                help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
             ),
         ]
 
@@ -114,6 +140,7 @@ class TBEBenchmarkingConfigLoader:
         flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
         export_trace = params["bench_export_trace"]
         trace_url = params["bench_trace_url"]
+        upload_perf_data = params["upload_perf_data"]
 
         # Default the number of TBE requests to number of iterations specified
         num_requests = iterations if num_requests == -1 else num_requests
@@ -125,4 +152,5 @@ class TBEBenchmarkingConfigLoader:
             flush_gpu_cache_size,
             export_trace,
             trace_url,
+            upload_perf_data,
         ).validate()
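
With upload_perf_data now both a config field and an --upload-perf-data flag, every option's help string lives in TBEBenchmarkingHelperText, which fbgemm_gpu.tbe.bench re-exports (see the __init__.py hunk above). A small sketch that simply enumerates those help strings:

from fbgemm_gpu.tbe.bench import TBEBenchmarkingHelperText

# Each member's .value is the help text wired into the matching click option
# (e.g. BENCH_ITERATIONS -> --bench-iterations); printing them gives a quick
# reference for the benchmark CLI without invoking click at all.
for member in TBEBenchmarkingHelperText:
    print(f"{member.name}: {member.value}")
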
fbgemm_gpu/tbe/bench/bench_runs.py
CHANGED
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -8,12 +9,16 @@
 
 import logging
 import statistics
+import threading
 import time
-from
+from subprocess import Popen
+from typing import Callable, Optional
 
 import torch
 
-
+# fmt:skip
+from fbgemm_gpu.tbe.utils import b_indices, TBERequest
+from fbgemm_gpu.tbe.utils.common import get_device
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -40,8 +45,177 @@ def bench_warmup(
         out.backward(grad)
 
 
+def bench_warmup_with_spec(
+    request: TBERequest,
+    warmup_ms: int,
+    warmup_runs: int,
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+) -> None:
+    indices, offsets, weights, batch_size_per_feature_per_rank = request.unpack_4()
+    if warmup_ms:
+        start_time_ms = time.time() * 1000
+        while time.time() * 1000 - start_time_ms < warmup_ms:
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+    else:
+        for _ in range(warmup_runs):
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+
+
+class BMBarrier:
+
+    def __init__(self) -> None:
+        self.bar: Optional[threading.Barrier] = None
+
+    def create_barrier(self, party_size: int) -> None:
+        if self.bar is not None:
+            self.bar.reset()
+            self.bar = None
+        self.bar = torch.multiprocessing.Barrier(party_size)
+
+    def wait(self) -> None:
+        if self.bar is not None:
+            self.bar.wait()
+
+
+# This barrier ensures all CPU TBE workers start the embedding workload
+# together so that we get the most accurate measurement. This needs to be
+# a global variable because it will be shared among worker processes.
+cpu_bm_barrier = BMBarrier()
+
+
+def cpu_tbe_worker(
+    requests_: list[TBERequest],
+    func_: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
+    use_barrier: bool = False,
+) -> float:
+    """
+    Worker function to process CPU TBE workload.
+
+    Args:
+        requests_ (List[TBERequest]): A list of TBERequest objects to be processed. Namely, the dataset.
+        func_ (Callable[[Tensor, Tensor, Optional[Tensor]], Tensor]):
+            The function to process each request, usually the `.forward()` method
+            n the embedding module instance.
+        use_barrier (bool, optional): Whether to use a barrier to synchronize the
+            start of embedding workload. Defaults to False.
+
+    Returns:
+        float: The average runtime per iteration in seconds.
+    """
+    import time
+
+    if use_barrier:
+        cpu_bm_barrier.wait()
+
+    start_time = time.perf_counter()
+    for req in requests_:
+        func_(*(req.unpack_3()))
+    end_time = time.perf_counter()
+
+    return (end_time - start_time) / len(requests_)
+
+
+def benchmark_cpu_requests_mp(
+    requests: list[TBERequest],
+    emb_module: torch.nn.Module,
+    num_warmups: int = 0,
+    num_copies: int = 1,
+    start_script: str = "",
+    end_script: str = "",
+) -> float:
+    """
+    CPU benchmark request handler with multi-processing support
+
+    Args:
+        requests (List[TBERequest]): A list of TBERequest objects to be processed.
+        emb_module (torch.nn.Module): The embedding module to be used for processing requests,
+            for example, an instance of `IntNBitTableBatchedEmbeddingBagsCodegen` module.
+        num_warmups (int, optional): Number of warm-up iterations to perform before benchmarking. Defaults to 0.
+        num_copies (int, optional): Number of parallel copies of the workloads. By `copies`,
+            we mean the number of parallel processes working on the same dataset described in `requests`.
+            Defaults to 1 (which means single threaded). Increasing this will enable the benchmark to use
+            more CPU cores and push higher memory bandwidth.
+        start_script (str, optional): Path to a script to be executed before starting the benchmark.
+            Defaults to empty (not running anything). This can be used to collect perf counters.
+            The script will be terminated upon benchmark finishing.
+        end_script (str, optional): Path to a script to be executed after completing the benchmark.
+            Defaults to empty (not running anything). This can be used to post-process perf counters.
+
+    Returns:
+        float: The average runtime per iteration in seconds.
+    """
+    import os
+
+    strategy = os.environ.get("PYTORCH_SHARE_STRATEGY")
+    current_strategy = torch.multiprocessing.get_sharing_strategy()
+    if strategy is not None and current_strategy != strategy:
+        torch.multiprocessing.set_sharing_strategy(strategy)
+
+    cpu_bm_barrier.create_barrier(num_copies)
+    worker_pool = torch.multiprocessing.Pool(num_copies)
+
+    if num_warmups > 0:
+        asyncres = []
+        for _ in range(num_copies):
+            asyncres.append(
+                worker_pool.apply_async(
+                    cpu_tbe_worker,
+                    args=(
+                        [requests[0]],
+                        emb_module.forward,
+                        False,
+                        num_warmups,
+                    ),
+                )
+            )
+        for res in asyncres:
+            res.wait()
+
+    if start_script:
+        p_start = Popen([start_script, str(num_copies)])
+
+    asyncres = []
+    for _ in range(num_copies):
+        asyncres.append(
+            worker_pool.apply_async(
+                cpu_tbe_worker,
+                args=(
+                    requests,
+                    emb_module.forward,
+                    True,
+                ),
+            )
+        )
+    runtime_per_iter = 0.0
+    for res in asyncres:
+        res.wait()
+        runtime_per_iter += res.get()
+    worker_pool.close()
+    worker_pool.join()
+    worker_pool.terminate()
+
+    if start_script:
+        p_start.terminate()
+
+    if end_script:
+        p_end = Popen([end_script, str(num_copies)])
+        p_end.wait()
+
+    return runtime_per_iter / num_copies
+
+
 def benchmark_cpu_requests(
-    requests:
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
 ) -> float:
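
The docstrings above describe the new multi-process CPU path: each copy replays the whole request list, workers rendezvous on cpu_bm_barrier, and the result is the mean per-iteration time across copies. A minimal sketch of how it might be driven, where requests (a list[TBERequest]) and emb_module (a CPU TBE module such as an IntNBitTableBatchedEmbeddingBagsCodegen instance) are placeholders, not part of this diff:

from fbgemm_gpu.tbe.bench import benchmark_cpu_requests_mp

# 4 worker processes replay the same dataset; each first runs requests[0]
# a few times as warmup, then all copies start together behind the barrier.
avg_s = benchmark_cpu_requests_mp(
    requests,
    emb_module,
    num_warmups=5,
    num_copies=4,
    start_script="",  # optionally a script that starts perf-counter collection
    end_script="",    # optionally a script that post-processes the counters
)
print(f"avg time per iteration: {avg_s * 1.0e6:.1f} us")
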
@@ -59,7 +233,7 @@ def benchmark_cpu_requests(
 
 
 def benchmark_requests( # noqa: C901
-    requests:
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,
@@ -126,7 +300,7 @@ def benchmark_requests( # noqa: C901
                 _ = torch.rand(
                     flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                     dtype=torch.float,
-                    device=
+                    device=get_device(),
                 )
             start_events[it].record()
 
@@ -168,8 +342,123 @@ def benchmark_requests( # noqa: C901
     return median_time if check_median else avg_time
 
 
+def benchmark_requests_with_spec( # noqa: C901
+    requests: list[TBERequest],
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    flush_gpu_cache_size_mb: int = 0,
+    check_median: bool = False,
+    num_warmups: int = 0,
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+    # Used to label benchmark iterations differently in nsys profile result
+    # so that we can compare performance of two different models for example.
+    # If empty string is provided, it won't have any effect.
+    nvtx_range: str = "",
+    # Can be used to clear model's stats after warmup for example.
+    callback_after_warmup: Optional[Callable[[], None]] = None,
+    periodic_logs: bool = False,
+    warmup_ms: Optional[int] = None,
+    iters: int = -1,
+) -> float:
+    times = []
+    # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
+    # for the first kernel if warmup_ms > 0
+    # warmup_ms is prioritized over num_warmups
+
+    if warmup_ms is None:
+        num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
+
+    # warm-up the GPU before profiling
+    bench_warmup_with_spec(
+        requests[0],
+        # pyre-ignore[6]
+        warmup_ms,
+        num_warmups,
+        lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: func(
+            indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
+        ),
+        bwd_only=bwd_only,
+        grad=grad,
+    )
+
+    if callback_after_warmup is not None:
+        callback_after_warmup()
+
+    num_reqs = len(requests)
+    iters = num_reqs if iters == -1 else iters
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+        end_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+    else:
+        start_events = []
+        end_events = []
+
+    for it in range(iters):
+        req = requests[it % num_reqs]
+
+        indices, offsets, weights, batch_size_per_feature_per_rank = req.unpack_4()
+        # logging.info(
+        #     f"[Benchmark Request] batch_size_per_feature_per_rank {batch_size_per_feature_per_rank} {indices.device}"
+        # )
+
+        if bwd_only:
+            # Run forward before profiling if does backward only
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+        start_time = time.time()
+        if torch.cuda.is_available():
+            if flush_gpu_cache_size_mb:
+                _ = torch.rand(
+                    flush_gpu_cache_size_mb * 1024 * 1024 // 4,
+                    dtype=torch.float,
+                    device=get_device(),
+                )
+            start_events[it].record()
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_push(f"{nvtx_range}-{it}")
+
+        if bwd_only:
+            out.backward(grad)
+        else:
+            func(indices, offsets, weights, batch_size_per_feature_per_rank)
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_pop()
+
+        if torch.cuda.is_available():
+            end_events[it].record()
+        else:
+            it_time = time.time() - start_time
+            times.append(it_time)
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        times = [
+            start.elapsed_time(end) * 1.0e-3
+            for start, end in zip(start_events, end_events)
+        ]
+
+    if periodic_logs:
+        for it in range(100, iters + 1, 100):
+            times_ = times[0:it]
+            avg_time = sum(times_) / len(times_) * 1.0e6
+            last_100_avg = sum(times_[-100:]) / 100 * 1.0e6
+            logging.info(
+                f"Iteration [{it}/{len(requests)}]: Last 100: {last_100_avg:.2f} us, Running avg: {avg_time:.2f} us"
+            )
+
+    avg_time = sum(times) / iters
+    median_time = statistics.median(times)
+    return median_time if check_median else avg_time
+
+
 def benchmark_requests_refer(
-    requests:
+    requests: list[TBERequest],
     T: int,
     B: int,
     L: int,
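
benchmark_requests_with_spec mirrors benchmark_requests but passes each request's batch_size_per_feature_per_rank spec through to the functor, prioritizes warmup_ms over num_warmups, and can label iterations with an nvtx range. A hedged sketch of a forward-only measurement, where emb_op and requests are placeholders and the keyword names on emb_op's forward are assumptions rather than taken from this diff:

from fbgemm_gpu.tbe.bench import benchmark_requests_with_spec

# Assumes emb_op is a TBE module whose forward accepts per_sample_weights and
# batch_size_per_feature_per_rank keywords, and requests is a list[TBERequest]
# carrying the VBE spec in its fourth slot (unpack_4).
t_fwd = benchmark_requests_with_spec(
    requests,
    lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: emb_op(
        indices,
        offsets,
        per_sample_weights=per_sample_weights,
        batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
    ),
    num_warmups=10,
    nvtx_range="fwd",  # labels each timed iteration in an nsys profile
    iters=200,         # replay the request list round-robin for 200 iterations
)
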
@@ -208,7 +497,7 @@ def benchmark_requests_refer(
         _ = torch.rand(
             flush_gpu_cache_size_mb * 1024 * 1024 // 4,
             dtype=torch.float,
-            device=
+            device=get_device(),
         )
         torch.cuda.synchronize()
         start_event.record()
@@ -261,12 +550,12 @@
 
 
 def benchmark_pipelined_requests(
-    requests:
+    requests: list[TBERequest],
     func1: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
     func2: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
    flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,
-) ->
+) -> tuple[float, float]:
     torch.cuda.synchronize()
     start_events = [
         (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True))
@@ -282,7 +571,7 @@ def benchmark_pipelined_requests(
             _ = torch.rand(
                 flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                 dtype=torch.float,
-                device=
+                device=get_device(),
             )
             torch.cuda.synchronize()
             start_event[0].record()
@@ -318,10 +607,10 @@
 
 
 def benchmark_vbe(
-    requests:
+    requests: list[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
-) ->
+) -> tuple[float, float]:
     """
     A benchmark function to return the average execution time in seconds of
     forward and backward of VBE kernels.
|