fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py

@@ -31,15 +31,18 @@ except Exception:
 
 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
 
 
 def generate_vbe_metadata(
     offsets: Tensor,
-    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    batch_size_per_feature_per_rank: Optional[list[list[int]]],
     pooling_mode: PoolingMode,
     feature_dims_cpu: Tensor,
     device: torch.device,
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
 ) -> invokers.lookup_args.VBEMetadata:
     """
     Generate VBE metadata based on batch_size_per_feature_per_rank.

@@ -133,6 +136,8 @@ def generate_vbe_metadata(
             max_B_feature_rank=max_B_feature_rank,
             # pyre-ignore
             output_size=output_size,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
     else:
         vbe_metadata = invokers.lookup_args.VBEMetadata(

@@ -142,5 +147,43 @@ def generate_vbe_metadata(
             max_B=-1,
             max_B_feature_rank=-1,
             output_size=-1,
+            vbe_output=None,
+            vbe_output_offsets=None,
         )
     return vbe_metadata
+
+
+def check_allocated_vbe_output(
+    output_dtype: int,
+    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
+) -> None:
+    assert (
+        batch_size_per_feature_per_rank is not None
+    ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
+    assert (
+        vbe_output is not None
+    ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
+    assert (
+        vbe_output_offsets is not None
+    ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
+    num_features = len(batch_size_per_feature_per_rank)
+    num_ranks = len(batch_size_per_feature_per_rank[0])
+    assert vbe_output_offsets.shape == torch.Size(
+        [num_ranks, num_features]
+    ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
+    assert (
+        vbe_output.dim() == 1
+    ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
+    assert (
+        vbe_output_offsets.device == vbe_output.device
+    ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
+    _output_dtype = sparse_type_int_to_dtype(output_dtype)
+    assert (
+        vbe_output.dtype == _output_dtype
+    ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
+    assert (
+        vbe_output_offsets.is_contiguous()
+    ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
+    assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
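The new check_allocated_vbe_output helper only validates caller-provided buffers: the offsets tensor must have shape [num_ranks, num_features], the output must be a 1-D contiguous tensor in the TBE output dtype, and both must share a device. Below is a minimal sketch of an allocation that satisfies those assertions; the batch sizes, buffer length, and FP32 dtype are illustrative assumptions, not values taken from the package.

```python
import torch

# Illustrative only: buffers shaped to pass the checks in check_allocated_vbe_output.
batch_size_per_feature_per_rank = [[2, 3], [4, 1]]   # 2 features x 2 ranks (made up)
num_features = len(batch_size_per_feature_per_rank)  # 2
num_ranks = len(batch_size_per_feature_per_rank[0])  # 2

# 1-D, contiguous output buffer in the (assumed) FP32 TBE output dtype.
vbe_output = torch.zeros(64, dtype=torch.float32)

# Per-(rank, feature) offsets into vbe_output; the contents are placeholders here,
# since the checker only inspects shape, device, dtype, and contiguity.
vbe_output_offsets = torch.zeros(num_ranks, num_features, dtype=torch.long)

# Same properties the helper asserts:
assert vbe_output_offsets.shape == torch.Size([num_ranks, num_features])
assert vbe_output.dim() == 1 and vbe_output.is_contiguous()
assert vbe_output_offsets.is_contiguous()
assert vbe_output_offsets.device == vbe_output.device
```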
fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py

@@ -16,7 +16,6 @@ from fbgemm_gpu.tbe.ssd import ( # noqa: F401
     SSDTableBatchedEmbeddingBags,  # noqa: F401
 )
 
-
 warnings.warn(  # noqa: B028
     f"""\033[93m
     The Python module {__name__} is now DEPRECATED and will be removed in the
fbgemm_gpu/tbe/bench/__init__.py

@@ -21,6 +21,7 @@ from .bench_runs import ( # noqa F401
     benchmark_pipelined_requests,
     benchmark_requests,
     benchmark_requests_refer,
+    benchmark_requests_with_spec,
     benchmark_vbe,
 )
 from .benchmark_click_interface import TbeBenchClickInterface  # noqa F401

@@ -40,7 +41,11 @@ from .tbe_data_config_param_models import ( # noqa F401
     IndicesParams,
     PoolingParams,
 )
-from .utils import fill_random_scale_bias  # noqa F401
+from .utils import (  # noqa F401
+    check_oom,
+    fill_random_scale_bias,
+    generate_merged_output_and_offsets,
+)
 
 try:
     torch.ops.load_library(
fbgemm_gpu/tbe/bench/bench_config.py

@@ -10,7 +10,7 @@
 import dataclasses
 import json
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 import click
 

@@ -29,10 +29,12 @@ class TBEBenchmarkingConfig:
     export_trace: bool
     # The path for exporting the trace
     trace_url: Optional[str]
+    # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
+    upload_perf_data: bool
 
     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data: Dict[str, Any]):
+    def from_dict(cls, data: dict[str, Any]):
         return cls(**data)
 
     @classmethod

@@ -40,7 +42,7 @@ class TBEBenchmarkingConfig:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))
 
-    def dict(self) -> Dict[str, Any]:
+    def dict(self) -> dict[str, Any]:
         return dataclasses.asdict(self)
 
     def json(self, format: bool = False) -> str:

@@ -71,6 +73,7 @@ class TBEBenchmarkingHelperText(Enum):
         "If set, trace will be exported to the path specified in trace url"
     )
     BENCH_TRACE_URL = "The path for exporting the trace"
+    BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
 
 
 class TBEBenchmarkingConfigLoader:

@@ -115,6 +118,12 @@ class TBEBenchmarkingConfigLoader:
             default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
             help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
         ),
+        click.option(
+            "--upload-perf-data",
+            is_flag=True,
+            default=False,
+            help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
+        ),
     ]
 
     for option in reversed(options):

@@ -131,6 +140,7 @@ class TBEBenchmarkingConfigLoader:
         flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
         export_trace = params["bench_export_trace"]
         trace_url = params["bench_trace_url"]
+        upload_perf_data = params["upload_perf_data"]
 
         # Default the number of TBE requests to number of iterations specified
         num_requests = iterations if num_requests == -1 else num_requests

@@ -142,4 +152,5 @@ class TBEBenchmarkingConfigLoader:
             flush_gpu_cache_size,
             export_trace,
             trace_url,
+            upload_perf_data,
         ).validate()
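The new upload_perf_data field is surfaced as a boolean click flag that only has an effect when trace export is also enabled. The snippet below is a standalone illustration of that flag pattern, not the package's actual CLI; the command name and option wiring are assumptions made for the example.

```python
import click


@click.command()
@click.option("--export-trace", is_flag=True, default=False)
@click.option("--upload-perf-data", is_flag=True, default=False)
def bench(export_trace: bool, upload_perf_data: bool) -> None:
    # Mirrors the loader behavior sketched above: uploading perf data is only
    # meaningful when a trace is actually exported.
    effective_upload = export_trace and upload_perf_data
    click.echo(f"export_trace={export_trace}, upload_perf_data={effective_upload}")


if __name__ == "__main__":
    bench()
```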
fbgemm_gpu/tbe/bench/bench_runs.py

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #

@@ -11,12 +12,13 @@ import statistics
 import threading
 import time
 from subprocess import Popen
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, Optional
 
 import torch
 
+# fmt:skip
 from fbgemm_gpu.tbe.utils import b_indices, TBERequest
-
+from fbgemm_gpu.tbe.utils.common import get_device
 
 logging.basicConfig(level=logging.DEBUG)
 

@@ -43,6 +45,31 @@ def bench_warmup(
             out.backward(grad)
 
 
+def bench_warmup_with_spec(
+    request: TBERequest,
+    warmup_ms: int,
+    warmup_runs: int,
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+) -> None:
+    indices, offsets, weights, batch_size_per_feature_per_rank = request.unpack_4()
+    if warmup_ms:
+        start_time_ms = time.time() * 1000
+        while time.time() * 1000 - start_time_ms < warmup_ms:
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+    else:
+        for _ in range(warmup_runs):
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+
+
 class BMBarrier:
 
     def __init__(self) -> None:

@@ -66,7 +93,7 @@ cpu_bm_barrier = BMBarrier()
 
 
 def cpu_tbe_worker(
-    requests_: List[TBERequest],
+    requests_: list[TBERequest],
     func_: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     use_barrier: bool = False,
 ) -> float:

@@ -98,7 +125,7 @@ def cpu_tbe_worker(
 
 
 def benchmark_cpu_requests_mp(
-    requests: List[TBERequest],
+    requests: list[TBERequest],
     emb_module: torch.nn.Module,
     num_warmups: int = 0,
     num_copies: int = 1,

@@ -127,6 +154,13 @@ def benchmark_cpu_requests_mp(
         float: The average runtime per iteration in seconds.
 
     """
+    import os
+
+    strategy = os.environ.get("PYTORCH_SHARE_STRATEGY")
+    current_strategy = torch.multiprocessing.get_sharing_strategy()
+    if strategy is not None and current_strategy != strategy:
+        torch.multiprocessing.set_sharing_strategy(strategy)
+
     cpu_bm_barrier.create_barrier(num_copies)
     worker_pool = torch.multiprocessing.Pool(num_copies)
 
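The new block at the top of benchmark_cpu_requests_mp lets the PYTORCH_SHARE_STRATEGY environment variable override torch.multiprocessing's tensor-sharing strategy before the worker pool is created. A minimal standalone sketch of the same hook follows; the env-var name comes from the diff, and "file_system" is one of the strategies PyTorch supports on Linux.

```python
import os

import torch

# Illustrative: ask for the alternate sharing strategy before spawning CPU workers.
os.environ["PYTORCH_SHARE_STRATEGY"] = "file_system"

strategy = os.environ.get("PYTORCH_SHARE_STRATEGY")
if strategy is not None and strategy != torch.multiprocessing.get_sharing_strategy():
    torch.multiprocessing.set_sharing_strategy(strategy)

print(torch.multiprocessing.get_sharing_strategy())  # file_system
```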
@@ -181,7 +215,7 @@
 
 
 def benchmark_cpu_requests(
-    requests: List[TBERequest],
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
 ) -> float:

@@ -199,7 +233,7 @@
 
 
 def benchmark_requests(  # noqa: C901
-    requests: List[TBERequest],
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,

@@ -266,7 +300,7 @@ def benchmark_requests( # noqa: C901
                 _ = torch.rand(
                     flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                     dtype=torch.float,
-                    device="cuda",
+                    device=get_device(),
                 )
             start_events[it].record()
 
@@ -308,8 +342,123 @@ def benchmark_requests( # noqa: C901
     return median_time if check_median else avg_time
 
 
+def benchmark_requests_with_spec(  # noqa: C901
+    requests: list[TBERequest],
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    flush_gpu_cache_size_mb: int = 0,
+    check_median: bool = False,
+    num_warmups: int = 0,
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+    # Used to label benchmark iterations differently in nsys profile result
+    # so that we can compare performance of two different models for example.
+    # If empty string is provided, it won't have any effect.
+    nvtx_range: str = "",
+    # Can be used to clear model's stats after warmup for example.
+    callback_after_warmup: Optional[Callable[[], None]] = None,
+    periodic_logs: bool = False,
+    warmup_ms: Optional[int] = None,
+    iters: int = -1,
+) -> float:
+    times = []
+    # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
+    # for the first kernel if warmup_ms > 0
+    # warmup_ms is prioritized over num_warmups
+
+    if warmup_ms is None:
+        num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
+
+    # warm-up the GPU before profiling
+    bench_warmup_with_spec(
+        requests[0],
+        # pyre-ignore[6]
+        warmup_ms,
+        num_warmups,
+        lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: func(
+            indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
+        ),
+        bwd_only=bwd_only,
+        grad=grad,
+    )
+
+    if callback_after_warmup is not None:
+        callback_after_warmup()
+
+    num_reqs = len(requests)
+    iters = num_reqs if iters == -1 else iters
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+        end_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+    else:
+        start_events = []
+        end_events = []
+
+    for it in range(iters):
+        req = requests[it % num_reqs]
+
+        indices, offsets, weights, batch_size_per_feature_per_rank = req.unpack_4()
+        # logging.info(
+        #     f"[Benchmark Request] batch_size_per_feature_per_rank {batch_size_per_feature_per_rank} {indices.device}"
+        # )
+
+        if bwd_only:
+            # Run forward before profiling if does backward only
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+        start_time = time.time()
+        if torch.cuda.is_available():
+            if flush_gpu_cache_size_mb:
+                _ = torch.rand(
+                    flush_gpu_cache_size_mb * 1024 * 1024 // 4,
+                    dtype=torch.float,
+                    device=get_device(),
+                )
+            start_events[it].record()
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_push(f"{nvtx_range}-{it}")
+
+        if bwd_only:
+            out.backward(grad)
+        else:
+            func(indices, offsets, weights, batch_size_per_feature_per_rank)
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_pop()
+
+        if torch.cuda.is_available():
+            end_events[it].record()
+        else:
+            it_time = time.time() - start_time
+            times.append(it_time)
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        times = [
+            start.elapsed_time(end) * 1.0e-3
+            for start, end in zip(start_events, end_events)
+        ]
+
+    if periodic_logs:
+        for it in range(100, iters + 1, 100):
+            times_ = times[0:it]
+            avg_time = sum(times_) / len(times_) * 1.0e6
+            last_100_avg = sum(times_[-100:]) / 100 * 1.0e6
+            logging.info(
+                f"Iteration [{it}/{len(requests)}]: Last 100: {last_100_avg:.2f} us, Running avg: {avg_time:.2f} us"
+            )
+
+    avg_time = sum(times) / iters
+    median_time = statistics.median(times)
+    return median_time if check_median else avg_time
+
+
 def benchmark_requests_refer(
-    requests: List[TBERequest],
+    requests: list[TBERequest],
     T: int,
     B: int,
     L: int,
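benchmark_requests_with_spec mirrors benchmark_requests but threads batch_size_per_feature_per_rank through to the benchmarked callable via TBERequest.unpack_4(). Below is a toy, CPU-only sketch of the calling convention; the dummy tensors, the positional TBERequest constructor, and the stand-in callable are assumptions for illustration (a real caller would pass a TBE module's forward).

```python
import torch

# Import paths assumed from the hunks above: __init__.py re-exports the function
# from fbgemm_gpu.tbe.bench, and TBERequest lives in fbgemm_gpu.tbe.utils.
from fbgemm_gpu.tbe.bench import benchmark_requests_with_spec
from fbgemm_gpu.tbe.utils import TBERequest

# Two tiny requests; fields beyond indices/offsets are left at their defaults.
requests = [
    TBERequest(torch.randint(0, 100, (32,)), torch.arange(0, 33, 4))
    for _ in range(2)
]


# Stand-in for a TBE forward: accepts the extra batch-size spec and returns a tensor.
def toy_func(indices, offsets, per_sample_weights, batch_size_per_feature_per_rank):
    return indices.float().sum().unsqueeze(0)


avg_s = benchmark_requests_with_spec(requests, toy_func, num_warmups=1, iters=4)
print(f"avg iteration time: {avg_s * 1e6:.1f} us")
```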
@@ -348,7 +497,7 @@ def benchmark_requests_refer(
             _ = torch.rand(
                 flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                 dtype=torch.float,
-                device="cuda",
+                device=get_device(),
             )
         torch.cuda.synchronize()
         start_event.record()

@@ -401,12 +550,12 @@ def benchmark_requests_refer(
 
 
 def benchmark_pipelined_requests(
-    requests: List[TBERequest],
+    requests: list[TBERequest],
     func1: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
     func2: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
     flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,
-) -> Tuple[float, float]:
+) -> tuple[float, float]:
     torch.cuda.synchronize()
     start_events = [
         (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True))

@@ -422,7 +571,7 @@ def benchmark_pipelined_requests(
             _ = torch.rand(
                 flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                 dtype=torch.float,
-                device="cuda",
+                device=get_device(),
             )
         torch.cuda.synchronize()
         start_event[0].record()

@@ -458,10 +607,10 @@ def benchmark_pipelined_requests(
 
 
 def benchmark_vbe(
-    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
+    requests: list[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
-) -> Tuple[float, float]:
+) -> tuple[float, float]:
     """
     A benchmark function to return the average execution time in seconds of
     forward and backward of VBE kernels.
fbgemm_gpu/tbe/bench/benchmark_click_interface.py

@@ -8,11 +8,14 @@
 # pyre-strict
 
 import click
+
+# fmt:skip
 from fbgemm_gpu.split_embedding_configs import SparseType
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import BoundsCheckMode
 
-from .bench_config import TBEBenchmarkingHelperText
-from .tbe_data_config_loader import TBEDataConfigHelperText
+# fmt:skip
+from .bench_config import TBEBenchmarkingHelperText  # usort:skip
+from .tbe_data_config_loader import TBEDataConfigHelperText  # usort:skip
 
 
 class TbeBenchClickInterface:
fbgemm_gpu/tbe/bench/eeg_cli.py

@@ -6,11 +6,11 @@
 
 # pyre-strict
 
-from typing import List, Tuple
 
 import click
 import torch
 
+# fmt:skip
 from fbgemm_gpu.tbe.bench import IndicesParams
 
 

@@ -82,7 +82,7 @@ def estimate(indices: str) -> None:
 )
 def generate(
     hitters: str,
-    zipf: Tuple[float, float],
+    zipf: tuple[float, float],
     max_index: int,
     num_indices: int,
     output: str,

@@ -114,7 +114,7 @@ def generate(
     assert output != "", "Output file path must be provided"
 
     try:
-        _hitters: List[float] = (
+        _hitters: list[float] = (
             [float(x) for x in hitters.split(",")] if hitters else []
         )
     except Exception as e:
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py

@@ -8,11 +8,12 @@
 # pyre-strict
 
 import dataclasses
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 import click
 import torch
 
+# fmt:skip
 from fbgemm_gpu.split_embedding_configs import SparseType
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     BoundsCheckMode,

@@ -44,7 +45,7 @@ class EmbeddingOpsCommonConfig:
     def validate(self):
         return self
 
-    def split_args(self) -> Dict[str, Any]:
+    def split_args(self) -> dict[str, Any]:
         return {
             "weights_precision": self.weights_dtype,
             "stochastic_rounding": self.stochastic_rounding,
fbgemm_gpu/tbe/bench/eval_compression.py

@@ -10,7 +10,7 @@
 import logging
 import statistics
 from dataclasses import dataclass
-from typing import Callable, List, Tuple
+from typing import Callable
 
 import torch
 

@@ -29,8 +29,8 @@ class EvalCompressionBenchmarkOutput:
 
 
 def benchmark_eval_compression(
-    baseline_requests: List[Tuple[torch.Tensor, torch.Tensor]],
-    compressed_requests: List[Tuple[torch.Tensor, torch.Tensor]],
+    baseline_requests: list[tuple[torch.Tensor, torch.Tensor]],
+    compressed_requests: list[tuple[torch.Tensor, torch.Tensor]],
     baseline_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
     compressed_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
     reindex: torch.Tensor,