fbgemm-gpu-genai-nightly 2025.12.19 (cp310-cp310-manylinux_2_28_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fbgemm-gpu-genai-nightly might be problematic.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0

fbgemm_gpu/tbe/bench/benchmark_click_interface.py
@@ -0,0 +1,187 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import click
+
+ from fbgemm_gpu.split_embedding_configs import SparseType
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import BoundsCheckMode
+
+ from .bench_config import TBEBenchmarkingHelperText
+ from .tbe_data_config_loader import TBEDataConfigHelperText
+
+
+ class TbeBenchClickInterface:
+     @classmethod
+     # pyre-ignore [2]
+     def common_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--alpha",
+                 default=1.0,
+                 help="The alpha value used for the benchmark, default is 1.0. Recommended value: alpha=1.15 for training and alpha=1.09 for inference",
+             ),
+             click.option(
+                 "--batch-size",
+                 default=512,
+                 help=TBEDataConfigHelperText.TBE_BATCH_SIZE.value + " Default is 512.",
+             ),
+             click.option(
+                 "--weights-precision",
+                 type=SparseType,
+                 default=SparseType.FP32,
+                 help="The precision type for weights, default is FP32.",
+             ),
+             click.option(
+                 "--stoc",
+                 is_flag=True,
+                 default=False,
+                 help="Flag to enable stochastic rounding, default is False.",
+             ),
+             click.option(
+                 "--iters",
+                 default=100,
+                 help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value
+                 + " Default is 100.",
+             ),
+             click.option(
+                 "--warmup-runs",
+                 default=0,
+                 help=(
+                     TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value
+                     + " Default is 0."
+                 ),
+             ),
+             click.option(  # Note: Original default for uvm benchmark is 0.1
+                 "--reuse",
+                 default=0.0,
+                 help="The inter-batch indices reuse rate for the benchmark, default is 0.0.",
+             ),
+             click.option(
+                 "--flush-gpu-cache-size-mb",
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def table_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bag-size",
+                 default=20,
+                 help=TBEDataConfigHelperText.TBE_POOLING_SIZE.value + " Default is 20.",
+             ),
+             click.option(
+                 "--embedding-dim",
+                 default=128,
+                 help=TBEDataConfigHelperText.TBE_EMBEDDING_DIM.value
+                 + " Default is 128.",
+             ),
+             click.option(
+                 "--mixed",
+                 is_flag=True,
+                 default=False,
+                 help=TBEDataConfigHelperText.TBE_MIXED_DIM.value + " Default is False.",
+             ),
+             click.option(
+                 "--num-embeddings",
+                 default=int(1e5),
+                 help=TBEDataConfigHelperText.TBE_NUM_EMBEDDINGS.value
+                 + " Default is 1e5.",
+             ),
+             click.option(
+                 "--num-tables",
+                 default=32,
+                 help=TBEDataConfigHelperText.TBE_NUM_TABLES.value + " Default is 32.",
+             ),
+             click.option(
+                 "--tables",
+                 type=str,
+                 default=None,
+                 help="Comma-separated list of table numbers. Default is None.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def device_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--cache-precision",
+                 type=SparseType,
+                 default=None,
+                 help="The precision type for cache, default is None.",
+             ),
+             click.option(
+                 "--managed",
+                 type=click.Choice(
+                     ["device", "managed", "managed_caching"], case_sensitive=False
+                 ),
+                 default="device",
+                 help="The managed option for embedding location. Choices are 'device', 'managed', or 'managed_caching'. Default is 'device'.",
+             ),
+             click.option(
+                 "--row-wise/--no-row-wise",
+                 default=True,
+                 help="Flag to enable or disable row-wise optimization, default is enabled. Use --no-row-wise to disable.",
+             ),
+             click.option(
+                 "--weighted",
+                 is_flag=True,
+                 default=False,
+                 help=TBEDataConfigHelperText.TBE_WEIGHTED.value + " Default is False.",
+             ),
+             click.option(
+                 "--pooling",
+                 type=click.Choice(["sum", "mean", "none"], case_sensitive=False),
+                 default="sum",
+                 help="The pooling method to use. Choices are 'sum', 'mean', or 'none'. Default is 'sum'.",
+             ),
+             click.option(
+                 "--bounds-check-mode",
+                 type=int,
+                 default=BoundsCheckMode.NONE.value,
+                 help="The bounds check mode, default is NONE. Options are: FATAL (0) - Raise an exception (CPU) or device-side assert (CUDA), WARNING (1) - Log the first out-of-bounds instance per kernel, and set to zero, IGNORE (2) - Set to zero, NONE (3) - No bounds checks, V2_IGNORE (4) - IGNORE with V2 enabled, V2_WARNING (5) - WARNING with V2 enabled, V2_FATAL (6) - FATAL with V2 enabled.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def vbe_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bag-size-list",
+                 type=str,
+                 default="20",
+                 help="A comma-separated list of bag sizes for each table, default is '20'.",
+             ),
+             click.option(
+                 "--bag-size-sigma-list",
+                 type=str,
+                 default="None",
+                 help="A comma-separated list of bag size standard deviations for generating bag sizes (one std per table). If set, the benchmark will treat --bag-size-list as a list of bag size means. Default is 'None'.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
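
For orientation, a minimal sketch of how these option groups might be stacked onto a benchmark command. The `device_bench` command name and its body are illustrative assumptions, not code from the wheel:

import click

from fbgemm_gpu.tbe.bench.benchmark_click_interface import TbeBenchClickInterface


@click.command()
@TbeBenchClickInterface.common_options
@TbeBenchClickInterface.table_options
@TbeBenchClickInterface.device_options
def device_bench(**kwargs) -> None:
    # Hypothetical command: each option group above injects its flags, so kwargs
    # carries alpha, batch_size, bag_size, managed, pooling, bounds_check_mode, etc.
    click.echo(f"batch_size={kwargs['batch_size']}, num_tables={kwargs['num_tables']}")


if __name__ == "__main__":
    device_bench()

Keeping `@click.command()` outermost means the grouped options are collected onto the final Command object, which is the same effect the `for option in reversed(options)` loop produces inside each classmethod.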

fbgemm_gpu/tbe/bench/eeg_cli.py
@@ -0,0 +1,137 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+
+ import click
+ import torch
+
+ from fbgemm_gpu.tbe.bench import IndicesParams
+
+
+ @click.group()
+ def cli() -> None:
+     pass
+
+
+ @cli.command()
+ @click.option("--indices", required=True, help="Indices tensor file (*.pt)")
+ def estimate(indices: str) -> None:
+     """
+     Estimate the distribution of indices given a tensor file
+
+     Parameters:
+         indices (str): Indices tensor file (*.pt)
+
+     Returns:
+         None
+
+     Example:
+         estimate --indices="indices.pt"
+     """
+
+     indices = torch.load(indices)
+     heavy_hitters, q, s, max_index, num_indices = (
+         torch.ops.fbgemm.tbe_estimate_indices_distribution(indices)
+     )
+
+     params = IndicesParams(
+         heavy_hitters=heavy_hitters, zipf_q=q, zipf_s=s, index_dtype=indices.dtype
+     )
+
+     print(params.json(format=True), f"max_index={max_index}\nnum_indices={num_indices}")
+
+
+ @cli.command()
+ @click.option(
+     "--hitters",
+     type=str,
+     default="",
+     help="TBE heavy hitter indices (comma-delimited list of floats)",
+ )
+ @click.option(
+     "--zipf",
+     type=(float, float),
+     default=(0.1, 0.1),
+     help="Zipf distribution parameters for indices generation (q, s)",
+ )
+ @click.option(
+     "-e",
+     "--max-index",
+     type=int,
+     default=20,
+     help="Max index value (< E)",
+ )
+ @click.option(
+     "-n",
+     "--num-indices",
+     type=int,
+     default=20,
+     help="Target number of indices to generate",
+ )
+ @click.option(
+     "--output",
+     type=str,
+     required=True,
+     help="Tensor filepath (*.pt) to save the generated indices",
+ )
+ def generate(
+     hitters: str,
+     zipf: tuple[float, float],
+     max_index: int,
+     num_indices: int,
+     output: str,
+ ) -> None:
+     """
+     Generates a tensor of indices given the indices distribution parameters
+
+     Parameters:
+         hitters (str): heavy hitter indices (comma-delimited list of floats)
+
+         zipf (Tuple[float, float]): Zipf distribution parameters for indices generation (q, s)
+
+         max_index (int): Max index value (E)
+
+         num_indices (int): Target number of indices to generate
+
+         output (str): Tensor filepath (*.pt) to save the generated indices
+
+     Returns:
+         None
+
+     Example:
+         generate --hitters="2,4,6" --zipf="1.1,1.1" --max-index=10 --num-indices=100 --output="generated_indices.pt"
+     """
+     assert max_index > 0, "Max index value (E) must be greater than 0"
+     assert num_indices > 0, "Target number of indices must be greater than 0"
+     assert zipf[0] > 0, "Zipf parameter q must be greater than 0.0"
+     assert zipf[1] > 0, "Zipf parameter s must be greater than 0.0"
+     assert output != "", "Output file path must be provided"
+
+     try:
+         _hitters: list[float] = (
+             [float(x) for x in hitters.split(",")] if hitters else []
+         )
+     except Exception as e:
+         raise AssertionError(
+             f'Error: {e}. Please ensure to use comma-delimited list of floats, e.g., --hitters="2,4,6". '
+         )
+
+     heavy_hitters = torch.tensor(_hitters)
+     assert heavy_hitters.numel() <= 20, "The number of heavy hitters should be <= 20"
+
+     indices = torch.ops.fbgemm.tbe_generate_indices_from_distribution(
+         heavy_hitters, zipf[0], zipf[1], max_index, num_indices
+     )
+
+     print(f"Generated indices: {indices}")
+     torch.save(indices, output)
+     print(f"Saved indices to: {output}")
+
+
+ if __name__ == "__main__":
+     cli()
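
As a rough illustration (not shipped in the package), the two subcommands can be driven in-process with click's test runner; the output file name is a placeholder, and the fbgemm ops must already be loaded for the underlying `torch.ops.fbgemm` calls to resolve:

from click.testing import CliRunner

from fbgemm_gpu.tbe.bench.eeg_cli import cli

runner = CliRunner()

# Generate synthetic indices (Zipf-distributed plus a few heavy hitters) ...
gen = runner.invoke(
    cli,
    [
        "generate",
        "--hitters", "2,4,6",
        "--zipf", "1.1", "1.1",
        "--max-index", "10",
        "--num-indices", "100",
        "--output", "generated_indices.pt",
    ],
)
assert gen.exit_code == 0, gen.output

# ... then estimate the distribution parameters back from the saved tensor.
est = runner.invoke(cli, ["estimate", "--indices", "generated_indices.pt"])
print(est.output)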

fbgemm_gpu/tbe/bench/embedding_ops_common_config.py
@@ -0,0 +1,149 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import dataclasses
+ from typing import Any, Optional
+
+ import click
+ import torch
+
+ from fbgemm_gpu.split_embedding_configs import SparseType
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+     BoundsCheckMode,
+     EmbeddingLocation,
+     PoolingMode,
+ )
+
+
+ @dataclasses.dataclass(frozen=True)
+ class EmbeddingOpsCommonConfig:
+     # Precision of the embedding weights
+     weights_dtype: SparseType
+     # Precision of the embedding cache
+     cache_dtype: Optional[SparseType]
+     # Precision of the embedding output
+     output_dtype: SparseType
+     # Enable stochastic rounding when performing quantization
+     stochastic_rounding: bool
+     # Pooling operation to perform
+     pooling_mode: PoolingMode
+     # Use host-mapped UVM buffers
+     uvm_host_mapped: bool
+     # Memory location of the embeddings
+     embedding_location: EmbeddingLocation
+     # Bounds check mode
+     bounds_check_mode: BoundsCheckMode
+
+     # pyre-ignore [3]
+     def validate(self):
+         return self
+
+     def split_args(self) -> dict[str, Any]:
+         return {
+             "weights_precision": self.weights_dtype,
+             "stochastic_rounding": self.stochastic_rounding,
+             "output_dtype": self.output_dtype,
+             "pooling_mode": self.pooling_mode,
+             "bounds_check_mode": self.bounds_check_mode,
+             "uvm_host_mapped": self.uvm_host_mapped,
+         }
+
+
+ class EmbeddingOpsCommonConfigLoader:
+     @classmethod
+     # pyre-ignore [2]
+     def options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--emb-weights-dtype",
+                 type=SparseType,
+                 default=SparseType.FP32,
+                 help="Precision of the embedding weights",
+             ),
+             click.option(
+                 "--emb-cache-dtype",
+                 type=SparseType,
+                 default=None,
+                 help="Precision of the embedding cache",
+             ),
+             click.option(
+                 "--emb-output-dtype",
+                 type=SparseType,
+                 default=SparseType.FP32,
+                 help="Precision of the embedding output",
+             ),
+             click.option(
+                 "--emb-stochastic-rounding",
+                 is_flag=True,
+                 default=False,
+                 help="Enable stochastic rounding when performing quantization",
+             ),
+             click.option(
+                 "--emb-pooling-mode",
+                 type=click.Choice(["sum", "mean", "none"], case_sensitive=False),
+                 default="sum",
+                 help="Pooling operation to perform",
+             ),
+             click.option(
+                 "--emb-uvm-host-mapped",
+                 is_flag=True,
+                 default=False,
+                 help="Use host-mapped UVM buffers",
+             ),
+             click.option(
+                 "--emb-location",
+                 default="device",
+                 type=click.Choice(EmbeddingLocation.str_values(), case_sensitive=False),
+                 help="Memory location of the embeddings",
+             ),
+             click.option(
+                 "--emb-bounds-check",
+                 type=int,
+                 default=BoundsCheckMode.WARNING.value,
+                 help="Bounds check mode. "
+                 f"Available modes: FATAL={BoundsCheckMode.FATAL.value}, "
+                 f"WARNING={BoundsCheckMode.WARNING.value}, "
+                 f"IGNORE={BoundsCheckMode.IGNORE.value}, "
+                 f"NONE={BoundsCheckMode.NONE.value}",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     def load(cls, context: click.Context) -> EmbeddingOpsCommonConfig:
+         params = context.params
+
+         weights_dtype = params["emb_weights_dtype"]
+         cache_dtype = params["emb_cache_dtype"]
+         output_dtype = params["emb_output_dtype"]
+         stochastic_rounding = params["emb_stochastic_rounding"]
+         pooling_mode = PoolingMode.from_str(str(params["emb_pooling_mode"]))
+         uvm_host_mapped = params["emb_uvm_host_mapped"]
+         bounds_check_mode = BoundsCheckMode(params["emb_bounds_check"])
+
+         embedding_location = EmbeddingLocation.from_str(str(params["emb_location"]))
+         if (
+             embedding_location is EmbeddingLocation.DEVICE
+             and not torch.cuda.is_available()
+         ):
+             embedding_location = EmbeddingLocation.HOST
+
+         return EmbeddingOpsCommonConfig(
+             weights_dtype,
+             cache_dtype,
+             output_dtype,
+             stochastic_rounding,
+             pooling_mode,
+             uvm_host_mapped,
+             embedding_location,
+             bounds_check_mode,
+         ).validate()
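
A minimal sketch of how the loader might be wired into a command of its own; the `show_config` command is a hypothetical name used only for illustration:

import click

from fbgemm_gpu.tbe.bench.embedding_ops_common_config import (
    EmbeddingOpsCommonConfigLoader,
)


@click.command()
@EmbeddingOpsCommonConfigLoader.options
@click.pass_context
def show_config(ctx: click.Context, **_kwargs) -> None:
    # Reparse the --emb-* flags from ctx.params into a typed config object,
    # then print the kwargs it would forward to a split-TBE constructor.
    config = EmbeddingOpsCommonConfigLoader.load(ctx)
    click.echo(config.split_args())


if __name__ == "__main__":
    show_config()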

fbgemm_gpu/tbe/bench/eval_compression.py
@@ -0,0 +1,119 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+
+ import logging
+ import statistics
+ from dataclasses import dataclass
+ from typing import Callable
+
+ import torch
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @dataclass
+ class EvalCompressionBenchmarkOutput:
+     avg: float
+     fwd: float
+     bwd: float
+     compressed_avg: float
+     compressed_fwd: float
+     reindex: float
+     compressed_bwd: float
+
+
+ def benchmark_eval_compression(
+     baseline_requests: list[tuple[torch.Tensor, torch.Tensor]],
+     compressed_requests: list[tuple[torch.Tensor, torch.Tensor]],
+     baseline_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+     compressed_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+     reindex: torch.Tensor,
+     embedding_dim: int,
+ ) -> EvalCompressionBenchmarkOutput:
+     times = []
+     fwd_times = []
+     bwd_times = []
+     torch.cuda.synchronize()
+     start_event = torch.cuda.Event(enable_timing=True)
+     end_event = torch.cuda.Event(enable_timing=True)
+     for indices, offsets in baseline_requests:
+         time = 0.0
+         start_event.record()
+         # forward
+         out = baseline_func(indices, offsets)
+         end_event.record()
+         torch.cuda.synchronize()
+         it_time = start_event.elapsed_time(end_event) * 1.0e-3
+         fwd_times.append(it_time)
+         time += it_time
+
+         grad = torch.rand_like(out)
+         start_event.record()
+         # backward
+         out.backward(grad)
+         end_event.record()
+         torch.cuda.synchronize()
+         it_time = start_event.elapsed_time(end_event) * 1.0e-3
+         bwd_times.append(it_time)
+         time += it_time
+         times.append(time)
+
+     avg = statistics.median(times)
+     fwd = statistics.median(fwd_times)
+     bwd = statistics.median(bwd_times)
+
+     times.clear()
+     fwd_times.clear()
+     bwd_times.clear()
+     reindex_times = []
+
+     torch.cuda.synchronize()
+     start_event = torch.cuda.Event(enable_timing=True)
+     end_event = torch.cuda.Event(enable_timing=True)
+
+     for indices, offsets in compressed_requests:
+         time = 0.0
+         start_event.record()
+         # forward
+         out = compressed_func(indices, offsets)
+         end_event.record()
+         torch.cuda.synchronize()
+         it_time = start_event.elapsed_time(end_event) * 1.0e-3
+         fwd_times.append(it_time)
+         time += it_time
+
+         start_event.record()
+         # reindex
+         out = out.reshape(-1, embedding_dim)
+         out = torch.ops.fbgemm.index_select_dim0(out, reindex)
+         end_event.record()
+         torch.cuda.synchronize()
+         it_time = start_event.elapsed_time(end_event) * 1.0e-3
+         reindex_times.append(it_time)
+         time += it_time
+
+         grad = torch.rand_like(out)
+         start_event.record()
+         # backward
+         out.backward(grad)
+         end_event.record()
+         torch.cuda.synchronize()
+         it_time = start_event.elapsed_time(end_event) * 1.0e-3
+         bwd_times.append(it_time)
+         time += it_time
+         times.append(time)
+
+     compressed_avg = statistics.median(times)
+     compressed_fwd = statistics.median(fwd_times)
+     reindex = statistics.median(reindex_times)
+     compressed_bwd = statistics.median(bwd_times)
+
+     return EvalCompressionBenchmarkOutput(
+         avg, fwd, bwd, compressed_avg, compressed_fwd, reindex, compressed_bwd
+     )
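
For context, a hedged sketch of one possible call; the embedding bags, request shapes, and identity reindex below are invented for illustration and assume a CUDA device with the fbgemm ops loaded:

import torch

from fbgemm_gpu.tbe.bench.eval_compression import benchmark_eval_compression

E, D, B, L = 1000, 128, 64, 8  # hypothetical rows, dim, bags per request, bag size
baseline = torch.nn.EmbeddingBag(E, D, mode="sum").cuda()
compressed = torch.nn.EmbeddingBag(E // 2, D, mode="sum").cuda()


def make_requests(num_rows: int) -> list[tuple[torch.Tensor, torch.Tensor]]:
    # Each request is an (indices, offsets) pair, as consumed by the loops above.
    return [
        (
            torch.randint(num_rows, (B * L,), device="cuda"),
            torch.arange(0, B * L, L, device="cuda"),
        )
        for _ in range(10)
    ]


result = benchmark_eval_compression(
    baseline_requests=make_requests(E),
    compressed_requests=make_requests(E // 2),
    baseline_func=baseline,
    compressed_func=compressed,
    reindex=torch.arange(B, device="cuda"),  # identity row mapping for the sketch
    embedding_dim=D,
)
print(result)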

fbgemm_gpu/tbe/bench/reporter.py
@@ -0,0 +1,35 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+
+ import logging
+ from dataclasses import dataclass
+
+ haveAIBench = False
+ try:
+     from aibench_observer.utils.observer import emitMetric
+
+     haveAIBench = True
+ except Exception:
+     haveAIBench = False
+
+
+ @dataclass
+ class BenchmarkReporter:
+     report: bool
+     logger: logging.Logger = logging.getLogger()
+
+     # pyre-ignore[3]
+     def __post_init__(self):
+         self.logger.setLevel(logging.INFO)
+
+     # pyre-ignore[2]
+     def emit_metric(self, **kwargs) -> None:
+         if self.report and haveAIBench:
+             self.logger.info(emitMetric(**kwargs))
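
A small usage sketch; the metric keyword arguments are placeholders, since emitMetric's real signature lives in the internal aibench_observer package, and emission silently no-ops when that package is not installed:

from fbgemm_gpu.tbe.bench.reporter import BenchmarkReporter

reporter = BenchmarkReporter(report=True)

# Forwards the kwargs verbatim to emitMetric only when report=True and
# aibench_observer is importable; otherwise this call does nothing.
reporter.emit_metric(
    type="tbe_bench",         # placeholder kwarg
    metric="bandwidth_gbps",  # placeholder kwarg
    value=123.4,              # placeholder kwarg
)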