fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/tbe/bench/benchmark_click_interface.py (new file)
@@ -0,0 +1,189 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import click
+
+ # fmt:skip
+ from fbgemm_gpu.split_embedding_configs import SparseType
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import BoundsCheckMode
+
+ # fmt:skip
+ from .bench_config import TBEBenchmarkingHelperText  # usort:skip
+ from .tbe_data_config_loader import TBEDataConfigHelperText  # usort:skip
+
+
+ class TbeBenchClickInterface:
+     @classmethod
+     # pyre-ignore [2]
+     def common_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--alpha",
+                 default=1.0,
+                 help="The alpha value used for the benchmark, default is 1.0. Recommended value: alpha=1.15 for training and alpha=1.09 for inference",
+             ),
+             click.option(
+                 "--batch-size",
+                 default=512,
+                 help=TBEDataConfigHelperText.TBE_BATCH_SIZE.value + " Default is 512.",
+             ),
+             click.option(
+                 "--weights-precision",
+                 type=SparseType,
+                 default=SparseType.FP32,
+                 help="The precision type for weights, default is FP32.",
+             ),
+             click.option(
+                 "--stoc",
+                 is_flag=True,
+                 default=False,
+                 help="Flag to enable stochastic rounding, default is False.",
+             ),
+             click.option(
+                 "--iters",
+                 default=100,
+                 help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value
+                 + " Default is 100.",
+             ),
+             click.option(
+                 "--warmup-runs",
+                 default=0,
+                 help=(
+                     TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value
+                     + " Default is 0."
+                 ),
+             ),
+             click.option(  # Note: Original default for uvm benchmark is 0.1
+                 "--reuse",
+                 default=0.0,
+                 help="The inter-batch indices reuse rate for the benchmark, default is 0.0.",
+             ),
+             click.option(
+                 "--flush-gpu-cache-size-mb",
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def table_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bag-size",
+                 default=20,
+                 help=TBEDataConfigHelperText.TBE_POOLING_SIZE.value + " Default is 20.",
+             ),
+             click.option(
+                 "--embedding-dim",
+                 default=128,
+                 help=TBEDataConfigHelperText.TBE_EMBEDDING_DIM.value
+                 + " Default is 128.",
+             ),
+             click.option(
+                 "--mixed",
+                 is_flag=True,
+                 default=False,
+                 help=TBEDataConfigHelperText.TBE_MIXED_DIM.value + " Default is False.",
+             ),
+             click.option(
+                 "--num-embeddings",
+                 default=int(1e5),
+                 help=TBEDataConfigHelperText.TBE_NUM_EMBEDDINGS.value
+                 + " Default is 1e5.",
+             ),
+             click.option(
+                 "--num-tables",
+                 default=32,
+                 help=TBEDataConfigHelperText.TBE_NUM_TABLES.value + " Default is 32.",
+             ),
+             click.option(
+                 "--tables",
+                 type=str,
+                 default=None,
+                 help="Comma-separated list of table numbers. Default is None.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def device_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--cache-precision",
+                 type=SparseType,
+                 default=None,
+                 help="The precision type for cache, default is None.",
+             ),
+             click.option(
+                 "--managed",
+                 type=click.Choice(
+                     ["device", "managed", "managed_caching"], case_sensitive=False
+                 ),
+                 default="device",
+                 help="The managed option for embedding location. Choices are 'device', 'managed', or 'managed_caching'. Default is 'device'.",
+             ),
+             click.option(
+                 "--row-wise/--no-row-wise",
+                 default=True,
+                 help="Flag to enable or disable row-wise optimization, default is enabled. Use --no-row-wise to disable.",
+             ),
+             click.option(
+                 "--weighted",
+                 is_flag=True,
+                 default=False,
+                 help=TBEDataConfigHelperText.TBE_WEIGHTED.value + " Default is False.",
+             ),
+             click.option(
+                 "--pooling",
+                 type=click.Choice(["sum", "mean", "none"], case_sensitive=False),
+                 default="sum",
+                 help="The pooling method to use. Choices are 'sum', 'mean', or 'none'. Default is 'sum'.",
+             ),
+             click.option(
+                 "--bounds-check-mode",
+                 type=int,
+                 default=BoundsCheckMode.NONE.value,
+                 help="The bounds check mode, default is NONE. Options are: FATAL (0) - Raise an exception (CPU) or device-side assert (CUDA); WARNING (1) - Log the first out-of-bounds instance per kernel, and set to zero; IGNORE (2) - Set to zero; NONE (3) - No bounds checks; V2_IGNORE (4) - IGNORE with V2 enabled; V2_WARNING (5) - WARNING with V2 enabled; V2_FATAL (6) - FATAL with V2 enabled.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     # pyre-ignore [2]
+     def vbe_options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bag-size-list",
+                 type=str,
+                 default="20",
+                 help="A comma-separated list of bag sizes for each table, default is '20'.",
+             ),
+             click.option(
+                 "--bag-size-sigma-list",
+                 type=str,
+                 default="None",
+                 help="A comma-separated list of bag size standard deviations for generating bag sizes (one std per table). If set, the benchmark will treat --bag-size-list as a list of bag size means. Default is 'None'.",
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
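The `for option in reversed(options): func = option(func)` idiom applies each `click.option` as a decorator, in reverse so the options appear in declared order in `--help`; a command can therefore pull in a whole bundle with one decorator line. A minimal usage sketch (editor's illustration, not part of the diff; the `bench` command name and body are hypothetical):

import click

from fbgemm_gpu.tbe.bench.benchmark_click_interface import TbeBenchClickInterface


@click.command()
@TbeBenchClickInterface.common_options
@TbeBenchClickInterface.table_options
def bench(alpha: float, batch_size: int, bag_size: int, **kwargs) -> None:
    # Every option in the two bundles arrives as a keyword argument;
    # anything not named explicitly lands in **kwargs.
    click.echo(f"alpha={alpha} batch_size={batch_size} bag_size={bag_size}")


if __name__ == "__main__":
    bench()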
fbgemm_gpu/tbe/bench/eeg_cli.py (new file)
@@ -0,0 +1,138 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+
+ import click
+ import torch
+
+ # fmt:skip
+ from fbgemm_gpu.tbe.bench import IndicesParams
+
+
+ @click.group()
+ def cli() -> None:
+     pass
+
+
+ @cli.command()
+ @click.option("--indices", required=True, help="Indices tensor file (*.pt)")
+ def estimate(indices: str) -> None:
+     """
+     Estimate the distribution of indices given a tensor file
+
+     Parameters:
+         indices (str): Indices tensor file (*.pt)
+
+     Returns:
+         None
+
+     Example:
+         estimate --indices="indices.pt"
+     """
+
+     indices = torch.load(indices)
+     heavy_hitters, q, s, max_index, num_indices = (
+         torch.ops.fbgemm.tbe_estimate_indices_distribution(indices)
+     )
+
+     params = IndicesParams(
+         heavy_hitters=heavy_hitters, zipf_q=q, zipf_s=s, index_dtype=indices.dtype
+     )
+
+     print(params.json(format=True), f"max_index={max_index}\nnum_indices={num_indices}")
+
+
+ @cli.command()
+ @click.option(
+     "--hitters",
+     type=str,
+     default="",
+     help="TBE heavy hitter indices (comma-delimited list of floats)",
+ )
+ @click.option(
+     "--zipf",
+     type=(float, float),
+     default=(0.1, 0.1),
+     help="Zipf distribution parameters for indices generation (q, s)",
+ )
+ @click.option(
+     "-e",
+     "--max-index",
+     type=int,
+     default=20,
+     help="Max index value (< E)",
+ )
+ @click.option(
+     "-n",
+     "--num-indices",
+     type=int,
+     default=20,
+     help="Target number of indices to generate",
+ )
+ @click.option(
+     "--output",
+     type=str,
+     required=True,
+     help="Tensor filepath (*.pt) to save the generated indices",
+ )
+ def generate(
+     hitters: str,
+     zipf: tuple[float, float],
+     max_index: int,
+     num_indices: int,
+     output: str,
+ ) -> None:
+     """
+     Generates a tensor of indices given the indices distribution parameters
+
+     Parameters:
+         hitters (str): heavy hitter indices (comma-delimited list of floats)
+
+         zipf (Tuple[float, float]): Zipf distribution parameters for indices generation (q, s)
+
+         max_index (int): Max index value (E)
+
+         num_indices (int): Target number of indices to generate
+
+         output (str): Tensor filepath (*.pt) to save the generated indices
+
+     Returns:
+         None
+
+     Example:
+         generate --hitters="2,4,6" --zipf 1.1 1.1 --max-index=10 --num-indices=100 --output="generated_indices.pt"
+     """
+     assert max_index > 0, "Max index value (E) must be greater than 0"
+     assert num_indices > 0, "Target number of indices must be greater than 0"
+     assert zipf[0] > 0, "Zipf parameter q must be greater than 0.0"
+     assert zipf[1] > 0, "Zipf parameter s must be greater than 0.0"
+     assert output != "", "Output file path must be provided"
+
+     try:
+         _hitters: list[float] = (
+             [float(x) for x in hitters.split(",")] if hitters else []
+         )
+     except Exception as e:
+         raise AssertionError(
+             f'Error: {e}. Please use a comma-delimited list of floats, e.g., --hitters="2,4,6".'
+         )
+
+     heavy_hitters = torch.tensor(_hitters)
+     assert heavy_hitters.numel() <= 20, "The number of heavy hitters should be <= 20"
+
+     indices = torch.ops.fbgemm.tbe_generate_indices_from_distribution(
+         heavy_hitters, zipf[0], zipf[1], max_index, num_indices
+     )
+
+     print(f"Generated indices: {indices}")
+     torch.save(indices, output)
+     print(f"Saved indices to: {output}")
+
+
+ if __name__ == "__main__":
+     cli()
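Since `cli` is a plain `click.Group`, the new commands can be exercised in-process with click's standard `CliRunner` helper. A sketch (editor's illustration, assuming an fbgemm_gpu build that registers the `tbe_generate_indices_from_distribution` op; the output path is illustrative):

from click.testing import CliRunner

from fbgemm_gpu.tbe.bench.eeg_cli import cli

runner = CliRunner()
# --zipf is declared as a (float, float) tuple option, so it takes two
# space-separated values rather than one comma-joined string.
result = runner.invoke(
    cli,
    [
        "generate",
        "--hitters", "2,4,6",
        "--zipf", "1.1", "1.1",
        "--max-index", "10",
        "--num-indices", "100",
        "--output", "generated_indices.pt",
    ],
)
assert result.exit_code == 0, result.output
print(result.output)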
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py
@@ -8,11 +8,12 @@
  # pyre-strict
 
  import dataclasses
- from typing import Any, Dict, Optional
+ from typing import Any, Optional
 
  import click
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.split_embedding_configs import SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
      BoundsCheckMode,
@@ -44,7 +45,7 @@ class EmbeddingOpsCommonConfig:
      def validate(self):
          return self
 
-     def split_args(self) -> Dict[str, Any]:
+     def split_args(self) -> dict[str, Any]:
          return {
              "weights_precision": self.weights_dtype,
              "stochastic_rounding": self.stochastic_rounding,
@@ -99,9 +100,7 @@ class EmbeddingOpsCommonConfigLoader:
          click.option(
              "--emb-location",
              default="device",
-             type=click.Choice(
-                 ["device", "managed", "managed_caching"], case_sensitive=False
-             ),
+             type=click.Choice(EmbeddingLocation.str_values(), case_sensitive=False),
              help="Memory location of the embeddings",
          ),
          click.option(
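The last hunk replaces a hard-coded choice list with `EmbeddingLocation.str_values()`, so the CLI stays in sync with the enum. The real enum lives in `fbgemm_gpu.split_table_batched_embeddings_ops_common` and is not shown in this diff; a minimal sketch of what such a helper might look like (member set illustrative):

import enum


class EmbeddingLocation(enum.IntEnum):
    # Illustrative members; the real enum may define more (e.g. MTIA).
    DEVICE = 0
    MANAGED = 1
    MANAGED_CACHING = 2
    HOST = 3

    @classmethod
    def str_values(cls) -> list[str]:
        # Lower-cased member names, e.g. ["device", "managed", ...],
        # directly usable as click.Choice(...) values.
        return [member.name.lower() for member in cls]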
fbgemm_gpu/tbe/bench/eval_compression.py
@@ -10,7 +10,7 @@
  import logging
  import statistics
  from dataclasses import dataclass
- from typing import Callable, List, Tuple
+ from typing import Callable
 
  import torch
@@ -29,8 +29,8 @@ class EvalCompressionBenchmarkOutput:
 
 
  def benchmark_eval_compression(
-     baseline_requests: List[Tuple[torch.Tensor, torch.Tensor]],
-     compressed_requests: List[Tuple[torch.Tensor, torch.Tensor]],
+     baseline_requests: list[tuple[torch.Tensor, torch.Tensor]],
+     compressed_requests: list[tuple[torch.Tensor, torch.Tensor]],
      baseline_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
      compressed_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
      reindex: torch.Tensor,
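These hunks swap the `typing.List`/`typing.Tuple`/`typing.Dict` aliases for the built-in generics of PEP 585, which are subscriptable at runtime on Python 3.9+ (safe here, since the wheel targets cp311). The two spellings are equivalent for type checkers, as this small illustration shows:

from typing import List, Tuple

import torch

# PEP 585: built-in generics are interchangeable with the typing aliases,
# so the annotation change in this diff is purely cosmetic.
old_style: List[Tuple[torch.Tensor, torch.Tensor]] = []
new_style: list[tuple[torch.Tensor, torch.Tensor]] = []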