fbgemm-gpu-genai-nightly 2025.12.19 (cp310-cp310-manylinux_2_28_x86_64.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of fbgemm-gpu-genai-nightly has been flagged as a potentially problematic release.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
@@ -0,0 +1,146 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+
+ from typing import Optional
+
+ import torch
+ from torch import Tensor
+
+ try:
+     try:
+         from torch.compiler import is_compiling
+
+         def is_torchdynamo_compiling() -> bool:  # type: ignore[misc]
+             # at least one test fails if we import is_compiling as a different name
+             return is_compiling()
+
+     except Exception:
+         # torch.compiler.is_compiling is not available in torch 1.10
+         from torch._dynamo import is_compiling as is_torchdynamo_compiling
+ except Exception:
+
+     def is_torchdynamo_compiling() -> bool:  # type: ignore[misc]
+         return False
+
+
+ # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
+ import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
+
+
+ def generate_vbe_metadata(
+     offsets: Tensor,
+     batch_size_per_feature_per_rank: Optional[list[list[int]]],
+     pooling_mode: PoolingMode,
+     feature_dims_cpu: Tensor,
+     device: torch.device,
+ ) -> invokers.lookup_args.VBEMetadata:
+     """
+     Generate VBE metadata based on batch_size_per_feature_per_rank.
+     Metadata includes:
+     1) B_offsets - A tensor that contains batch size offsets for each
+        feature
+     2) output_offsets_feature_rank - A tensor that contains output
+        offsets for each feature
+     3) B_offsets_rank_per_feature - A tensor that contains batch size
+        offsets for each feature and rank
+     4) max_B - The maximum batch size for all features
+     5) max_B_feature_rank - The maximum batch size for all ranks and
+        features
+     6) output_size - The output size (number of elements)
+     """
+     if batch_size_per_feature_per_rank is not None:
+         assert (
+             pooling_mode != PoolingMode.NONE
+         ), "Variable batch size TBE support is not enabled for PoolingMode.NONE"
+         # TODO: Add input check
+         zero_tensor = torch.zeros(1, device="cpu", dtype=torch.int32)
+
+         # Create B offsets
+         total_batch_size_per_feature = torch.tensor(
+             batch_size_per_feature_per_rank, dtype=torch.int32, device="cpu"
+         ).sum(dim=1)
+
+         max_B = total_batch_size_per_feature.max().item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(max_B)
+             torch._check(max_B < offsets.numel())
+
+         Bs = torch.concat([zero_tensor, total_batch_size_per_feature])
+         B_offsets = Bs.cumsum(dim=0).to(torch.int)
+
+         # Create output offsets
+         B_feature_rank = torch.tensor(
+             batch_size_per_feature_per_rank,
+             device="cpu",
+             dtype=torch.int64,
+         )
+         max_B_feature_rank = B_feature_rank.max().item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(max_B_feature_rank)
+             torch._check(max_B_feature_rank <= offsets.size(0))
+         output_sizes_feature_rank = B_feature_rank.transpose(
+             0, 1
+         ) * feature_dims_cpu.view(1, -1)
+         output_offsets_feature_rank = torch.concat(
+             [
+                 zero_tensor.to(torch.int64),
+                 output_sizes_feature_rank.flatten().cumsum(dim=0),
+             ]
+         )
+         output_size = output_offsets_feature_rank[-1].item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(output_size)
+
+         # TODO: Support INT8 output
+         # B_offsets_rank_per_feature is for rank and (b, t) mapping
+         B_offsets_rank_per_feature = (
+             torch.tensor(
+                 [
+                     [0] + batch_size_per_feature
+                     for batch_size_per_feature in batch_size_per_feature_per_rank
+                 ],
+                 device="cpu",
+                 dtype=torch.int32,
+             )
+             .cumsum(dim=1)
+             .to(torch.int)
+         )
+
+         B_offsets = B_offsets.to(device, non_blocking=True)
+         output_offsets_feature_rank = output_offsets_feature_rank.to(
+             device, non_blocking=True
+         )
+         B_offsets_rank_per_feature = B_offsets_rank_per_feature.to(
+             device, non_blocking=True
+         )
+
+         # TODO: Use int32 for B_offsets and int64 for output_offsets_feature_rank
+         vbe_metadata = invokers.lookup_args.VBEMetadata(
+             B_offsets=B_offsets,
+             output_offsets_feature_rank=output_offsets_feature_rank,
+             B_offsets_rank_per_feature=B_offsets_rank_per_feature,
+             # pyre-ignore
+             max_B=max_B,
+             # pyre-ignore
+             max_B_feature_rank=max_B_feature_rank,
+             # pyre-ignore
+             output_size=output_size,
+         )
+     else:
+         vbe_metadata = invokers.lookup_args.VBEMetadata(
+             B_offsets=None,
+             output_offsets_feature_rank=None,
+             B_offsets_rank_per_feature=None,
+             max_B=-1,
+             max_B_feature_rank=-1,
+             output_size=-1,
+         )
+     return vbe_metadata
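For orientation, a minimal usage sketch of generate_vbe_metadata (not part of the wheel; the input values are hypothetical, and the import path assumes this hunk is fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py, the only +146 file in the list above):

    import torch
    from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
    from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
        generate_vbe_metadata,
    )

    # Two features, two ranks: feature 0 has per-rank batch sizes [2, 3],
    # feature 1 has [1, 4], so each feature sees 5 rows in total.
    batch_size_per_feature_per_rank = [[2, 3], [1, 4]]
    offsets = torch.zeros(11, dtype=torch.int64)  # sum of batch sizes + 1
    feature_dims_cpu = torch.tensor([4, 8], dtype=torch.int64)  # embedding dims

    meta = generate_vbe_metadata(
        offsets=offsets,
        batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
        pooling_mode=PoolingMode.SUM,
        feature_dims_cpu=feature_dims_cpu,
        device=torch.device("cpu"),
    )
    print(meta.B_offsets)    # tensor([ 0,  5, 10], dtype=torch.int32)
    print(meta.max_B)        # 5
    print(meta.output_size)  # 60: cumsum of per-(rank, feature) output sizes

The output offsets follow from the function body: B_feature_rank is transposed to (rank, feature), multiplied by the feature dims ([[8, 8], [12, 32]]), flattened, and cumulatively summed, giving a total output size of 60 elements.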
@@ -0,0 +1,26 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ # pyre-ignore-all-errors[56]
+
+ import warnings
+
+ from fbgemm_gpu.tbe.ssd import (  # noqa: F401
+     ASSOC,  # noqa: F401
+     SSDIntNBitTableBatchedEmbeddingBags,  # noqa: F401
+     SSDTableBatchedEmbeddingBags,  # noqa: F401
+ )
+
+
+ warnings.warn(  # noqa: B028
+     f"""\033[93m
+     The Python module {__name__} is now DEPRECATED and will be removed in the
+     future. Users should import fbgemm_gpu.tbe.ssd into their scripts instead.
+     \033[0m""",
+     DeprecationWarning,
+ )
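The shim only re-exports and warns; migrating a caller is a one-line change (a sketch using the names re-exported above):

    # Deprecated path (still works, but emits the DeprecationWarning above):
    #   from fbgemm_gpu.ssd_split_table_batched_embeddings_ops import SSDTableBatchedEmbeddingBags
    # Replacement path, as instructed by the warning text:
    from fbgemm_gpu.tbe.ssd import ASSOC, SSDTableBatchedEmbeddingBags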
@@ -0,0 +1,6 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
@@ -0,0 +1,55 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+
+ import torch
+
+ from .bench_config import (  # noqa F401
+     TBEBenchmarkingConfig,
+     TBEBenchmarkingConfigLoader,
+     TBEBenchmarkingHelperText,
+ )
+ from .bench_runs import (  # noqa F401
+     bench_warmup,
+     benchmark_cpu_requests,
+     benchmark_cpu_requests_mp,
+     benchmark_pipelined_requests,
+     benchmark_requests,
+     benchmark_requests_refer,
+     benchmark_requests_with_spec,
+     benchmark_vbe,
+ )
+ from .benchmark_click_interface import TbeBenchClickInterface  # noqa F401
+ from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader  # noqa F401
+ from .eval_compression import (  # noqa F401
+     benchmark_eval_compression,
+     EvalCompressionBenchmarkOutput,
+ )
+ from .reporter import BenchmarkReporter  # noqa F401
+ from .tbe_data_config import TBEDataConfig  # noqa F401
+ from .tbe_data_config_loader import (  # noqa F401
+     TBEDataConfigHelperText,
+     TBEDataConfigLoader,
+ )
+ from .tbe_data_config_param_models import (  # noqa F401
+     BatchParams,
+     IndicesParams,
+     PoolingParams,
+ )
+ from .utils import fill_random_scale_bias  # noqa F401
+
+ try:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_estimator"
+     )
+ except Exception:
+     pass
+
+ #: The max number of heavy hitters, as defined in
+ #: fbgemm_gpu/src/tbe/eeg/indices_estimator.h
+ EEG_MAX_HEAVY_HITTERS: int = 20
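This __init__ acts as a facade: callers import the bench helpers from fbgemm_gpu.tbe.bench rather than from the individual submodules. A short sketch (assumes the wheel is installed; the field values are arbitrary, with names taken from bench_config.py below):

    from fbgemm_gpu.tbe.bench import (
        EEG_MAX_HEAVY_HITTERS,  # constant defined above
        TBEBenchmarkingConfig,  # re-exported from .bench_config
    )

    config = TBEBenchmarkingConfig(
        iterations=10,
        num_requests=10,
        warmup_iterations=2,
        flush_gpu_cache_size_mb=0,
        export_trace=False,
        trace_url=None,
        upload_perf_data=False,
    ).validate()
    print(config.json(format=True))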
@@ -0,0 +1,156 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import dataclasses
+ import json
+ from enum import Enum
+ from typing import Any, Optional
+
+ import click
+
+
+ @dataclasses.dataclass(frozen=True)
+ class TBEBenchmarkingConfig:
+     # Number of iterations
+     iterations: int
+     # Number of input TBE batches to generate for testing
+     num_requests: int
+     # Number of warmup iterations to run before making measurements
+     warmup_iterations: int
+     # Amount of memory to use for flushing the GPU cache after each iteration
+     flush_gpu_cache_size_mb: int
+     # If set, trace will be exported to the path specified in trace_url
+     export_trace: bool
+     # The path for exporting the trace
+     trace_url: Optional[str]
+     # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
+     upload_perf_data: bool
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_dict(cls, data: dict[str, Any]):
+         return cls(**data)
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_json(cls, data: str):
+         return cls.from_dict(json.loads(data))
+
+     def dict(self) -> dict[str, Any]:
+         return dataclasses.asdict(self)
+
+     def json(self, format: bool = False) -> str:
+         return json.dumps(self.dict(), indent=(2 if format else -1), sort_keys=True)
+
+     # pyre-ignore [3]
+     def validate(self):
+         assert self.iterations > 0, "iterations must be positive"
+         assert self.num_requests > 0, "num_requests must be positive"
+         assert self.warmup_iterations >= 0, "warmup_iterations must be non-negative"
+         assert (
+             self.flush_gpu_cache_size_mb >= 0
+         ), "flush_gpu_cache_size_mb must be non-negative"
+         return self
+
+
+ class TBEBenchmarkingHelperText(Enum):
+     BENCH_ITERATIONS = "Number of benchmark iterations to run"
+     BENCH_NUM_REQUESTS = "Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used"
+     BENCH_WARMUP_ITERATIONS = (
+         "Number of warmup iterations to run before making measurements"
+     )
+     BENCH_FLUSH_GPU_CACHE_SIZE = (
+         "Amount of memory to use for flushing the GPU cache after each iteration (MB)"
+     )
+     BENCH_EXPORT_TRACE = (
+         "If set, trace will be exported to the path specified in trace url"
+     )
+     BENCH_TRACE_URL = "The path for exporting the trace"
+     BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
+
+
+ class TBEBenchmarkingConfigLoader:
+     @classmethod
+     # pyre-ignore [2]
+     def options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bench-iterations",
+                 type=int,
+                 default=100,
+                 help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value,
+             ),
+             click.option(
+                 "--bench-num-requests",
+                 type=int,
+                 default=-1,
+                 help=TBEBenchmarkingHelperText.BENCH_NUM_REQUESTS.value,
+             ),
+             click.option(
+                 "--bench-warmup-iterations",
+                 type=int,
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value,
+             ),
+             click.option(
+                 "--bench-flush-gpu-cache-size",
+                 type=int,
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
+             ),
+             click.option(
+                 "--bench-export-trace",
+                 is_flag=True,
+                 default=False,
+                 help=TBEBenchmarkingHelperText.BENCH_EXPORT_TRACE.value,
+             ),
+             click.option(
+                 "--bench-trace-url",
+                 type=str,
+                 required=False,
+                 default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
+                 help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
+             ),
+             click.option(
+                 "--upload-perf-data",
+                 is_flag=True,
+                 default=False,
+                 help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     def load(cls, context: click.Context) -> TBEBenchmarkingConfig:
+         params = context.params
+
+         iterations = params["bench_iterations"]
+         num_requests = params["bench_num_requests"]
+         warmup_iterations = params["bench_warmup_iterations"]
+         flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
+         export_trace = params["bench_export_trace"]
+         trace_url = params["bench_trace_url"]
+         upload_perf_data = params["upload_perf_data"]
+
+         # Default the number of TBE requests to the number of iterations specified
+         num_requests = iterations if num_requests == -1 else num_requests
+
+         return TBEBenchmarkingConfig(
+             iterations,
+             num_requests,
+             warmup_iterations,
+             flush_gpu_cache_size,
+             export_trace,
+             trace_url,
+             upload_perf_data,
+         ).validate()
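To show how the loader is meant to be wired up, here is a hypothetical click script (not part of the wheel) that applies options() as a decorator and then rebuilds the validated config from the parsed context:

    import click

    from fbgemm_gpu.tbe.bench import TBEBenchmarkingConfigLoader


    @click.command()
    @TBEBenchmarkingConfigLoader.options
    @click.pass_context
    def bench(ctx: click.Context, **kwargs) -> None:
        # load() pulls the parsed --bench-* values off ctx.params and validates them
        config = TBEBenchmarkingConfigLoader.load(ctx)
        click.echo(config.json(format=True))


    if __name__ == "__main__":
        bench()  # e.g. python bench.py --bench-iterations 10 --bench-export-trace

Note that options() applies the option decorators in reverse so that they appear in declaration order in --help, and load() defaults num_requests to the iteration count when --bench-num-requests is left at -1.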