fbgemm-gpu-genai-nightly 2025.8.2-cp313-cp313-manylinux_2_28_x86_64.whl → 2025.11.4-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. fbgemm_gpu/__init__.py +106 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +2 -3
  4. fbgemm_gpu/config/feature_list.py +4 -1
  5. fbgemm_gpu/docs/sparse_ops.py +118 -0
  6. fbgemm_gpu/docs/target.genai.json.py +6 -0
  7. fbgemm_gpu/enums.py +3 -4
  8. fbgemm_gpu/experimental/example/__init__.py +0 -4
  9. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  10. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +0 -4
  11. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +277 -218
  12. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +509 -433
  13. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +157 -102
  14. fbgemm_gpu/experimental/gen_ai/__init__.py +12 -7
  15. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +32 -0
  16. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +261 -0
  17. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +344 -0
  18. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +0 -4
  19. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +1 -2
  20. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +15 -16
  21. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +250 -190
  22. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +721 -129
  23. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  24. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -4
  25. fbgemm_gpu/experimental/gen_ai/moe/activation.py +2 -2
  26. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +24 -17
  27. fbgemm_gpu/experimental/gen_ai/moe/layers.py +6 -9
  28. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +82 -67
  29. fbgemm_gpu/experimental/gen_ai/quantize.py +6 -7
  30. fbgemm_gpu/fbgemm.so +0 -0
  31. fbgemm_gpu/permute_pooled_embedding_modules.py +4 -4
  32. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  33. fbgemm_gpu/quantize_comm.py +13 -6
  34. fbgemm_gpu/quantize_utils.py +29 -3
  35. fbgemm_gpu/runtime_monitor.py +9 -5
  36. fbgemm_gpu/sll/cpu/cpu_sll.py +6 -6
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +1 -2
  38. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +3 -4
  39. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  40. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  41. fbgemm_gpu/sparse_ops.py +93 -53
  42. fbgemm_gpu/split_embedding_configs.py +98 -48
  43. fbgemm_gpu/split_embedding_inference_converter.py +4 -4
  44. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +101 -23
  45. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  46. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +528 -71
  47. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +2 -2
  48. fbgemm_gpu/tbe/bench/__init__.py +1 -0
  49. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  50. fbgemm_gpu/tbe/bench/bench_runs.py +155 -14
  51. fbgemm_gpu/tbe/bench/eeg_cli.py +2 -3
  52. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +2 -2
  53. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  54. fbgemm_gpu/tbe/bench/tbe_data_config.py +30 -185
  55. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  56. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +55 -3
  57. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +13 -8
  58. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  59. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  60. fbgemm_gpu/tbe/ssd/inference.py +13 -13
  61. fbgemm_gpu/tbe/ssd/training.py +812 -174
  62. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -2
  63. fbgemm_gpu/tbe/stats/bench_params_reporter.py +187 -44
  64. fbgemm_gpu/tbe/utils/offsets.py +3 -3
  65. fbgemm_gpu/tbe/utils/quantize.py +2 -2
  66. fbgemm_gpu/tbe/utils/requests.py +14 -14
  67. fbgemm_gpu/tbe_input_multiplexer.py +10 -10
  68. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  69. fbgemm_gpu/utils/filestore.py +6 -2
  70. fbgemm_gpu/utils/torch_library.py +2 -2
  71. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/METADATA +1 -1
  72. fbgemm_gpu_genai_nightly-2025.11.4.dist-info/RECORD +127 -0
  73. list_versions/cli_run.py +5 -6
  74. fbgemm_gpu/docs/version.py +0 -11
  75. fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py +0 -168
  76. fbgemm_gpu_genai_nightly-2025.8.2.dist-info/RECORD +0 -124
  77. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/WHEEL +0 -0
  78. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py CHANGED
@@ -5,17 +5,100 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 import logging
 import os
+import re
 
 import torch
 
+# Based on the FBGEMM-PyTorch compatibility table at
+# https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+_fbgemm_torch_compat_table = {
+    "1.3": "2.8",
+    "1.2": "2.7",
+    "1.1": "2.6",
+    "1.0": "2.5",
+    "0.8": "2.4",
+    "0.7": "2.3",
+    "0.6": "2.2",
+    "0.5": "2.1",
+    "0.4": "2.0",
+}
+
+
+def _load_target_info(target: str) -> dict[str, str]:
+    try:
+        filepath = os.path.join(
+            os.path.dirname(__file__), "docs", f"target.{target}.json.py"
+        )
+        with open(filepath, "r") as file:
+            data = json.load(file)
+    except Exception:
+        data = {}
+
+    return data
+
 
-def _load_library(filename: str, no_throw: bool = False) -> None:
+def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
     """Load a shared library from the given filename."""
+
+    # Check if the version of PyTorch is compatible with the version of FBGEMM
+    # that we are trying to load, and print a loud warning if not. This is
+    # useful for the OSS build, where we have a single FBGEMM library that is
+    # compatible with multiple versions of PyTorch.
+    #
+    # Based on: https://github.com/pytorch/ao/blob/main/torchao/__init__.py#L30
+
+    keys = [
+        key
+        for key in _fbgemm_torch_compat_table.keys()
+        if version.startswith(f"{key}.")
+    ]
+
+    if version == "INTERNAL" or "+git" in version:
+        # if FBGEMM version has "+git", assume it's locally built and we don't
+        # know anything about the PyTorch version used to build it
+        logging.info(
+            "FBGEMM version is INTERNAL or local, ignoring version compatibility check with PyTorch"
+        )
+
+    elif re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", version):
+        # if FBGEMM version is a date, assume it's a nightly build and that we
+        # know what we're doing
+        logging.info(
+            "FBGEMM version is a nightly version, ignoring version compatibility check with PyTorch"
+        )
+
+    elif not keys:
+        logging.warning(
+            f"""
+            \033[33m
+            _fbgemm_torch_compat_table has no entry for {version} of FBGEMM;
+            cannot determine compatibility with PyTorch {torch.__version__}
+            \033[0m
+            """
+        )
+
+    elif str(torch.__version__) != _fbgemm_torch_compat_table[keys[0]]:
+        logging.warning(
+            f"""
+            \033[31m
+            FBGEMM_GPU version is {version}, which is not guaranteed to be
+            compatible with PyTorch {torch.__version__}; library loading might
+            crash!
+
+            Please refer to
+            https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+            for the FBGEMM-PyTorch compatibility table.
+            \033[0m
+            """
+        )
+
     try:
         torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
         logging.info(f"Successfully loaded: '{filename}'")
+
     except Exception as error:
         logging.error(f"Could not load the library '{filename}'!\n\n\n{error}\n\n\n")
         if not no_throw:
@@ -29,13 +112,16 @@ open_source: bool = True
 # Trigger the manual addition of docstrings to pybind11-generated operators
 import fbgemm_gpu.docs  # noqa: F401, E402
 
+
+__targets_infos__ = {
+    target: _load_target_info(target) for target in ["default", "genai", "hstu"]
+}
+__targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
+
 try:
-    # Export the version string from the version file auto-generated by setup.py
-    from fbgemm_gpu.docs.version import (  # noqa: F401, E402
-        __target__,
-        __variant__,
-        __version__,
-    )
+    __target__, __info__ = next(iter(__targets_infos__.items()))
+    __variant__ = __info__["variant"]
+    __version__ = __info__["version"]
 except Exception:
     __variant__: str = "INTERNAL"
     __version__: str = "INTERNAL"
@@ -76,18 +162,19 @@ libraries_to_load = {
     "genai": fbgemm_genai_libraries,
 }
 
-for library in libraries_to_load.get(__target__, []):
-    # NOTE: In all cases, we want to throw an error if we cannot load the
-    # library. However, this appears to break the OSS documentation build,
-    # where the Python documentation doesn't show up in the generated docs.
-    #
-    # To work around this problem, we introduce a fake build variant called
-    # `docs` and we only throw a library load error when the variant is not
-    # `docs`. For more information, see:
-    #
-    #   https://github.com/pytorch/FBGEMM/pull/3477
-    #   https://github.com/pytorch/FBGEMM/pull/3717
-    _load_library(f"{library}.so", __variant__ == "docs")
+for target, info in __targets_infos__.items():
+    for library in libraries_to_load.get(target, []):
+        # NOTE: In all cases, we want to throw an error if we cannot load the
+        # library. However, this appears to break the OSS documentation build,
+        # where the Python documentation doesn't show up in the generated docs.
+        #
+        # To work around this problem, we introduce a fake build variant called
+        # `docs` and we only throw a library load error when the variant is not
+        # `docs`. For more information, see:
+        #
+        #   https://github.com/pytorch/FBGEMM/pull/3477
+        #   https://github.com/pytorch/FBGEMM/pull/3717
+        _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
 
 try:
     # Trigger meta operator registrations
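
For illustration, here is a minimal standalone sketch of how this new version gate classifies FBGEMM version strings. The `classify` helper and the abbreviated table are hypothetical; in the package, the checks run inline in `_load_library` above.

    import re

    # Abbreviated copy of _fbgemm_torch_compat_table for the sketch.
    compat_table = {"1.3": "2.8", "1.2": "2.7"}

    def classify(version: str) -> str:
        # Hypothetical helper mirroring the branch order in _load_library.
        if version == "INTERNAL" or "+git" in version:
            return "internal/local build: check skipped"
        if re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", version):
            return "date-based nightly: check skipped"
        keys = [k for k in compat_table if version.startswith(f"{k}.")]
        if not keys:
            return "unknown release: compatibility undetermined"
        return f"release: expects PyTorch {compat_table[keys[0]]}"

    assert classify("2025.11.4") == "date-based nightly: check skipped"
    assert classify("1.3.0") == "release: expects PyTorch 2.8"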
fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/batched_unary_embeddings_ops.py CHANGED
@@ -9,7 +9,6 @@
 
 
 from math import sqrt
-from typing import List
 
 import torch
 
@@ -22,7 +21,7 @@ except Exception:
     load_torch_module("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
 
 
-def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
+def wrap_weight_to_parameter(weights: list[torch.Tensor]) -> list[torch.Tensor]:
     for i, v in enumerate(weights):
         if not isinstance(v, torch.nn.Parameter):
             weights[i] = torch.nn.Parameter(v)
@@ -31,7 +30,7 @@ def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
 
 class BatchedUnaryEmbeddingBag(torch.nn.Module):
     # pyre-fixme[3]: Return type must be annotated.
-    def __init__(self, num_tasks: int, hash_sizes: List[int], long_index: bool = False):
+    def __init__(self, num_tasks: int, hash_sizes: list[int], long_index: bool = False):
        super().__init__()
        self.num_tasks = num_tasks
        self.hash_sizes = hash_sizes
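
The annotation changes in this file (and in enums.py further down) replace `typing.List`/`typing.Tuple` with the builtin generics standardized by PEP 585, available since Python 3.9. A quick sketch of the runtime equivalence:

    import typing
    from typing import List

    def old_style(xs: List[int]) -> int:
        return sum(xs)

    def new_style(xs: list[int]) -> int:
        return sum(xs)

    # Both spellings resolve to the same origin type, so behavior is unchanged.
    assert typing.get_origin(List[int]) is list
    assert typing.get_origin(list[int]) is list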
fbgemm_gpu/config/feature_list.py CHANGED
@@ -11,7 +11,7 @@ from enum import auto, Enum
 import torch
 
 try:
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp_torch_op")
 except Exception:
     import fbgemm_gpu  # noqa F401
 
@@ -60,6 +60,9 @@ class FeatureGateName(Enum):
     # Enable bounds_check_indices_v2
     BOUNDS_CHECK_INDICES_V2 = auto()
 
+    # Enable TBE input parameters extraction
+    TBE_REPORT_INPUT_PARAMS = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)
 
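
A minimal usage sketch for the new gate, following the `is_enabled` accessor shown above; it assumes `FeatureGateName` is importable from `fbgemm_gpu.config`, as with the existing gates:

    from fbgemm_gpu.config import FeatureGateName

    # True only when the TBE input-parameter reporting gate is enabled.
    if FeatureGateName.TBE_REPORT_INPUT_PARAMS.is_enabled():
        ...  # extract and report TBE input parameters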
fbgemm_gpu/docs/sparse_ops.py CHANGED
@@ -496,3 +496,121 @@ Return:
         None)
     """,
 )
+
+add_docs(
+    torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights,
+    """
+block_bucketize_sparse_features_2d_weights(lengths, indices, bucketize_pos, sequence, block_sizes, my_size, weights, weights_dim=1, batch_size_per_feature=None, max_B=-1, block_bucketize_pos=None, keep_orig_idx=False, total_num_blocks=None, keep_orig_idx_per_feature=None) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]
+
+Preprocess sparse features by partitioning them into multiple buckets, with
+support for 2D weights. Every feature is split into the same number of
+buckets, but the bucket sizes (widths) can differ across features, and the
+bucket sizes within a single feature can also be non-uniform.
+
+This function is similar to block_bucketize_sparse_features but supports 2D
+weights, where each index can have multiple weight values associated with it.
+
+Args:
+    lengths (Tensor): The lengths of the sparse features. The tensor contains
+        the lengths of each sample in a batch and each feature. Shape is `B *
+        T` where `B` is the batch size and `T` is the number of features
+
+    indices (Tensor): The sparse data. Only integer types are supported. Shape
+        is the sum of `lengths`
+
+    bucketize_pos (bool): If True, return the original relative indices within
+        a sample. For example, `indices = [9, 8, 2, 1, 0, 8, 9]` and `lengths =
+        [3, 4]`. The original relative indices within a sample for the indices
+        are `[0, 1, 2, 0, 1, 2, 3]`
+
+    sequence (bool): If True, return the new index positions in the original
+        index positions (the tensor is called `unbucketize_permute_data`)
+
+    block_sizes (Tensor): This tensor is used for the case where the bucket
+        size within a feature is uniform (i.e., when
+        `block_bucketize_pos=None`). The tensor contains bucket sizes (i.e.,
+        bucket widths) for each feature. `block_sizes[t]` represents the
+        bucket size of feature `t`. Shape is the number of features
+
+    my_size (int): The number of buckets for each feature. Note that every
+        feature has the same number of buckets
+
+    weights (Tensor): A float tensor that will be bucketized the same way as
+        `indices`. This tensor must have shape `[indices.size(0), weights_dim]`
+        where `weights_dim` is the dimension of the weight values for each
+        index
+
+    weights_dim (int = 1): The dimension of the weight values for each index.
+        This parameter is only used when `weights` is not None
+
+    batch_size_per_feature (Optional[Tensor] = None): An optional tensor that
+        contains batch sizes for different features. If not None, batch sizes
+        are not uniform among features. Otherwise, the operator will assume
+        that the batch size is uniform and infer it from the `lengths` and
+        `block_sizes` tensors
+
+    max_B (int = -1): The max batch size. Must be set if
+        `batch_size_per_feature` is not None
+
+    block_bucketize_pos (Optional[List[Tensor]] = None): The input is used for
+        non-uniform bucket sizes within a feature. `block_bucketize_pos` is a
+        list of tensors. Each tensor contains the range offsets of buckets for
+        each feature. These range offsets are equivalent to the complete
+        cumulative sum of the bucket sizes. For example, `[0, 4, 20]`
+        represents two buckets. The first bucket size is `(4 - 0) = 4`, and
+        the second bucket size is `(20 - 4) = 16`. The length of
+        `block_bucketize_pos` must be equal to the number of features
+
+    keep_orig_idx (bool = False): If True, return original indices instead of
+        the relative indices within each bucket
+
+    total_num_blocks (Optional[Tensor] = None): An optional tensor that
+        contains the number of logical buckets (aka blocks) within a given
+        feature. This is useful for applications where the number of buckets
+        is larger than the number of physical GPUs, which is common in cases
+        where we scale the number of GPUs up or down but want to maintain the
+        same numerical behavior
+
+    keep_orig_idx_per_feature (Optional[Tensor] = None): An optional tensor
+        that indicates, per feature, whether to keep original indices. If not
+        None, the operator uses this tensor to determine whether to keep
+        original indices for each feature; if None, it falls back to
+        `keep_orig_idx`
+
+Return:
+    A tuple of tensors containing
+
+    (1) Bucketized lengths. Shape is `lengths.numel() * my_size`.
+
+    (2) Bucketized indices. Same shape as `indices`.
+
+    (3) Bucketized weights or None if `weights` is None. Shape is
+        `[indices.size(0), weights_dim]`.
+
+    (4) Bucketized positions or None if `bucketize_pos=False`. Same shape as
+        `indices`.
+
+    (5) `unbucketize_permute` or None if `sequence=False`. Same shape as
+        `indices`
+
+**Example**:
+
+    >>> # Generate input example. Batch size = 2. Number of features = 4
+    >>> lengths = torch.tensor([0, 2, 1, 3, 2, 3, 3, 1], dtype=torch.int, device="cuda")
+    >>> indices = torch.tensor([3, 4, 15, 11, 28, 29, 1, 10, 11, 12, 13, 11, 22, 20, 20], dtype=torch.int, device="cuda")
+    >>> block_sizes = torch.tensor([[5, 15, 10, 20]], dtype=torch.int, device="cuda")
+    >>> my_size = 2  # Number of buckets
+    >>> weights_dim = 3  # Dimension of weight values for each index
+    >>> weights = torch.randn(indices.size(0), weights_dim, dtype=torch.float, device="cuda")
+    >>> # Invoke with keep_orig_idx=False, bucketize_pos=False, and
+    >>> # sequence=False
+    >>> torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights(
+    >>>     lengths,
+    >>>     indices,
+    >>>     bucketize_pos=False,
+    >>>     sequence=False,
+    >>>     block_sizes=block_sizes,
+    >>>     my_size=my_size,
+    >>>     weights=weights,
+    >>>     weights_dim=weights_dim,
+    >>>     keep_orig_idx=False)
+    """,
+)
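
Reading the docstring's example as a worked case: `lengths` has `B * T = 2 * 4 = 8` elements and `my_size = 2`, so the returned bucketized lengths tensor has `8 * 2 = 16` elements; the bucketized indices keep the input's 15 elements; and, since `weights_dim = 3`, the bucketized weights come back with shape `[15, 3]`.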
fbgemm_gpu/docs/target.genai.json.py ADDED
@@ -0,0 +1,6 @@
+
+{
+    "version": "2025.11.4",
+    "target": "genai",
+    "variant": "cuda"
+}
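
Despite the `.json.py` suffix, the payload is plain JSON, which is why `_load_target_info` in `__init__.py` above can read it with `json.load`. A hypothetical standalone reader along the same lines (the path and helper name are illustrative):

    import json
    import os

    def read_target_info(pkg_dir: str, target: str) -> dict[str, str]:
        # Mirrors _load_target_info: a missing or unparsable file yields {}.
        filepath = os.path.join(pkg_dir, "docs", f"target.{target}.json.py")
        try:
            with open(filepath, "r") as file:
                return json.load(file)
        except Exception:
            return {}

    info = read_target_info("/path/to/site-packages/fbgemm_gpu", "genai")
    # With the file above, info would be:
    #   {"version": "2025.11.4", "target": "genai", "variant": "cuda"}
    # from which __version__ and __variant__ are then taken.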
fbgemm_gpu/enums.py CHANGED
@@ -8,14 +8,13 @@
 # pyre-strict
 
 import enum
-import typing
-from typing import Any, Callable, List, Tuple
+from typing import Any, Callable
 
 
 # Create enums in given namespace with information from query_op
 def create_enums(
-    namespace: typing.Dict[str, Any],
-    query_op: Callable[[], List[Tuple[str, List[Tuple[str, int]]]]],
+    namespace: dict[str, Any],
+    query_op: Callable[[], list[tuple[str, list[tuple[str, int]]]]],
 ) -> None:
     for enum_name, items in query_op():
         # Create matching python enumeration
fbgemm_gpu/experimental/example/__init__.py CHANGED
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py CHANGED
@@ -11,9 +11,5 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False