fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/__init__.py CHANGED
@@ -5,17 +5,106 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
+ import json
  import logging
  import os
+ import re
 
  import torch
 
+ # Based on the FBGEMM-PyTorch compatibility table at
+ # https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+ _fbgemm_torch_compat_table = {
+     "1.5": "2.10",
+     "1.4": "2.9",
+     "1.3": "2.8",
+     "1.2": "2.7",
+     "1.1": "2.6",
+     "1.0": "2.5",
+     "0.8": "2.4",
+     "0.7": "2.3",
+     "0.6": "2.2",
+     "0.5": "2.1",
+     "0.4": "2.0",
+ }
+
+
+ def _load_target_info(target: str) -> dict[str, str]:
+     try:
+         filepath = os.path.join(
+             os.path.dirname(__file__), "docs", f"target.{target}.json.py"
+         )
+         with open(filepath, "r") as file:
+             data = json.load(file)
+     except Exception:
+         data = {}
+
+     return data
 
- def _load_library(filename: str, no_throw: bool = False) -> None:
+
+ def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
      """Load a shared library from the given filename."""
+
+     # Check if the version of PyTorch is compatible with the version of FBGEMM
+     # that we are trying to load, and print a loud warning if not. This is
+     # useful for the OSS build, where we have a single FBGEMM library that is
+     # compatible with multiple versions of PyTorch.
+     #
+     # Based on: https://github.com/pytorch/ao/blob/main/torchao/__init__.py#L30
+
+     keys = [
+         key
+         for key in _fbgemm_torch_compat_table.keys()
+         if version.startswith(f"{key}.")
+     ]
+
+     if version == "INTERNAL" or "+git" in version:
+         # if FBGEMM version has "+git", assume it's locally built and we don't know
+         # anything about the PyTorch version used to build it
+         logging.info(
+             "FBGEMM version is INTERNAL or local, ignoring version compatibility check with PyTorch"
+         )
+
+     elif re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", version):
+         # if FBGEMM version is a date, assume it's a nightly build and that we
+         # know what we're doing
+         logging.info(
+             "FBGEMM version is a nightly version, ignoring version compatibility check with PyTorch"
+         )
+
+     elif not keys:
+         # fmt: off
+         logging.warning(
+             f"""
+             \033[33m
+             _fbgemm_torch_compat_table has no entry for {version} of FBGEMM;
+             cannot determine compatibility with PyTorch {torch.__version__}
+             \033[0m
+             """
+         )
+         # fmt: on
+
+     elif not str(torch.__version__).startswith(_fbgemm_torch_compat_table[keys[0]]):
+         # fmt: off
+         logging.warning(
+             f"""
+             \033[31m
+             FBGEMM_GPU version is {version}, which is not guaranteed to be
+             compatible with PyTorch {torch.__version__}; library loading might
+             crash!
+
+             Please refer to
+             https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+             for the FBGEMM-PyTorch compatibility table.
+             \033[0m
+             """
+         )
+         # fmt: on
+
      try:
          torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
          logging.info(f"Successfully loaded: '{filename}'")
+
      except Exception as error:
          logging.error(f"Could not load the library '{filename}'!\n\n\n{error}\n\n\n")
          if not no_throw:
@@ -29,13 +118,15 @@ open_source: bool = True
  # Trigger the manual addition of docstrings to pybind11-generated operators
  import fbgemm_gpu.docs # noqa: F401, E402
 
+ __targets_infos__ = {
+     target: _load_target_info(target) for target in ["default", "genai", "hstu"]
+ }
+ __targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
+
  try:
-     # Export the version string from the version file auto-generated by setup.py
-     from fbgemm_gpu.docs.version import ( # noqa: F401, E402
-         __target__,
-         __variant__,
-         __version__,
-     )
+     __target__, __info__ = next(iter(__targets_infos__.items()))
+     __variant__ = __info__["variant"]
+     __version__ = __info__["version"]
  except Exception:
      __variant__: str = "INTERNAL"
      __version__: str = "INTERNAL"
@@ -45,6 +136,7 @@ fbgemm_gpu_libraries = [
      "fbgemm_gpu_config",
      "fbgemm_gpu_tbe_utils",
      "fbgemm_gpu_tbe_index_select",
+     "fbgemm_gpu_tbe_cache",
      "fbgemm_gpu_tbe_optimizers",
      "fbgemm_gpu_tbe_inference",
      "fbgemm_gpu_tbe_training_forward",
@@ -76,18 +168,19 @@ libraries_to_load = {
      "genai": fbgemm_genai_libraries,
  }
 
- for library in libraries_to_load.get(__target__, []):
-     # NOTE: In all cases, we want to throw an error if we cannot load the
-     # library. However, this appears to break the OSS documentation build,
-     # where the Python documentation doesn't show up in the generated docs.
-     #
-     # To work around this problem, we introduce a fake build variant called
-     # `docs` and we only throw a library load error when the variant is not
-     # `docs`. For more information, see:
-     #
-     # https://github.com/pytorch/FBGEMM/pull/3477
-     # https://github.com/pytorch/FBGEMM/pull/3717
-     _load_library(f"{library}.so", __variant__ == "docs")
+ for target, info in __targets_infos__.items():
+     for library in libraries_to_load.get(target, []):
+         # NOTE: In all cases, we want to throw an error if we cannot load the
+         # library. However, this appears to break the OSS documentation build,
+         # where the Python documentation doesn't show up in the generated docs.
+         #
+         # To work around this problem, we introduce a fake build variant called
+         # `docs` and we only throw a library load error when the variant is not
+         # `docs`. For more information, see:
+         #
+         # https://github.com/pytorch/FBGEMM/pull/3477
+         # https://github.com/pytorch/FBGEMM/pull/3717
+         _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
 
  try:
      # Trigger meta operator registrations
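For context, the compatibility check added above boils down to a prefix match of the FBGEMM release series against the table, with internal/local and nightly (date-based) builds skipped. A standalone sketch of the same logic, not the packaged function, with a deliberately truncated table:

import re

COMPAT = {"1.4": "2.9", "1.3": "2.8"}  # FBGEMM release series -> compatible PyTorch series

def check(fbgemm_version: str, torch_version: str) -> str:
    if fbgemm_version == "INTERNAL" or "+git" in fbgemm_version:
        return "skip: internal or locally built"
    if re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", fbgemm_version):
        return "skip: nightly (date-based) version"
    series = [k for k in COMPAT if fbgemm_version.startswith(f"{k}.")]
    if not series:
        return "warn: unknown FBGEMM release series"
    if not torch_version.startswith(COMPAT[series[0]]):
        return "warn: possible FBGEMM/PyTorch mismatch"
    return "ok"

print(check("2026.1.29", "2.10.0"))  # skip: nightly (date-based) version
print(check("1.3.0", "2.9.0"))       # warn: possible FBGEMM/PyTorch mismatch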
fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/batched_unary_embeddings_ops.py CHANGED
@@ -9,10 +9,10 @@
 
 
  from math import sqrt
- from typing import List
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.utils.loader import load_torch_module
 
  try:
@@ -22,7 +22,7 @@ except Exception:
      load_torch_module("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
 
 
- def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
+ def wrap_weight_to_parameter(weights: list[torch.Tensor]) -> list[torch.Tensor]:
      for i, v in enumerate(weights):
          if not isinstance(v, torch.nn.Parameter):
              weights[i] = torch.nn.Parameter(v)
@@ -31,7 +31,7 @@ def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
 
  class BatchedUnaryEmbeddingBag(torch.nn.Module):
      # pyre-fixme[3]: Return type must be annotated.
-     def __init__(self, num_tasks: int, hash_sizes: List[int], long_index: bool = False):
+     def __init__(self, num_tasks: int, hash_sizes: list[int], long_index: bool = False):
          super().__init__()
          self.num_tasks = num_tasks
          self.hash_sizes = hash_sizes
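The wrap_weight_to_parameter helper touched in this hunk only promotes plain tensors to nn.Parameter in place; a quick illustration:

import torch
from fbgemm_gpu.batched_unary_embeddings_ops import wrap_weight_to_parameter

weights = [torch.randn(4, 8), torch.nn.Parameter(torch.randn(4, 8))]
weights = wrap_weight_to_parameter(weights)
assert all(isinstance(w, torch.nn.Parameter) for w in weights)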
fbgemm_gpu/config/feature_list.py CHANGED
@@ -11,7 +11,7 @@ from enum import auto, Enum
  import torch
 
  try:
-     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp")
+     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp_torch_op")
  except Exception:
      import fbgemm_gpu # noqa F401
 
@@ -60,6 +60,12 @@ class FeatureGateName(Enum):
      # Enable bounds_check_indices_v2
      BOUNDS_CHECK_INDICES_V2 = auto()
 
+     # Enable TBE input parameters extraction
+     TBE_REPORT_INPUT_PARAMS = auto()
+
+     # Enable tuned max segment length per CTA for B200
+     TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200 = auto()
+
      def is_enabled(self) -> bool:
          return FeatureGate.is_enabled(self)
 
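The new gates are queried the same way as the existing ones, via the is_enabled() method shown above. A minimal sketch, assuming the usual re-export of FeatureGateName from fbgemm_gpu.config:

from fbgemm_gpu.config import FeatureGateName

# Gates introduced in this release
if FeatureGateName.TBE_REPORT_INPUT_PARAMS.is_enabled():
    print("TBE input-parameter reporting is enabled")
if FeatureGateName.TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200.is_enabled():
    print("Tuned max segment lengths per CTA for B200 are enabled")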
fbgemm_gpu/docs/jagged_tensor_ops.py CHANGED
@@ -9,7 +9,6 @@ import torch
 
  from .common import add_docs
 
-
  add_docs(
      torch.ops.fbgemm.jagged_2d_to_dense,
      """
fbgemm_gpu/docs/sparse_ops.py CHANGED
@@ -496,3 +496,121 @@
              None)
      """,
  )
+
+ add_docs(
+     torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights,
+     """
+ block_bucketize_sparse_features_2d_weights(lengths, indices, bucketize_pos, sequence, block_sizes, my_size, weights, weights_dim=1, batch_size_per_feature=None, max_B= -1, block_bucketize_pos=None, keep_orig_idx=False, total_num_blocks=None, keep_orig_idx_per_feature=None) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]
+
+ Preprocess sparse features by partitioning sparse features into multiple
+ buckets with support for 2D weights. Every feature is split into the same number of buckets, but the bucket
+ sizes (widths) for the different features can be different. Moreover, the
+ bucket sizes within each feature can be different.
+
+ This function is similar to block_bucketize_sparse_features but supports 2D weights,
+ where each index can have multiple weight values associated with it.
+
+ Args:
+     lengths (Tensor): The lengths of the sparse features. The tensor contains
+         the lengths of each sample in a batch and each feature. Shape is `B *
+         T` where `B` is the batch size and `T` is the number of features
+
+     indices (Tensor): The sparse data. Only support integer types. Shape is the
+         sum of `lengths`
+
+     bucketize_pos (bool): If True, return the original relative indices within
+         a sample. For example, `indices = [9, 8, 2, 1, 0, 8, 9]` and `lengths =
+         [3, 4]`. The original relative indices within a sample for the indices
+         are `[0, 1, 2, 0, 1, 2, 3]`
+
+     sequence (bool): If True, return the new indices positions in the original
+         indices positions (the tensor is called `unbucketize_permute_data`).
+
+     block_sizes (Tensor): This tensor is used for the case where the bucket
+         size within a feature is uniform (i.e., when
+         `block_bucketize_pos=None`). The tensor contains bucket sizes (i.e.,
+         bucket widths) for each feature. `block_sizes[t]` represents the
+         bucket size of feature `t`. Shape is the number of features.
+
+     my_size (int): The number of buckets for each feature. Note that every
+         feature has the same number of buckets.
+
+     weights (Tensor): A float tensor that will be bucketized the same way as
+         `indices`. This tensor must have shape `[indices.size(0), weights_dim]`
+         where `weights_dim` is the dimension of the weight values for each index.
+
+     weights_dim (int = 1): The dimension of the weight values for each index.
+         This parameter is only used when `weights` is not None.
+
+     batch_size_per_feature (Optional[Tensor] = None): An optional tensor that
+         contains batch sizes for different features. If not None, batch sizes
+         are not uniform among features. Otherwise, the operator will assume
+         that the batch size is uniform and infer it from the `lengths` and
+         `block_sizes` tensors
+
+     max_B (int = -1): The max batch size. Must be set if
+         `batch_size_per_feature` is not None
+
+     block_bucketize_pos (Optional[List[Tensor]] = None): The input is used for
+         non-uniform bucket sizes within a feature. `block_bucketize_pos` is a
+         list of tensors. Each tensor contains the range offsets of buckets for
+         each feature. These range offsets are equivalent to the complete
+         cumulative sum of the bucket sizes. For example, `[0, 4, 20]` represents
+         two buckets. The first bucket size is `(4 - 0) = 4`, and the second
+         bucket size is `(20 - 4) = 16`. The length of `block_bucketize_pos`
+         must be equal to the number of features.
+
+     keep_orig_idx (bool = False): If True, return original indices instead of
+         the relative indices within each bucket
+
+     total_num_blocks (Optional[torch.Tensor] = None): An optional tensor that
+         contains then number of logical buckets (aka blocks) within a given
+         feature. This is useful for applications where the number of buckets
+         is more than the number of physical GPUs, which is common in cases
+         where we scale up/down the number of GPUs but want to maintain
+         same numerical behavior.
+
+     keep_orig_idx_per_feature (Optional[Tensor] = None): An optional tensor that
+         contains whether to keep original indices for each feature. If not None,
+         the operator will use this tensor to determine whether to keep original
+         indices for each feature. if None, will fallback to `keep_orig_idx`
+
+ Return:
+     A tuple of tensors containing
+
+     (1) Bucketized lengths. Shape is `lengths.num() * my_size`.
+
+     (2) Bucketized indices. Same shape as `indices`.
+
+     (3) Bucketized weights or None if `weights` is None. Shape is
+         `[indices.size(0), weights_dim]`.
+
+     (4) Bucketized positions or None if `bucketize_pos=False`. Same shape as
+         `indices`.
+
+     (5) `unbucketize_permute` or None if `sequence=False`. Same shape as
+         `indices`
+
+ **Example**:
+
+     >>> # Generate input example. Batch size = 2. Number of features = 4
+     >>> lengths = torch.tensor([0, 2, 1, 3, 2, 3, 3, 1], dtype=torch.int, device="cuda")
+     >>> indices = torch.tensor([3, 4, 15, 11, 28, 29, 1, 10, 11, 12, 13, 11, 22, 20, 20], dtype=torch.int, device="cuda")
+     >>> block_sizes = torch.tensor([[5, 15, 10, 20]], dtype=torch.int, device="cuda")
+     >>> my_size = 2 # Number of buckets
+     >>> weights_dim = 3 # Dimension of weight values for each index
+     >>> weights = torch.randn(indices.size(0), weights_dim, dtype=torch.float, device="cuda")
+     >>> # Invoke with keep_orig_idx=False, bucketize_pos=False, and
+     >>> # sequence=False
+     >>> torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights(
+     >>>     lengths,
+     >>>     indices,
+     >>>     bucketize_pos=False,
+     >>>     sequence=False,
+     >>>     block_sizes=block_sizes,
+     >>>     my_size=my_size,
+     >>>     weights=weights,
+     >>>     weights_dim=weights_dim,
+     >>>     keep_orig_idx=False)
+     """,
+ )
fbgemm_gpu/docs/target.default.json.py ADDED
@@ -0,0 +1,6 @@
+
+ {
+     "version": "2026.1.29",
+     "target": "default",
+     "variant": "cpu"
+ }
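This new file carries the metadata that _load_target_info() in __init__.py (first hunk above) reads at import time; the parsed values then surface as module attributes:

import fbgemm_gpu

print(fbgemm_gpu.__version__)  # "2026.1.29" for this wheel
print(fbgemm_gpu.__variant__)  # "cpu"
print(fbgemm_gpu.__target__)   # "default"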
fbgemm_gpu/enums.py CHANGED
@@ -8,14 +8,13 @@
  # pyre-strict
 
  import enum
- import typing
- from typing import Any, Callable, List, Tuple
+ from typing import Any, Callable
 
 
  # Create enums in given namespace with information from query_op
  def create_enums(
-     namespace: typing.Dict[str, Any],
-     query_op: Callable[[], List[Tuple[str, List[Tuple[str, int]]]]],
+     namespace: dict[str, Any],
+     query_op: Callable[[], list[tuple[str, list[tuple[str, int]]]]],
  ) -> None:
      for enum_name, items in query_op():
          # Create matching python enumeration
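create_enums() builds Python enums from whatever the query op reports; the updated annotations pin down the expected shape of that report. A sketch with a hypothetical query_op:

from fbgemm_gpu.enums import create_enums

def query_op() -> list[tuple[str, list[tuple[str, int]]]]:
    # Each entry: (enum class name, [(member name, member value), ...])
    return [("Color", [("RED", 0), ("GREEN", 1)])]

# Creates a matching `Color` enumeration in this module's namespace
create_enums(globals(), query_op)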
fbgemm_gpu/fbgemm.so CHANGED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
fbgemm_gpu/permute_pooled_embedding_modules.py CHANGED
@@ -8,10 +8,11 @@
  # pyre-strict
 
  from itertools import accumulate
- from typing import List, Optional
+ from typing import Optional
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.utils.loader import load_torch_module
 
  try:
@@ -93,8 +94,8 @@ class PermutePooledEmbeddings:
 
      def __init__(
          self,
-         embs_dims: List[int],
-         permute: List[int],
+         embs_dims: list[int],
+         permute: list[int],
          device: Optional[torch.device] = None,
      ) -> None:
          self._offset_dim_list: torch.Tensor = torch.tensor(
@@ -105,7 +106,7 @@ class PermutePooledEmbeddings:
              permute, device=device, dtype=torch.int64
          )
 
-         inv_permute: List[int] = [0] * len(permute)
+         inv_permute: list[int] = [0] * len(permute)
          for i, p in enumerate(permute):
              inv_permute[p] = i
 
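The inv_permute bookkeeping above is a plain inverse permutation, computed once in the constructor. For example:

permute = [2, 0, 1]        # output group i is taken from input group permute[i]
inv_permute = [0] * len(permute)
for i, p in enumerate(permute):
    inv_permute[p] = i
print(inv_permute)         # [1, 2, 0]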
fbgemm_gpu/permute_pooled_embedding_modules_split.py CHANGED
@@ -9,7 +9,7 @@
 
  import logging
  from itertools import accumulate
- from typing import List, Optional
+ from typing import Optional
 
  import torch
  from torch import nn
@@ -34,8 +34,8 @@ def _fx_wrap_tensor_to_device(t: torch.Tensor, device: torch.device) -> torch.Te
  class PermutePooledEmbeddingsSplit(nn.Module):
      def __init__(
          self,
-         embs_dims: List[int],
-         permute: List[int],
+         embs_dims: list[int],
+         permute: list[int],
          device: Optional[torch.device] = None,
      ) -> None:
          super(PermutePooledEmbeddingsSplit, self).__init__()
@@ -51,7 +51,7 @@ class PermutePooledEmbeddingsSplit(nn.Module):
              "_permute", torch.tensor(permute, device=device, dtype=torch.int64)
          )
 
-         inv_permute: List[int] = [0] * len(permute)
+         inv_permute: list[int] = [0] * len(permute)
          for i, p in enumerate(permute):
              inv_permute[p] = i
 
fbgemm_gpu/quantize/__init__.py CHANGED
@@ -11,6 +11,7 @@ from fbgemm_gpu.utils import TorchLibraryFragment
 
  lib = TorchLibraryFragment("fbgemm")
 
+ # fmt: off
  lib.define(
      """quantize_mx(
          Tensor input,
@@ -41,3 +42,4 @@ lib.register(
      "dequantize_mx",
      {"CUDA": dequantize_mx, "CPU": dequantize_mx},
  )
+ # fmt: on
fbgemm_gpu/quantize/quantize_ops.py CHANGED
@@ -9,6 +9,7 @@ from typing import Union
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_fp32, RoundingMode
 
 
fbgemm_gpu/quantize_comm.py CHANGED
@@ -13,10 +13,11 @@
 
 
  import logging
- from typing import List, Optional, Tuple, TypeVar
+ from typing import Optional, TypeVar
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.quantize_utils import (
      bf16_to_fp32,
      fp16_to_fp32,
@@ -25,12 +26,10 @@ from fbgemm_gpu.quantize_utils import (
      fp32_to_hfp8_with_clamp,
      fp32_to_mx4,
      hfp8_to_fp32,
-     mx4_to_fp32,
+     mx4_to_float,
      RoundingMode,
  )
-
  from fbgemm_gpu.split_embedding_configs import SparseType
-
  from torch.autograd.profiler import record_function # usort:skip
  from dataclasses import dataclass
 
@@ -66,8 +65,8 @@ class QuantizationContext:
      row_dim: int = ROW_DIM_DEFAULT
      row_dim_quant: int = -1
      mx_group_size: int = MX_GROUP_SIZE_DEFAULT
-     rounding_mode: RoundingMode = RoundingMode.even
-     padded_dim_sum_per_rank: Optional[List[int]] = None
+     rounding_mode: Optional[RoundingMode] = RoundingMode.even
+     padded_dim_sum_per_rank: Optional[list[int]] = None
 
 
  def _quantize_tensor(
@@ -123,6 +122,7 @@ def _dequantize_tensor(
      comm_precision: SparseType,
      ctx: Optional[QuantizationContext] = None,
      is_fwd: bool = True,
+     output_dtype: Optional[SparseType] = None,
  ) -> torch.Tensor:
      if comm_precision == SparseType.FP32:
          assert quantized_tensor.dtype == torch.float
@@ -137,8 +137,12 @@
          if ctx is not None and ctx.row_dim > 0:
              row_dim_quant = ctx.row_dim_quant
              quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
+             # use provided output_dtype or default to FP32 (0)
+             output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
              dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
-                 quantized_tensor_2d, is_fwd
+                 quantized_tensor_2d,
+                 is_fwd,
+                 output_dtype_int,
              )
              return dequant_tensor.view(-1)
          else:
@@ -154,7 +158,7 @@
          return dequant_tensor.view(-1)
      elif comm_precision == SparseType.MX4:
          mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
-         return mx4_to_fp32(quantized_tensor, mx_group_size)
+         return mx4_to_float(quantized_tensor, mx_group_size, output_dtype=output_dtype)
      else:
          raise ValueError(f"comm_precision={comm_precision} is not supported")
 
@@ -167,6 +171,8 @@ class QuantizedCommCodec:
          loss_scale: Optional[float] = None,
          row_dim: Optional[int] = None,
          is_fwd: bool = True,
+         rounding_mode: Optional[RoundingMode] = None,
+         output_dtype: Optional[SparseType] = None,
      ) -> None:
          if loss_scale is not None:
              if comm_precision not in [SparseType.FP16, SparseType.BF16]:
@@ -183,8 +189,13 @@
          self._loss_scale = loss_scale
          self._is_fwd = is_fwd
          self._row_dim: int = -1 if row_dim is None else row_dim
+         self._rounding_mode: Optional[RoundingMode] = rounding_mode
+         self._output_dtype: Optional[SparseType] = output_dtype
          if self._comm_precision == SparseType.MX4:
              self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
+             self._rounding_mode = (
+                 RoundingMode.even if rounding_mode is None else rounding_mode
+             )
 
      def encode(
          self, input_tensor: torch.Tensor, ctx: Optional[QuantizationContext] = None
@@ -211,7 +222,11 @@
              f"## decoder {self._comm_precision} {self._loss_scale} ##"
          ):
              dequantized_tensor = _dequantize_tensor(
-                 input_tensor, self._comm_precision, ctx, self._is_fwd
+                 input_tensor,
+                 self._comm_precision,
+                 ctx,
+                 self._is_fwd,
+                 output_dtype=self._output_dtype,
              )
              return dequantized_tensor
 
@@ -258,7 +273,9 @@
              return QuantizationContext(self._row_dim)
          if self._comm_precision == SparseType.MX4:
              return QuantizationContext(
-                 row_dim=self._row_dim, mx_group_size=self._row_dim
+                 row_dim=self._row_dim,
+                 mx_group_size=self._row_dim,
+                 rounding_mode=self._rounding_mode,
              )
          # int8 rowwise is default
          return QuantizationContext()
@@ -266,10 +283,10 @@
      def padded_size(
          self,
          input_tensor: torch.Tensor,
-         dim_per_rank: List[int],
+         dim_per_rank: list[int],
          my_rank: int,
          qcomm_ctx: QuantizationContext,
-     ) -> Tuple[int, int]:
+     ) -> tuple[int, int]:
          if input_tensor.ndim == 1:
              return input_tensor.shape[0], 0
          # return padded size for the feature dimension (dim 1), 0 if no padding needed.
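Putting the new knobs together: rounding_mode is carried into the MX4 QuantizationContext, and output_dtype flows through decode() into _dequantize_tensor(), so decoded tensors no longer have to come back as FP32. A minimal sketch; the positional comm_precision argument and the decode()/create_context() method names are assumed from the existing codec API rather than shown in these hunks:

import torch
from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.quantize_utils import RoundingMode
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(
    SparseType.MX4,                    # assumed first positional argument
    rounding_mode=RoundingMode.even,   # forwarded into the MX4 QuantizationContext
    output_dtype=SparseType.BF16,      # ask decode() for BF16 instead of FP32
)
ctx = codec.create_context()
encoded = codec.encode(torch.randn(1024), ctx)
decoded = codec.decode(encoded, ctx)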