PyPI - fbgemm-gpu-nightly-cpu - Versions diffs - 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl - Mend

fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (106)

fbgemm_gpu/__init__.py +118 -23
fbgemm_gpu/asmjit.so +0 -0
fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
fbgemm_gpu/config/feature_list.py +7 -1
fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
fbgemm_gpu/docs/sparse_ops.py +142 -1
fbgemm_gpu/docs/target.default.json.py +6 -0
fbgemm_gpu/enums.py +3 -4
fbgemm_gpu/fbgemm.so +0 -0
fbgemm_gpu/fbgemm_gpu_config.so +0 -0
fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
fbgemm_gpu/fbgemm_gpu_py.so +0 -0
fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
fbgemm_gpu/quantize/__init__.py +2 -0
fbgemm_gpu/quantize/quantize_ops.py +1 -0
fbgemm_gpu/quantize_comm.py +29 -12
fbgemm_gpu/quantize_utils.py +88 -8
fbgemm_gpu/runtime_monitor.py +9 -5
fbgemm_gpu/sll/__init__.py +3 -0
fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
fbgemm_gpu/sll/triton/__init__.py +0 -10
fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
fbgemm_gpu/sparse_ops.py +244 -76
fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
fbgemm_gpu/split_embedding_configs.py +287 -3
fbgemm_gpu/split_embedding_inference_converter.py +7 -6
fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
fbgemm_gpu/tbe/bench/__init__.py +13 -2
fbgemm_gpu/tbe/bench/bench_config.py +37 -9
fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
fbgemm_gpu/tbe/bench/utils.py +129 -5
fbgemm_gpu/tbe/cache/__init__.py +1 -0
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
fbgemm_gpu/tbe/ssd/common.py +27 -0
fbgemm_gpu/tbe/ssd/inference.py +15 -15
fbgemm_gpu/tbe/ssd/training.py +2930 -195
fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
fbgemm_gpu/tbe/stats/__init__.py +10 -0
fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
fbgemm_gpu/tbe/utils/offsets.py +6 -6
fbgemm_gpu/tbe/utils/quantize.py +8 -8
fbgemm_gpu/tbe/utils/requests.py +53 -28
fbgemm_gpu/tbe_input_multiplexer.py +16 -7
fbgemm_gpu/triton/common.py +0 -1
fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
fbgemm_gpu/triton/quantize.py +14 -9
fbgemm_gpu/utils/filestore.py +56 -5
fbgemm_gpu/utils/torch_library.py +2 -2
fbgemm_gpu/utils/writeback_util.py +124 -0
fbgemm_gpu/uvm.py +3 -0
{fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
list_versions/cli_run.py +161 -0
fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
{fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0

fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py CHANGED Viewed

@@ -33,7 +33,7 @@ class PartiallyMaterializedTensor:
     or use `full_tensor()` to get the full tensor (this could OOM).
     """
-    def __init__(self, wrapped) -> None:
+    def __init__(self, wrapped, is_virtual: bool = False) -> None:
         """
         Ensure caller loads the module before creating this object.
@@ -48,6 +48,7 @@ class PartiallyMaterializedTensor:
             wrapped: torch.classes.fbgemm.KVTensorWrapper
         """
         self._wrapped = wrapped
+        self._is_virtual = is_virtual
         self._requires_grad = False
     @property
@@ -57,6 +58,17 @@ class PartiallyMaterializedTensor:
         """
         return self._wrapped
+    @property
+    def is_virtual(self):
+        """
+        Indicate whether PMT is a virtual tensor.
+        This indicator is needed for checkpoint or publish.
+        They need to know whether it is a PMT for kvzch or for a normal emb table;
+        for kvzch, checkpoint and publish need to call all-gather to recalculate the correct
+        metadata of the ShardedTensor.
+        """
+        return self._is_virtual
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         if kwargs is None:
@@ -75,6 +87,18 @@ class PartiallyMaterializedTensor:
         """
         return self._wrapped.narrow(dim, start, length)
+    def set_weights_and_ids(self, weights: torch.Tensor, ids: torch.Tensor) -> None:
+        self._wrapped.set_weights_and_ids(weights, ids)
+    def get_weights_by_ids(self, ids: torch.Tensor) -> torch.Tensor:
+        return self._wrapped.get_weights_by_ids(ids)
+    def __reduce__(self):
+        return (
+            PartiallyMaterializedTensor,
+            (self._wrapped,),
+        )
     def full_tensor(self) -> torch.Tensor:
         """
         This loads the full tensor into memory (may OOM).
@@ -141,6 +165,8 @@ class PartiallyMaterializedTensor:
     @property
     def dtype(self) -> torch.dtype:
+        if isinstance(self._wrapped, torch.Tensor):
+            return self._wrapped.dtype
         mapping = {"c10::Half": "half"}
         dtype_str: str = self._wrapped.dtype_str
         dtype_str = mapping.get(dtype_str, dtype_str)
@@ -151,6 +177,8 @@ class PartiallyMaterializedTensor:
     @property
     def device(self) -> torch.device:
+        if isinstance(self._wrapped, torch.Tensor):
+            return self._wrapped.device
         device_str: str = self._wrapped.device_str
         device = torch.device(device_str)
         assert isinstance(device, torch.device)
@@ -158,11 +186,11 @@ class PartiallyMaterializedTensor:
     @property
     def layout(self) -> torch.layout:
-        pass
+        if isinstance(self._wrapped, torch.Tensor):
+            return self._wrapped.layout
         layout_str_mapping = {
             "SparseCsr": "sparse_csr",
             "Strided": "strided",
-            "SparseCsr": "sparse_csr",
             "SparseCsc": "sparse_csc",
             "Jagged": "jagged",
         }
@@ -220,6 +248,9 @@ class PartiallyMaterializedTensor:
         return torch.equal(tensor1.full_tensor(), tensor2.full_tensor())
+    def get_kvtensor_serializable_metadata(self) -> list[str]:
+        return self._wrapped.get_kvtensor_serializable_metadata()
     def __hash__(self):
         return id(self)

fbgemm_gpu/tbe/stats/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+from .bench_params_reporter import TBEBenchmarkParamsReporter  # noqa F401

fbgemm_gpu/tbe/stats/bench_params_reporter.py ADDED Viewed

@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import io
+import json
+import logging
+import os
+from typing import List, Optional, Tuple
+import fbgemm_gpu  # noqa F401
+import torch  # usort:skip
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
+    BatchParams,
+    IndicesParams,
+    PoolingParams,
+    TBEDataConfig,
+)
+open_source: bool = False
+# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+open_source: bool = getattr(fbgemm_gpu, "open_source", False)
+if open_source:
+    from fbgemm_gpu.utils import FileStore
+else:
+    try:
+        from fbgemm_gpu.fb.utils.manifold_wrapper import FileStore
+        torch.ops.load_library(
+            "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_estimator"
+        )
+    except Exception:
+        pass
+class TBEBenchmarkParamsReporter:
+    """
+    TBEBenchmarkParamsReporter is responsible for extracting and reporting the configuration data of TBE processes.
+    """
+    def __init__(
+        self,
+        report_interval: int,
+        report_iter_start: int = 0,
+        report_iter_end: int = -1,
+        bucket: Optional[str] = None,
+        path_prefix: Optional[str] = None,
+    ) -> None:
+        """
+        Initializes the TBEBenchmarkParamsReporter with the specified parameters.
+        Args:
+            report_interval (int): The interval at which reports are generated.
+            report_iter_start (int): The start of the iteration range to capture. Defaults to 0.
+            report_iter_end (int): The end of the iteration range to capture. Defaults to -1 (last iteration).
+            bucket (Optional[str], optional): The storage bucket for reports. Defaults to None.
+            path_prefix (Optional[str], optional): The path prefix for report storage. Defaults to None.
+        """
+        assert report_interval > 0, "report_interval must be greater than 0"
+        assert (
+            report_iter_start >= 0
+        ), "report_iter_start must be greater than or equal to 0"
+        assert (
+            report_iter_end >= -1
+        ), "report_iter_end must be greater than or equal to -1"
+        assert (
+            report_iter_end == -1 or report_iter_start <= report_iter_end
+        ), "report_iter_start must be less than or equal to report_iter_end"
+        self.report_interval = report_interval
+        self.report_iter_start = report_iter_start
+        self.report_iter_end = report_iter_end
+        if path_prefix is not None and path_prefix.endswith("/"):
+            path_prefix = path_prefix[:-1]
+        self.path_prefix = path_prefix
+        default_bucket = "/tmp" if open_source else "tlparse_reports"
+        bucket = (
+            bucket
+            if bucket is not None
+            else os.environ.get("FBGEMM_TBE_REPORTING_BUCKET", default_bucket)
+        )
+        self.filestore = FileStore(bucket)
+        if self.path_prefix is not None and not self.filestore.exists(self.path_prefix):
+            self.filestore.create_directory(self.path_prefix)
+        self.logger: logging.Logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.INFO)
+    @classmethod
+    def create(cls) -> "TBEBenchmarkParamsReporter":
+        """
+        This method returns an instance of TBEBenchmarkParamsReporter based on environment variables.
+        If the `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` environment variable is set to a value greater than 0, it creates an instance that:
+        - Reports input parameters (TBEDataConfig).
+        - Writes the output as a JSON file.
+        Additionally, the following environment variables are considered:
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_START`: Specifies the start of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_END`: Specifies the end of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_BUCKET`: Specifies the bucket for reporting.
+        - `FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX`: Specifies the path prefix for reporting.
+        Returns:
+            TBEBenchmarkParamsReporter: An instance configured based on the environment variables.
+        """
+        report_interval = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_INTERVAL", "1")
+        )
+        report_iter_start = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_START", "0")
+        )
+        report_iter_end = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_END", "-1")
+        )
+        bucket = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_BUCKET", "")
+        path_prefix = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX", "")
+        return cls(
+            report_interval=report_interval,
+            report_iter_start=report_iter_start,
+            report_iter_end=report_iter_end,
+            bucket=bucket,
+            path_prefix=path_prefix,
+        )
+    def extract_Ls(
+        self,
+        bag_sizes: List[int],
+        Bs: List[int],
+    ) -> List[float]:
+        Ls = []
+        start = 0
+        for b in Bs:
+            end = start + b
+            avg_L = sum(bag_sizes[start:end]) / b if b > 0 else 0
+            start = end
+            Ls.append(avg_L)
+        return Ls
+    def extract_params(
+        self,
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
+        indices: torch.Tensor,
+        offsets: torch.Tensor,
+        per_sample_weights: Optional[torch.Tensor] = None,
+        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        Es: Optional[List[int]] = None,
+        Ds: Optional[List[int]] = None,
+        embedding_specs: Optional[List[Tuple[int, int]]] = None,
+        feature_table_map: Optional[List[int]] = None,
+    ) -> TBEDataConfig:
+        """
+        Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
+        Args:
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
+            indices (torch.Tensor): The input indices tensor.
+            offsets (torch.Tensor): The input offsets tensor.
+            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
+            batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
+        Returns:
+            TBEDataConfig: The configuration data for TBE benchmarking.
+        """
+        Es = feature_rows.tolist()
+        Ds = feature_dims.tolist()
+        assert len(Es) == len(
+            Ds
+        ), "feature_rows and feature_dims must have the same length"
+        # Transfer indices back to CPU for EEG analysis
+        indices_cpu = indices.cpu()
+        # Set T to be the number of features we are looking at
+        T = len(Ds)
+        # Set E to be the mean of the rowcounts to avoid biasing
+        E = (
+            Es[0]
+            if len(set(Es)) == 1
+            else torch.ceil(
+                torch.mean(torch.tensor(feature_rows, dtype=torch.float))
+            ).item()
+        )
+        # Set mixed_dim to be True if there are multiple dims
+        mixed_dim = len(set(Ds)) > 1
+        # Set D to be the mean of the dims to avoid biasing
+        D = (
+            Ds[0]
+            if not mixed_dim
+            else torch.ceil(
+                torch.mean(torch.tensor(feature_dims, dtype=torch.float))
+            ).item()
+        )
+        # Compute indices distribution parameters
+        heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(
+            indices_cpu
+        )
+        indices_params = IndicesParams(
+            heavy_hitters, q, s, indices.dtype, offsets.dtype
+        )
+        # Compute batch parameters
+        B = int((offsets.numel() - 1) // T)
+        Bs = (
+            [sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
+            if batch_size_per_feature_per_rank
+            else [B] * T
+        )
+        batch_params = BatchParams(
+            B=B,
+            sigma_B=(
+                int(
+                    torch.ceil(
+                        torch.std(
+                            torch.tensor(
+                                [
+                                    b
+                                    for bs in batch_size_per_feature_per_rank
+                                    for b in bs
+                                ]
+                            ).float()
+                        )
+                    )
+                )
+                if batch_size_per_feature_per_rank
+                else None
+            ),
+            vbe_distribution=("normal" if batch_size_per_feature_per_rank else None),
+            vbe_num_ranks=(
+                len(batch_size_per_feature_per_rank)
+                if batch_size_per_feature_per_rank
+                else None
+            ),
+            Bs=Bs,
+        )
+        # Compute pooling parameters
+        bag_sizes = offsets[1:] - offsets[:-1]
+        if batch_size_per_feature_per_rank is None:
+            _B = int(bag_sizes.numel() // T)
+            assert _B == Bs[0], f"Expected constant batch size {Bs[0]} but got {_B}"
+        mixed_bag_sizes = len(set(bag_sizes)) > 1
+        pooling_params = PoolingParams(
+            L=(
+                int(torch.ceil(torch.mean(bag_sizes.float())))
+                if mixed_bag_sizes
+                else int(bag_sizes[0])
+            ),
+            sigma_L=(
+                int(torch.ceil(torch.std(bag_sizes.float())))
+                if mixed_bag_sizes
+                else None
+            ),
+            length_distribution=("normal" if mixed_bag_sizes else None),
+            Ls=self.extract_Ls(bag_sizes.tolist(), Bs),
+        )
+        return TBEDataConfig(
+            T=T,
+            E=E,
+            D=D,
+            mixed_dim=mixed_dim,
+            weighted=(per_sample_weights is not None),
+            batch_params=batch_params,
+            indices_params=indices_params,
+            pooling_params=pooling_params,
+            use_cpu=(not torch.cuda.is_available()),
+            Es=Es,
+            Ds=Ds,
+            embedding_specs=embedding_specs,
+            feature_table_map=feature_table_map,
+        )
+    def report_stats(
+        self,
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
+        iteration: int,
+        indices: torch.Tensor,
+        offsets: torch.Tensor,
+        op_id: str = "",
+        per_sample_weights: Optional[torch.Tensor] = None,
+        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        embedding_specs: Optional[List[Tuple[int, int]]] = None,
+        feature_table_map: Optional[List[int]] = None,
+    ) -> None:
+        """
+        Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
+        Args:
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
+            iteration (int): The current iteration number.
+            indices (torch.Tensor): The input indices tensor.
+            offsets (torch.Tensor): The input offsets tensor.
+            op_id (str, optional): The operation identifier. Defaults to an empty string.
+            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
+            batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
+            embedding_specs (Optional[List[Tuple[int, int]]]): Embedding specs. Defaults to None.
+            feature_table_map (Optional[List[int]], optional): Feature table map. Defaults to None.
+        """
+        if (
+            (iteration - self.report_iter_start) % self.report_interval == 0
+            and (iteration >= self.report_iter_start)
+            and (self.report_iter_end == -1 or iteration <= self.report_iter_end)
+        ):
+            # If indices tensor is empty (indices.numel() == 0), skip reporting
+            # TODO: Remove this once we have a better way to handle empty indices tensors
+            if indices.numel() == 0:
+                return
+            # Extract TBE config
+            config = self.extract_params(
+                feature_rows=feature_rows,
+                feature_dims=feature_dims,
+                indices=indices,
+                offsets=offsets,
+                per_sample_weights=per_sample_weights,
+                batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                Es=feature_rows.tolist(),
+                Ds=feature_dims.tolist(),
+                embedding_specs=embedding_specs,
+                feature_table_map=feature_table_map,
+            )
+            # Write the TBE config to FileStore
+            self.filestore.write(
+                f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
+                io.BytesIO(json.dumps(config.dict(), indent=2).encode()),
+            )

fbgemm_gpu/tbe/utils/offsets.py CHANGED Viewed

@@ -6,7 +6,7 @@
 # pyre-strict
-from typing import Callable, Optional, Tuple
+from typing import Callable, Optional
 import numpy as np
 import torch
@@ -21,9 +21,9 @@ def get_table_batched_offsets_from_dense(
     L: Optional[int] = None,
     total_B: Optional[int] = None,
     use_cpu: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     if L is None and total_B is None:
-        (T, B, L) = merged_indices.size()
+        T, B, L = merged_indices.size()
         total_B = T * B
     # pyre-fixme[6]: For 1st argument expected `Union[Sequence[SupportsIndex],
     #  SupportsIndex]` but got `Optional[int]`.
@@ -37,8 +37,8 @@ def get_table_batched_offsets_from_dense(
     )
-def get_offsets_from_dense(indices: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    (B, L) = indices.size()
+def get_offsets_from_dense(indices: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    B, L = indices.size()
     return (
         indices.contiguous().view(-1),
         torch.tensor(
@@ -54,7 +54,7 @@ def b_indices(
     use_cpu: bool = False,
     do_pooling: bool = True,
 ) -> torch.Tensor:
-    (indices, offsets) = get_offsets_from_dense(x)
+    indices, offsets = get_offsets_from_dense(x)
     if do_pooling:
         return b(
             to_device(indices, use_cpu),

fbgemm_gpu/tbe/utils/quantize.py CHANGED Viewed

@@ -7,7 +7,7 @@
 # pyre-strict
 # pyre-ignore-all-errors[61]
-from typing import Optional, Tuple
+from typing import Optional
 import torch
@@ -22,7 +22,7 @@ def quantize_embs(
     weight: torch.Tensor,
     weight_ty: SparseType,
     fp8_config: Optional[FP8QuantizationConfig] = None,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     weight = weight.detach()
     if weight_ty == SparseType.FP32:
         q_weight = weight.float()
@@ -91,7 +91,7 @@ def dequantize_embs(
         th_scale_shift: torch.Tensor = scale_shift.view(torch.float16).to(torch.float32)
     if weight_ty == SparseType.INT4:
-        (E, D_2) = th_weights.shape
+        E, D_2 = th_weights.shape
         D = D_2 * 2
         def comp(i: int) -> torch.Tensor:
@@ -109,7 +109,7 @@ def dequantize_embs(
         return to_device(torch.tensor(comps), use_cpu)
     elif weight_ty == SparseType.INT2:
-        (E, D_4) = th_weights.shape
+        E, D_4 = th_weights.shape
         D = D_4 * 4
         # pyre-fixme[53]: Captured variable `scale_shift` is not annotated.
@@ -129,7 +129,7 @@ def dequantize_embs(
         return to_device(torch.tensor(comps), use_cpu)
     elif weight_ty == SparseType.INT8:
-        (E, D) = th_weights.shape
+        E, D = th_weights.shape
         comps = th_weights.to(torch.float32) * th_scale_shift[:, 0].reshape(-1, 1).to(
             torch.float32
         ) + th_scale_shift[:, 1].reshape(-1, 1).to(torch.float32)
@@ -177,7 +177,7 @@ def fake_quantize_embs(
         )
     if weight_ty == SparseType.INT4:
-        (E, D_2) = th_weights.shape
+        E, D_2 = th_weights.shape
         D = D_2 * 2
         def comp(i: int) -> torch.Tensor:
@@ -195,7 +195,7 @@ def fake_quantize_embs(
         dequant_weights.copy_(to_device(comps, use_cpu))
     elif weight_ty == SparseType.INT2:
-        (E, D_4) = th_weights.shape
+        E, D_4 = th_weights.shape
         D = D_4 * 4
         # pyre-fixme[53]: Captured variable `scale_shift` is not annotated.
@@ -215,7 +215,7 @@ def fake_quantize_embs(
         dequant_weights.copy_(to_device(comps, use_cpu))
     elif weight_ty == SparseType.INT8:
-        (E, D) = th_weights.shape
+        E, D = th_weights.shape
         comps = th_weights.to(torch.float32) * th_scale_shift[:, 0].reshape(-1, 1).to(
             torch.float32
         ) + th_scale_shift[:, 1].reshape(-1, 1).to(torch.float32)