fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py

@@ -9,7 +9,7 @@
  from __future__ import annotations

  import functools
- from typing import List, Optional, Union
+ from typing import Optional, Union

  import torch

@@ -191,7 +191,6 @@ class PartiallyMaterializedTensor:
          layout_str_mapping = {
              "SparseCsr": "sparse_csr",
              "Strided": "strided",
-             "SparseCsr": "sparse_csr",
              "SparseCsc": "sparse_csc",
              "Jagged": "jagged",
          }
@@ -249,7 +248,7 @@ class PartiallyMaterializedTensor:

          return torch.equal(tensor1.full_tensor(), tensor2.full_tensor())

-     def get_kvtensor_serializable_metadata(self) -> List[str]:
+     def get_kvtensor_serializable_metadata(self) -> list[str]:
          return self._wrapped.get_kvtensor_serializable_metadata()

      def __hash__(self):
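Note on the second hunk above: the removed line was a duplicate dict key. Python dict literals silently keep only the last occurrence of a repeated key, so dropping the extra "SparseCsr" entry is dead-code cleanup rather than a behavior change. A minimal illustration (not FBGEMM code):

    # Duplicate keys in a dict literal collapse silently; the last one wins.
    layout = {"SparseCsr": "a", "Strided": "b", "SparseCsr": "a"}
    assert len(layout) == 2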
fbgemm_gpu/tbe/stats/bench_params_reporter.py

@@ -8,31 +8,37 @@
  # pyre-strict

  import io
+ import json
  import logging
  import os
- from typing import List, Optional
+ from typing import List, Optional, Tuple

  import fbgemm_gpu  # noqa F401
- import numpy as np  # usort:skip
  import torch  # usort:skip

- from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
-     SplitTableBatchedEmbeddingBagsCodegen,
- )
- from fbgemm_gpu.tbe.bench import (
+ from fbgemm_gpu.tbe.bench.tbe_data_config import (
      BatchParams,
      IndicesParams,
      PoolingParams,
      TBEDataConfig,
  )

- # pyre-ignore[16]
+ open_source: bool = False
+ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
  open_source: bool = getattr(fbgemm_gpu, "open_source", False)

  if open_source:
      from fbgemm_gpu.utils import FileStore
+
  else:
-     from fbgemm_gpu.fb.utils import FileStore
+     try:
+         from fbgemm_gpu.fb.utils.manifold_wrapper import FileStore
+
+         torch.ops.load_library(
+             "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_estimator"
+         )
+     except Exception:
+         pass


  class TBEBenchmarkParamsReporter:
@@ -43,7 +49,8 @@ class TBEBenchmarkParamsReporter:
      def __init__(
          self,
          report_interval: int,
-         report_once: bool = False,
+         report_iter_start: int = 0,
+         report_iter_end: int = -1,
          bucket: Optional[str] = None,
          path_prefix: Optional[str] = None,
      ) -> None:
@@ -52,13 +59,31 @@

          Args:
              report_interval (int): The interval at which reports are generated.
-             report_once (bool, optional): If True, reporting occurs only once. Defaults to False.
+             report_iter_start (int): The start of the iteration range to capture. Defaults to 0.
+             report_iter_end (int): The end of the iteration range to capture. Defaults to -1 (last iteration).
              bucket (Optional[str], optional): The storage bucket for reports. Defaults to None.
              path_prefix (Optional[str], optional): The path prefix for report storage. Defaults to None.
          """
+
+         assert report_interval > 0, "report_interval must be greater than 0"
+         assert (
+             report_iter_start >= 0
+         ), "report_iter_start must be greater than or equal to 0"
+         assert (
+             report_iter_end >= -1
+         ), "report_iter_end must be greater than or equal to -1"
+         assert (
+             report_iter_end == -1 or report_iter_start <= report_iter_end
+         ), "report_iter_start must be less than or equal to report_iter_end"
+
          self.report_interval = report_interval
-         self.report_once = report_once
-         self.has_reported = False
+         self.report_iter_start = report_iter_start
+         self.report_iter_end = report_iter_end
+
+         if path_prefix is not None and path_prefix.endswith("/"):
+             path_prefix = path_prefix[:-1]
+
+         self.path_prefix = path_prefix

          default_bucket = "/tmp" if open_source else "tlparse_reports"
          bucket = (
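A minimal usage sketch of the reworked constructor; the argument values below are illustrative, not defaults:

    # Capture every 100th iteration between iterations 1000 and 5000 (inclusive).
    reporter = TBEBenchmarkParamsReporter(
        report_interval=100,
        report_iter_start=1000,
        report_iter_end=5000,
    )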
@@ -68,22 +93,83 @@
          )
          self.filestore = FileStore(bucket)

+         if self.path_prefix is not None and not self.filestore.exists(self.path_prefix):
+             self.filestore.create_directory(self.path_prefix)
+
          self.logger: logging.Logger = logging.getLogger(__name__)
          self.logger.setLevel(logging.INFO)

+     @classmethod
+     def create(cls) -> "TBEBenchmarkParamsReporter":
+         """
+         This method returns an instance of TBEBenchmarkParamsReporter based on environment variables.
+
+         If the `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` environment variable is set to a value greater than 0, it creates an instance that:
+         - Reports input parameters (TBEDataConfig).
+         - Writes the output as a JSON file.
+
+         Additionally, the following environment variables are considered:
+         - `FBGEMM_REPORT_INPUT_PARAMS_ITER_START`: Specifies the start of the iteration range to capture.
+         - `FBGEMM_REPORT_INPUT_PARAMS_ITER_END`: Specifies the end of the iteration range to capture.
+         - `FBGEMM_REPORT_INPUT_PARAMS_BUCKET`: Specifies the bucket for reporting.
+         - `FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX`: Specifies the path prefix for reporting.
+
+         Returns:
+             TBEBenchmarkParamsReporter: An instance configured based on the environment variables.
+         """
+         report_interval = int(
+             os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_INTERVAL", "1")
+         )
+         report_iter_start = int(
+             os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_START", "0")
+         )
+         report_iter_end = int(
+             os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_END", "-1")
+         )
+         bucket = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_BUCKET", "")
+         path_prefix = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX", "")
+
+         return cls(
+             report_interval=report_interval,
+             report_iter_start=report_iter_start,
+             report_iter_end=report_iter_end,
+             bucket=bucket,
+             path_prefix=path_prefix,
+         )
+
+     def extract_Ls(
+         self,
+         bag_sizes: List[int],
+         Bs: List[int],
+     ) -> List[float]:
+         Ls = []
+         start = 0
+         for b in Bs:
+             end = start + b
+             avg_L = sum(bag_sizes[start:end]) / b if b > 0 else 0
+             start = end
+             Ls.append(avg_L)
+         return Ls
+
      def extract_params(
          self,
-         embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
+         feature_rows: torch.Tensor,
+         feature_dims: torch.Tensor,
          indices: torch.Tensor,
          offsets: torch.Tensor,
          per_sample_weights: Optional[torch.Tensor] = None,
          batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+         Es: Optional[List[int]] = None,
+         Ds: Optional[List[int]] = None,
+         embedding_specs: Optional[List[Tuple[int, int]]] = None,
+         feature_table_map: Optional[List[int]] = None,
      ) -> TBEDataConfig:
          """
-         Extracts parameters from the embedding operation, input indices and offsets to create a TBEDataConfig.
+         Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.

          Args:
-             embedding_op (SplitTableBatchedEmbeddingBagsCodegen): The embedding operation.
+             feature_rows (torch.Tensor): Number of rows in each feature.
+             feature_dims (torch.Tensor): Number of dimensions in each feature.
              indices (torch.Tensor): The input indices tensor.
              offsets (torch.Tensor): The input offsets tensor.
              per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
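A hedged sketch of the new create() factory driven by the environment variables named in its docstring, plus what extract_Ls computes; all values are illustrative:

    import os

    # Report every 50th iteration from iteration 200 onward (-1 = no upper bound).
    os.environ["FBGEMM_REPORT_INPUT_PARAMS_INTERVAL"] = "50"
    os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_START"] = "200"
    os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_END"] = "-1"
    reporter = TBEBenchmarkParamsReporter.create()

    # extract_Ls averages bag sizes per feature: with Bs=[2, 3], the first two
    # bag sizes belong to feature 0 and the remaining three to feature 1.
    reporter.extract_Ls(bag_sizes=[4, 6, 1, 2, 3], Bs=[2, 3])  # -> [5.0, 2.0]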
@@ -92,24 +178,37 @@
          Returns:
              TBEDataConfig: The configuration data for TBE benchmarking.
          """
+
+         Es = feature_rows.tolist()
+         Ds = feature_dims.tolist()
+
+         assert len(Es) == len(
+             Ds
+         ), "feature_rows and feature_dims must have the same length"
+
          # Transfer indices back to CPU for EEG analysis
          indices_cpu = indices.cpu()

-         # Extract embedding table specs
-         embedding_specs = [
-             embedding_op.embedding_specs[t] for t in embedding_op.feature_table_map
-         ]
-         rowcounts = [embedding_spec[0] for embedding_spec in embedding_specs]
-         dims = [embedding_spec[1] for embedding_spec in embedding_specs]
-
          # Set T to be the number of features we are looking at
-         T = len(embedding_op.feature_table_map)
+         T = len(Ds)
          # Set E to be the mean of the rowcounts to avoid biasing
-         E = rowcounts[0] if len(set(rowcounts)) == 1 else np.ceil((np.mean(rowcounts)))
+         E = (
+             Es[0]
+             if len(set(Es)) == 1
+             else torch.ceil(
+                 torch.mean(torch.tensor(feature_rows, dtype=torch.float))
+             ).item()
+         )
          # Set mixed_dim to be True if there are multiple dims
-         mixed_dim = len(set(dims)) > 1
+         mixed_dim = len(set(Ds)) > 1
          # Set D to be the mean of the dims to avoid biasing
-         D = dims[0] if not mixed_dim else np.ceil((np.mean(dims)))
+         D = (
+             Ds[0]
+             if not mixed_dim
+             else torch.ceil(
+                 torch.mean(torch.tensor(feature_dims, dtype=torch.float))
+             ).item()
+         )

          # Compute indices distribution parameters
          heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(
@@ -120,11 +219,27 @@
          )

          # Compute batch parameters
+         B = int((offsets.numel() - 1) // T)
+         Bs = (
+             [sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
+             if batch_size_per_feature_per_rank
+             else [B] * T
+         )
          batch_params = BatchParams(
-             B=((offsets.numel() - 1) // T),
+             B=B,
              sigma_B=(
-                 np.ceil(
-                     np.std([b for bs in batch_size_per_feature_per_rank for b in bs])
+                 int(
+                     torch.ceil(
+                         torch.std(
+                             torch.tensor(
+                                 [
+                                     b
+                                     for bs in batch_size_per_feature_per_rank
+                                     for b in bs
+                                 ]
+                             ).float()
+                         )
+                     )
                  )
                  if batch_size_per_feature_per_rank
                  else None
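For reference, the new Bs value collapses the per-rank batch sizes into one total per feature. A worked example with illustrative numbers:

    # Two features, each split across two ranks.
    batch_size_per_feature_per_rank = [[2, 3], [4, 1]]
    Bs = [sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
    # Bs == [5, 5]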
@@ -135,15 +250,28 @@
                  if batch_size_per_feature_per_rank
                  else None
              ),
+             Bs=Bs,
          )

          # Compute pooling parameters
-         bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
+         bag_sizes = offsets[1:] - offsets[:-1]
+         if batch_size_per_feature_per_rank is None:
+             _B = int(bag_sizes.numel() // T)
+             assert _B == Bs[0], f"Expected constant batch size {Bs[0]} but got {_B}"
          mixed_bag_sizes = len(set(bag_sizes)) > 1
          pooling_params = PoolingParams(
-             L=np.ceil(np.mean(bag_sizes)) if mixed_bag_sizes else bag_sizes[0],
-             sigma_L=(np.ceil(np.std(bag_sizes)) if mixed_bag_sizes else None),
+             L=(
+                 int(torch.ceil(torch.mean(bag_sizes.float())))
+                 if mixed_bag_sizes
+                 else int(bag_sizes[0])
+             ),
+             sigma_L=(
+                 int(torch.ceil(torch.std(bag_sizes.float())))
+                 if mixed_bag_sizes
+                 else None
+             ),
              length_distribution=("normal" if mixed_bag_sizes else None),
+             Ls=self.extract_Ls(bag_sizes.tolist(), Bs),
          )

          return TBEDataConfig(
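The numpy-to-torch migration preserves the statistics; note that torch.std computes the unbiased sample standard deviation by default. A quick check of the mixed-bag-size branch:

    import torch

    bag_sizes = torch.tensor([2, 4, 6, 8])
    L = int(torch.ceil(torch.mean(bag_sizes.float())))       # ceil(5.0)   -> 5
    sigma_L = int(torch.ceil(torch.std(bag_sizes.float())))  # ceil(2.582) -> 3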
@@ -156,38 +284,66 @@
              indices_params=indices_params,
              pooling_params=pooling_params,
              use_cpu=(not torch.cuda.is_available()),
+             Es=Es,
+             Ds=Ds,
+             embedding_specs=embedding_specs,
+             feature_table_map=feature_table_map,
          )

      def report_stats(
          self,
-         embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
+         feature_rows: torch.Tensor,
+         feature_dims: torch.Tensor,
+         iteration: int,
          indices: torch.Tensor,
          offsets: torch.Tensor,
+         op_id: str = "",
          per_sample_weights: Optional[torch.Tensor] = None,
          batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+         embedding_specs: Optional[List[Tuple[int, int]]] = None,
+         feature_table_map: Optional[List[int]] = None,
      ) -> None:
          """
-         Reports the configuration of the embedding operation and input data then writes the TBE configuration to the filestore.
+         Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.

          Args:
-             embedding_op (SplitTableBatchedEmbeddingBagsCodegen): The embedding operation.
+             feature_rows (torch.Tensor): Number of rows in each feature.
+             feature_dims (torch.Tensor): Number of dimensions in each feature.
+             iteration (int): The current iteration number.
              indices (torch.Tensor): The input indices tensor.
              offsets (torch.Tensor): The input offsets tensor.
+             op_id (str, optional): The operation identifier. Defaults to an empty string.
              per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
              batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
+             embedding_specs (Optional[List[Tuple[int, int]]]): Embedding specs. Defaults to None.
+             feature_table_map (Optional[List[int]], optional): Feature table map. Defaults to None.
          """
-         if embedding_op.iter.item() % self.report_interval == 0 and (
-             not self.report_once or (self.report_once and not self.has_reported)
+         if (
+             (iteration - self.report_iter_start) % self.report_interval == 0
+             and (iteration >= self.report_iter_start)
+             and (self.report_iter_end == -1 or iteration <= self.report_iter_end)
          ):
+             # If indices tensor is empty (indices.numel() == 0), skip reporting
+             # TODO: Remove this once we have a better way to handle empty indices tensors
+             if indices.numel() == 0:
+                 return
+
              # Extract TBE config
              config = self.extract_params(
-                 embedding_op, indices, offsets, per_sample_weights
+                 feature_rows=feature_rows,
+                 feature_dims=feature_dims,
+                 indices=indices,
+                 offsets=offsets,
+                 per_sample_weights=per_sample_weights,
+                 batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                 Es=feature_rows.tolist(),
+                 Ds=feature_dims.tolist(),
+                 embedding_specs=embedding_specs,
+                 feature_table_map=feature_table_map,
              )

              # Write the TBE config to FileStore
              self.filestore.write(
-                 f"tbe-{embedding_op.uuid}-config-estimation-{embedding_op.iter.item()}.json",
-                 io.BytesIO(config.json(format=True).encode()),
+                 f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
+                 io.BytesIO(json.dumps(config.dict(), indent=2).encode()),
              )
-
-         self.has_reported = True
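The report_once/has_reported flags are replaced by a gate that is a pure function of the iteration number. A standalone sketch of the new condition (the helper name is hypothetical):

    def should_report(iteration: int, interval: int, start: int, end: int) -> bool:
        # Mirrors the condition in report_stats: aligned to `start`, every
        # `interval` iterations, up to `end` (-1 means no upper bound).
        return (
            (iteration - start) % interval == 0
            and iteration >= start
            and (end == -1 or iteration <= end)
        )

    # interval=100, start=200, end=-1 reports iterations 200, 300, 400, ...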
fbgemm_gpu/tbe/utils/offsets.py

@@ -6,7 +6,7 @@

  # pyre-strict

- from typing import Callable, Optional, Tuple
+ from typing import Callable, Optional

  import numpy as np
  import torch
@@ -21,9 +21,9 @@ def get_table_batched_offsets_from_dense(
      L: Optional[int] = None,
      total_B: Optional[int] = None,
      use_cpu: bool = False,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
      if L is None and total_B is None:
-         (T, B, L) = merged_indices.size()
+         T, B, L = merged_indices.size()
          total_B = T * B
      # pyre-fixme[6]: For 1st argument expected `Union[Sequence[SupportsIndex],
      #  SupportsIndex]` but got `Optional[int]`.
@@ -37,8 +37,8 @@
      )


- def get_offsets_from_dense(indices: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-     (B, L) = indices.size()
+ def get_offsets_from_dense(indices: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+     B, L = indices.size()
      return (
          indices.contiguous().view(-1),
          torch.tensor(
@@ -54,7 +54,7 @@ def b_indices(
      use_cpu: bool = False,
      do_pooling: bool = True,
  ) -> torch.Tensor:
-     (indices, offsets) = get_offsets_from_dense(x)
+     indices, offsets = get_offsets_from_dense(x)
      if do_pooling:
          return b(
              to_device(indices, use_cpu),
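These hunks (and the similar ones in the files below) replace typing.Tuple/typing.List with the PEP 585 builtin generics available since Python 3.9, e.g.:

    import torch

    # Before: def shape_of(t: torch.Tensor) -> Tuple[int, ...]
    def shape_of(t: torch.Tensor) -> tuple[int, ...]:
        return tuple(t.shape)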
fbgemm_gpu/tbe/utils/quantize.py

@@ -7,7 +7,7 @@
  # pyre-strict
  # pyre-ignore-all-errors[61]

- from typing import Optional, Tuple
+ from typing import Optional

  import torch

@@ -22,7 +22,7 @@ def quantize_embs(
      weight: torch.Tensor,
      weight_ty: SparseType,
      fp8_config: Optional[FP8QuantizationConfig] = None,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
      weight = weight.detach()
      if weight_ty == SparseType.FP32:
          q_weight = weight.float()
@@ -91,7 +91,7 @@ def dequantize_embs(
      th_scale_shift: torch.Tensor = scale_shift.view(torch.float16).to(torch.float32)

      if weight_ty == SparseType.INT4:
-         (E, D_2) = th_weights.shape
+         E, D_2 = th_weights.shape
          D = D_2 * 2

          def comp(i: int) -> torch.Tensor:
@@ -109,7 +109,7 @@
          return to_device(torch.tensor(comps), use_cpu)

      elif weight_ty == SparseType.INT2:
-         (E, D_4) = th_weights.shape
+         E, D_4 = th_weights.shape
          D = D_4 * 4

          # pyre-fixme[53]: Captured variable `scale_shift` is not annotated.
@@ -129,7 +129,7 @@
          return to_device(torch.tensor(comps), use_cpu)

      elif weight_ty == SparseType.INT8:
-         (E, D) = th_weights.shape
+         E, D = th_weights.shape
          comps = th_weights.to(torch.float32) * th_scale_shift[:, 0].reshape(-1, 1).to(
              torch.float32
          ) + th_scale_shift[:, 1].reshape(-1, 1).to(torch.float32)
@@ -177,7 +177,7 @@ def fake_quantize_embs(
      )

      if weight_ty == SparseType.INT4:
-         (E, D_2) = th_weights.shape
+         E, D_2 = th_weights.shape
          D = D_2 * 2

          def comp(i: int) -> torch.Tensor:
@@ -195,7 +195,7 @@
          dequant_weights.copy_(to_device(comps, use_cpu))

      elif weight_ty == SparseType.INT2:
-         (E, D_4) = th_weights.shape
+         E, D_4 = th_weights.shape
          D = D_4 * 4

          # pyre-fixme[53]: Captured variable `scale_shift` is not annotated.
@@ -215,7 +215,7 @@
          dequant_weights.copy_(to_device(comps, use_cpu))

      elif weight_ty == SparseType.INT8:
-         (E, D) = th_weights.shape
+         E, D = th_weights.shape
          comps = th_weights.to(torch.float32) * th_scale_shift[:, 0].reshape(-1, 1).to(
              torch.float32
          ) + th_scale_shift[:, 1].reshape(-1, 1).to(torch.float32)
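For context on the row-wise scale/shift pattern these hunks touch, a minimal INT8 dequantization sketch; shapes and values are illustrative, not FBGEMM's API:

    import torch

    E, D = 4, 8
    th_weights = torch.randint(0, 256, (E, D), dtype=torch.uint8)
    th_scale_shift = torch.rand(E, 2)  # per-row (scale, shift) pairs

    # Each row dequantizes as weight * scale + shift, broadcast across D columns.
    comps = (
        th_weights.to(torch.float32) * th_scale_shift[:, 0].reshape(-1, 1)
        + th_scale_shift[:, 1].reshape(-1, 1)
    )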
fbgemm_gpu/tbe/utils/requests.py

@@ -8,7 +8,7 @@

  import logging
  from dataclasses import dataclass
- from typing import List, Optional, Tuple
+ from typing import Optional

  import numpy as np
  import numpy.typing as npt
@@ -32,20 +32,20 @@ class TBERequest:
      indices: torch.Tensor
      offsets: torch.Tensor
      per_sample_weights: Optional[torch.Tensor] = None
-     Bs_per_feature_per_rank: Optional[List[List[int]]] = None
+     Bs_per_feature_per_rank: Optional[list[list[int]]] = None

-     def unpack_2(self) -> Tuple[torch.Tensor, torch.Tensor]:
+     def unpack_2(self) -> tuple[torch.Tensor, torch.Tensor]:
          return (self.indices, self.offsets)

      def unpack_3(
          self,
-     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+     ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
          return (self.indices, self.offsets, self.per_sample_weights)

      def unpack_4(
          self,
-     ) -> Tuple[
-         torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[List[List[int]]]
+     ) -> tuple[
+         torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]
      ]:
          return (
              self.indices,
@@ -68,7 +68,7 @@ def generate_requests_from_data_file(
      tables: Optional[str] = None,
      index_dtype: Optional[torch.dtype] = None,
      offset_dtype: Optional[torch.dtype] = None,
- ) -> List[TBERequest]:
+ ) -> list[TBERequest]:
      """
      Generate TBE requests from the input data file. If `requests_data_file` is provided,
      `indices_file` and `offsets_file` should not be provided. If either `indices_file`
@@ -178,12 +178,12 @@ def generate_int_data_from_stats(

  def generate_pooling_factors_from_stats(
      iters: int,
-     Bs: List[int],
+     Bs: list[int],
      L: int,
      sigma_L: int,
      # distribution of pooling factors
      length_dist: str,
- ) -> Tuple[int, torch.Tensor]:
+ ) -> tuple[int, torch.Tensor]:
      """
      Generate pooling factors for the TBE requests from the given stats
      """
@@ -211,7 +211,7 @@ def generate_batch_sizes_from_stats(
      vbe_num_ranks: int,
      # Distribution of batch sizes
      batch_size_dist: str,
- ) -> Tuple[List[int], List[List[int]]]:
+ ) -> tuple[list[int], list[list[int]]]:
      """
      Generate batch sizes for features from the given stats
      """
@@ -234,7 +234,7 @@

  def generate_indices_uniform(
      iters: int,
-     Bs: List[int],
+     Bs: list[int],
      L: int,
      E: int,
      use_variable_L: bool,
@@ -252,7 +252,7 @@
          dtype=torch.int32,
      )
      # each bag is usually sorted
-     (indices, _) = torch.sort(indices)
+     indices, _ = torch.sort(indices)
      if use_variable_L:
          # 1D layout, where row offsets are determined by L_offsets
          indices = torch.ops.fbgemm.bottom_k_per_row(
@@ -267,7 +267,7 @@

  def generate_indices_zipf(
      iters: int,
-     Bs: List[int],
+     Bs: list[int],
      L: int,
      E: int,
      alpha: float,
@@ -324,7 +324,7 @@

  def update_indices_with_random_reuse(
      iters: int,
-     Bs: List[int],
+     Bs: list[int],
      L: int,
      reuse: float,
      indices: torch.Tensor,
@@ -411,7 +411,7 @@ def generate_requests(  # noqa C901
      vbe_num_ranks: Optional[int] = None,
      index_dtype: Optional[torch.dtype] = None,
      offset_dtype: Optional[torch.dtype] = None,
- ) -> List[TBERequest]:
+ ) -> list[TBERequest]:
      # TODO: refactor and split into helper functions to separate load from file,
      # generate from distribution, and other future methods of generating data
      if (
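Usage of the TBERequest unpack helpers is unchanged by the annotation updates; a small sketch with illustrative tensors:

    import torch

    req = TBERequest(
        indices=torch.tensor([1, 5, 2]),
        offsets=torch.tensor([0, 2, 3]),
    )
    indices, offsets = req.unpack_2()
    indices, offsets, per_sample_weights = req.unpack_3()  # weights is None here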
fbgemm_gpu/tbe_input_multiplexer.py

@@ -8,9 +8,8 @@
  # pyre-unsafe

  import abc
-
  from dataclasses import dataclass
- from typing import List, Optional
+ from typing import Optional

  from torch import Tensor

@@ -32,15 +31,15 @@ class TBEInfo:
          col_offset: the shard offset of the current rank on column (dim)
      """

-     table_names: List[str]
-     table_heights: List[int]
+     table_names: list[str]
+     table_heights: list[int]
      tbe_uuid: str
-     feature_table_map: List[int]
-     table_dims: List[int]
-     full_table_heights: List[int]
-     full_table_dims: List[int]
-     row_offset: List[int]
-     col_offset: List[int]
+     feature_table_map: list[int]
+     table_dims: list[int]
+     full_table_heights: list[int]
+     full_table_dims: list[int]
+     row_offset: list[int]
+     col_offset: list[int]


  @dataclass(frozen=True)
@@ -55,7 +54,7 @@ class TBEInputInfo:

      indices: Tensor
      offsets: Tensor
-     batch_size_per_feature_per_rank: Optional[List[List[int]]] = None
+     batch_size_per_feature_per_rank: Optional[list[list[int]]] = None


  class TBEInputMultiplexer(abc.ABC):
fbgemm_gpu/triton/common.py

@@ -10,7 +10,6 @@ from enum import IntEnum

  import torch

-
  # We keep LUTs persistent to minimize the number of device copies required.
  E2M1_LUT = torch.tensor(
      [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6],
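The 16 LUT entries are exactly the values representable in the FP4 E2M1 format (1 sign bit, 2 exponent bits, 1 mantissa bit), so dequantization reduces to a table lookup by the 4-bit code. A minimal sketch:

    import torch

    E2M1_LUT = torch.tensor(
        [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6]
    )
    codes = torch.tensor([0, 5, 9, 15])  # unpacked 4-bit codes
    values = E2M1_LUT[codes]             # tensor([ 0.0000, 3.0000, -0.5000, -6.0000])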