fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py
@@ -0,0 +1,385 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ # pyre-ignore-all-errors[56]
+
+
+ from typing import Optional, Union
+
+ import torch  # usort:skip
+ from torch import Tensor  # usort:skip
+ from fbgemm_gpu.split_embedding_configs import SparseType
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+     BoundsCheckMode,
+     CacheAlgorithm,
+     DEFAULT_SCALE_BIAS_SIZE_IN_BYTES,
+     EmbeddingLocation,
+     PoolingMode,
+     RecordCacheMetrics,
+ )
+ from fbgemm_gpu.split_table_batched_embeddings_ops_inference import (
+     inputs_to_device,
+     IntNBitTableBatchedEmbeddingBagsCodegen,
+     random_quant_scaled_tensor,
+     rounded_row_size_in_bytes,
+ )
+ from fbgemm_gpu.utils.loader import load_torch_module
+
+ try:
+     load_torch_module(
+         "//deeplearning/fbgemm/fbgemm_gpu:dram_kv_embedding_inference",
+     )
+ except Exception:
+     pass
+
+
+ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
+     """
+     KV Table-batched version of nn.EmbeddingBag(sparse=False)
+     Inference version, with support for FP32/FP16/FP8/INT8/INT4/INT2 weights
+     """
+
+     def __init__(  # noqa C901
+         self,
+         embedding_specs: list[
+             tuple[str, int, int, SparseType, EmbeddingLocation]
+         ],  # tuple of (feature_names, rows, dims, SparseType, EmbeddingLocation/placement)
+         feature_table_map: Optional[list[int]] = None,  # [T]
+         index_remapping: Optional[list[Tensor]] = None,
+         pooling_mode: PoolingMode = PoolingMode.SUM,
+         device: Optional[Union[str, int, torch.device]] = None,
+         bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
+         weight_lists: Optional[list[tuple[Tensor, Optional[Tensor]]]] = None,
+         pruning_hash_load_factor: float = 0.5,
+         use_array_for_index_remapping: bool = True,
+         output_dtype: SparseType = SparseType.FP16,
+         cache_algorithm: CacheAlgorithm = CacheAlgorithm.LRU,
+         cache_load_factor: float = 0.2,
+         cache_sets: int = 0,
+         cache_reserved_memory: float = 0.0,
+         enforce_hbm: bool = False,  # place all weights/momentums in HBM when using cache
+         record_cache_metrics: Optional[RecordCacheMetrics] = None,
+         gather_uvm_cache_stats: Optional[bool] = False,
+         row_alignment: Optional[int] = None,
+         fp8_exponent_bits: Optional[int] = None,
+         fp8_exponent_bias: Optional[int] = None,
+         cache_assoc: int = 32,
+         scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES,
+         cacheline_alignment: bool = True,
+         uvm_host_mapped: bool = False,  # True to use cudaHostAlloc; False to use cudaMallocManaged.
+         reverse_qparam: bool = False,  # True to load qparams at end of each row; False to load qparam at beginning of each row.
+         feature_names_per_table: Optional[list[list[str]]] = None,
+         indices_dtype: torch.dtype = torch.int32,  # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+         embedding_cache_mode: bool = False,  # True for zero initialization, False for randomized initialization
+     ) -> None:  # noqa C901  # tuple of (rows, dims,)
+         super(KVEmbeddingInference, self).__init__(
+             embedding_specs=embedding_specs,
+             feature_table_map=feature_table_map,
+             index_remapping=index_remapping,
+             pooling_mode=pooling_mode,
+             device=device,
+             bounds_check_mode=bounds_check_mode,
+             weight_lists=weight_lists,
+             pruning_hash_load_factor=pruning_hash_load_factor,
+             use_array_for_index_remapping=use_array_for_index_remapping,
+             output_dtype=output_dtype,
+             cache_algorithm=cache_algorithm,
+             cache_load_factor=cache_load_factor,
+             cache_sets=cache_sets,
+             cache_reserved_memory=cache_reserved_memory,
+             enforce_hbm=enforce_hbm,
+             record_cache_metrics=record_cache_metrics,
+             gather_uvm_cache_stats=gather_uvm_cache_stats,
+             row_alignment=row_alignment,
+             fp8_exponent_bits=fp8_exponent_bits,
+             fp8_exponent_bias=fp8_exponent_bias,
+             cache_assoc=cache_assoc,
+             scale_bias_size_in_bytes=scale_bias_size_in_bytes,
+             cacheline_alignment=cacheline_alignment,
+             uvm_host_mapped=uvm_host_mapped,
+             reverse_qparam=reverse_qparam,
+             feature_names_per_table=feature_names_per_table,
+             indices_dtype=indices_dtype,
+         )
+         self.register_buffer(
+             "weights_ids",
+             torch.tensor(0, device=self.current_device, dtype=torch.int64),
+         )
+
+         num_shards = 32
+         uniform_init_lower: float = -0.01
+         uniform_init_upper: float = 0.01
+
+         # pyre-fixme[4]: Attribute must be annotated.
+         self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
+             num_shards,
+             uniform_init_lower,
+             uniform_init_upper,
+             embedding_cache_mode,  # in embedding_cache_mode, we disable random init
+         )
+
+         self.specs: list[tuple[int, int, int]] = [
+             (rows, dims, sparse_type.as_int())
+             for (_, rows, dims, sparse_type, _) in self.embedding_specs
+         ]
+         # table shard offset if inference sharding is enabled, otherwise, should be all zeros
+         self.table_sharding_offset: list[int] = [0] * len(self.embedding_specs)
+         self.kv_embedding_cache_initialized = False
+         self.hash_size_cumsum: torch.Tensor = torch.zeros(
+             0,
+             device=self.current_device,
+             dtype=torch.int64,
+         )
+         self.feature_hash_size_cumsum: torch.Tensor = torch.zeros(
+             0,
+             device=self.current_device,
+             dtype=torch.int64,
+         )
+
+     def construct_hash_size_cumsum(self) -> list[int]:
+         hash_size_cumsum = [0]
+         for spec in self.embedding_specs:
+             rows = spec[1]
+             hash_size_cumsum.append(hash_size_cumsum[-1] + rows)
+         return hash_size_cumsum
+
+     def calculate_indices_and_weights_offsets(
+         self, indices: Tensor, offsets: Tensor
+     ) -> tuple[Tensor, Tensor]:
+         if self.pooling_mode is not PoolingMode.NONE:
+             T = self.weights_offsets.numel()
+         else:
+             T = self.D_offsets.numel() - 1
+         B = int((offsets.size(0) - 1) / T)
+
+         total_bytes_added = 0
+         new_indices = torch.tensor(
+             [0] * indices.size(0), device=self.current_device, dtype=indices.dtype
+         )
+         new_weights_offsets = torch.tensor(
+             [0] * T, device=self.current_device, dtype=self.weights_offsets.dtype
+         )
+         for t in range(T):
+             new_weights_offsets[t] = total_bytes_added
+             start, end = int(offsets[t * B]), int(offsets[(t + 1) * B])
+             index_size = end - start
+             new_indices[start:end] = torch.arange(index_size)
+             table_id = self.feature_table_map[t]
+             total_bytes_added += index_size * rounded_row_size_in_bytes(
+                 self.embedding_specs[table_id][2],  # dim
+                 self.embedding_specs[table_id][3],  # weight_ty
+                 self.row_alignment,
+                 self.scale_bias_size_in_bytes,
+             )
+         return new_indices, new_weights_offsets
+
+     def linearize_cache_indices(
+         self,
+         indices: torch.Tensor,
+         offsets: torch.Tensor,
+     ) -> torch.Tensor:
+         """
+         Linearize cache indices for KV cache.
+         """
+         linearized_indices = torch.zeros(
+             indices.numel(),
+             device=indices.device,
+             dtype=torch.int64,
+         )
+
+         T = self.feature_hash_size_cumsum.numel() - 1
+         B = int((offsets.size(0) - 1) / T)
+
+         for t in range(T):
+             start, end = int(offsets[t * B]), int(offsets[(t + 1) * B])
+             linearized_indices[start:end] = (
+                 indices[start:end] + self.feature_hash_size_cumsum[t]
+             )
+
+         return linearized_indices
+
+     def forward(
+         self,
+         indices: Tensor,
+         offsets: Tensor,
+         per_sample_weights: Optional[Tensor] = None,
+     ) -> Tensor:
+         assert (
+             self.weight_initialized
+         ), "weight needs to be initialized before forward function"
+
+         indices, offsets, per_sample_weights = inputs_to_device(
+             indices, offsets, per_sample_weights, self.bounds_check_warning
+         )
+
+         lxu_cache_locations = self.lxu_cache_locations_list.pop()
+
+         weights_offsets = self.weights_offsets
+         weights = self.weights_host if self.host_size > 0 else self.weights_dev
+
+         if self.kv_embedding_cache_initialized:
+             indices = self.linearize_cache_indices(
+                 indices,
+                 offsets,
+             )
+
+             weights = self.kv_embedding_cache.get_embeddings(indices)
+
+             indices, weights_offsets = self.calculate_indices_and_weights_offsets(
+                 indices, offsets
+             )
+
+         return torch.ops.fbgemm.int_nbit_split_embedding_codegen_lookup_function(
+             dev_weights=weights,
+             uvm_weights=self.weights_uvm,
+             weights_placements=self.weights_placements,
+             weights_offsets=weights_offsets,
+             weights_tys=self.weights_tys,
+             D_offsets=self.D_offsets,
+             total_D=self.total_D,
+             max_int2_D=self.max_int2_D,
+             max_int4_D=self.max_int4_D,
+             max_int8_D=self.max_int8_D,
+             max_float16_D=self.max_float16_D,
+             max_float32_D=self.max_float32_D,
+             indices=indices,
+             offsets=offsets,
+             pooling_mode=int(self.pooling_mode),
+             indice_weights=per_sample_weights,
+             output_dtype=self.output_dtype,
+             lxu_cache_weights=self.lxu_cache_weights,
+             lxu_cache_locations=lxu_cache_locations,
+             row_alignment=self.row_alignment,
+             max_float8_D=self.max_float8_D,
+             fp8_exponent_bits=self.fp8_exponent_bits,
+             fp8_exponent_bias=self.fp8_exponent_bias,
+         )
+
+     def fill_random_weights(self) -> None:
+         """
+         Fill the buffer with random weights, table by table
+         """
+         self.initialize_kv_embedding_cache()
+         for i, (_, num_embeddings, embedding_dim, weight_ty, _) in enumerate(
+             self.embedding_specs
+         ):
+             embedding_dim = rounded_row_size_in_bytes(
+                 embedding_dim, weight_ty, self.row_alignment
+             )
+             indices = torch.range(0, num_embeddings - 1, dtype=torch.int64)
+             weights = random_quant_scaled_tensor(
+                 shape=torch.Size([num_embeddings, embedding_dim]),
+                 device=self.current_device,
+             )
+             self.embedding_inplace_update_per_table(
+                 i,
+                 indices,
+                 weights,
+             )
+         self.weight_initialized = True
+
+     @torch.jit.export
+     def init_tbe_config(self, table_sharding_offset: list[int]) -> None:
+         """
+         Initialize the dynamic TBE table configs, e.g. sharded table offsets, etc.
+         Should be called before loading weights.
+         """
+         self.table_sharding_offset = table_sharding_offset
+
+     @torch.jit.export
+     def embedding_inplace_update(
+         self,
+         update_table_indices: list[int],
+         update_row_indices: list[list[int]],
+         update_weights: list[Tensor],
+     ) -> None:
+         # function is not used for now on the inference side
+         for i in range(len(update_table_indices)):
+             self.embedding_inplace_update_per_table(
+                 update_table_indices[i],
+                 torch.tensor(
+                     update_row_indices[i], device=self.current_device, dtype=torch.int64
+                 ),
+                 update_weights[i],
+                 None,
+             )
+
+     @torch.jit.export
+     def embedding_inplace_update_per_table(
+         self,
+         table_id: int,
+         update_row_indices: Tensor,
+         update_weights: Tensor,
+         inplace_update_ts_sec: Optional[int] = None,
+     ) -> None:
+         assert table_id < len(
+             self.embedding_specs
+         ), f"table index {table_id} is out of range {len(self.embedding_specs)}"
+         # pyre-ignore [29]
+         table_offset = self.hash_size_cumsum[table_id]
+         sharding_offset = self.table_sharding_offset[table_id]
+
+         row_size = update_row_indices.numel()
+         if row_size == 0:
+             return
+
+         # convert global weight index to fused local weight index
+         row_indices = update_row_indices + table_offset - sharding_offset
+         # set weight by id
+         self.kv_embedding_cache.set_embeddings(
+             row_indices, update_weights, inplace_update_ts_sec
+         )
+
+     @torch.jit.export
+     def log_inplace_update_stats(
+         self,
+     ) -> None:
+         self.kv_embedding_cache.log_inplace_update_stats()
+
+     @torch.jit.export
+     def embedding_trigger_evict(
+         self,
+         inplace_update_ts_sec: int,
+     ) -> None:
+         self.kv_embedding_cache.trigger_evict(inplace_update_ts_sec)
+
+     @torch.jit.export
+     def embedding_wait_evict_completion(
+         self,
+     ) -> None:
+         self.kv_embedding_cache.wait_evict_completion()
+
+     @torch.jit.export
+     def initialize_kv_embedding_cache(self) -> None:
+         if not self.kv_embedding_cache_initialized:
+             self.initialize_logical_weights_placements_and_offsets()
+
+             self.row_alignment = 8  # in order to use mempool implementation for kv embedding it needs to be divisible by 8
+
+             hash_size_cumsum = self.construct_hash_size_cumsum()
+             self.hash_size_cumsum = torch.tensor(
+                 hash_size_cumsum,
+                 dtype=torch.int64,
+                 device=self.current_device,
+             )
+
+             self.feature_hash_size_cumsum = torch.tensor(
+                 [hash_size_cumsum[t] for t in self.feature_table_map]
+                 + [hash_size_cumsum[-1]],
+                 dtype=torch.int64,
+                 device=self.current_device,
+             )
+
+             self.kv_embedding_cache.init(
+                 self.specs,
+                 self.row_alignment,
+                 self.scale_bias_size_in_bytes,
+                 self.hash_size_cumsum,
+             )
+             self.kv_embedding_cache_initialized = True
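The class above wires the DRAM-backed KV cache into the existing int-nbit TBE inference path. A minimal usage sketch follows. It is illustrative only: the table specs, batch layout, and device are invented here, and it assumes the optional dram_kv_embedding_inference extension loaded successfully in the try/except above.

    import torch

    from fbgemm_gpu.split_embedding_configs import SparseType
    from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
        EmbeddingLocation,
        PoolingMode,
    )
    from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

    # Two hypothetical tables: (feature_name, rows, dim, weight type, placement).
    tbe = KVEmbeddingInference(
        embedding_specs=[
            ("table_0", 1000, 64, SparseType.INT8, EmbeddingLocation.HOST),
            ("table_1", 2000, 32, SparseType.INT4, EmbeddingLocation.HOST),
        ],
        pooling_mode=PoolingMode.SUM,
        device="cpu",
    )
    # Fills the DRAM KV cache with random quantized rows and marks weights initialized.
    tbe.fill_random_weights()

    # Batched lookup: offsets has T * B + 1 entries (here T = 2 features, B = 1 sample).
    indices = torch.tensor([1, 5, 7, 3], dtype=torch.int32)
    offsets = torch.tensor([0, 2, 4], dtype=torch.int32)
    pooled = tbe(indices, offsets)  # shape [B, 64 + 32] with SUM pooling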
fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py
@@ -6,7 +6,7 @@
  
  # pyre-unsafe
  
- from typing import Optional, Tuple, Union
+ from typing import Optional, Union
  
  import torch
  
@@ -17,13 +17,13 @@ def get_unique_indices_v2(
      compute_count: bool = False,
      compute_inverse_indices: bool = False,
  ) -> Union[
-     Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
-     Tuple[
+     tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
+     tuple[
          torch.Tensor,
          torch.Tensor,
          Optional[torch.Tensor],
-         Tuple[torch.Tensor, torch.Tensor],
      ],
+     tuple[torch.Tensor, torch.Tensor],
  ]:
      """
      A wrapper for get_unique_indices for overloading the return type
@@ -43,7 +43,6 @@ def get_unique_indices_v2(
          return ret[:-1]
      if compute_inverse_indices:
          # Return (unique_indices, length, inverse_indices)
-         # pyre-fixme[7]: The arity of this return is wrong (3 vs 4)
          return ret[0], ret[1], ret[3]
      # Return (unique_indices, length)
      return ret[:-2]
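The hunks above modernize the overload annotation: depending on the compute_count and compute_inverse_indices flags the wrapper returns a 2-, 3-, or 4-tuple. A caller-side sketch of that, with placeholder arguments since the leading positional parameters of get_unique_indices_v2 are not visible in these hunks:

    # linear_indices and total_hash_size are hypothetical placeholders for the
    # arguments not shown in this diff; the unpacking arities follow the return
    # annotation and comments above.
    unique, lengths = get_unique_indices_v2(linear_indices, total_hash_size)
    unique, lengths, counts = get_unique_indices_v2(
        linear_indices, total_hash_size, compute_count=True
    )
    unique, lengths, inverse = get_unique_indices_v2(
        linear_indices, total_hash_size, compute_inverse_indices=True
    )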
fbgemm_gpu/tbe/ssd/common.py
@@ -8,6 +8,9 @@
  # pyre-strict
  # pyre-ignore-all-errors[56]
  
+ import torch
+
+ # fmt:skip
  from fbgemm_gpu.utils.loader import load_torch_module
  
  try:
@@ -18,3 +21,27 @@ except Exception:
      pass
  
  ASSOC = 32
+
+
+ def pad4(value: int) -> int:
+     """
+     Compute the smallest multiple of 4 that is greater than or equal to the given value.
+
+     Parameters:
+         value (int): The integer to align (must be non-negative).
+
+     Returns:
+         int: The aligned value.
+
+     Raises:
+         ValueError: If the input is negative.
+         TypeError: If the input is not an integer.
+     """
+     return (int(value) + 3) & ~3
+
+
+ def tensor_pad4(value: torch.Tensor) -> torch.Tensor:
+     """
+     The equivalent of pad4 for tensors.
+     """
+     return (value + 3) & ~3
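A quick illustration of the new alignment helpers, assuming they are importable from fbgemm_gpu.tbe.ssd.common as this hunk suggests: pad4 rounds an integer up to the next multiple of 4 via (value + 3) & ~3, and tensor_pad4 applies the same trick elementwise.

    import torch

    from fbgemm_gpu.tbe.ssd.common import pad4, tensor_pad4

    assert pad4(0) == 0
    assert pad4(5) == 8   # 5 rounds up to the next multiple of 4
    assert pad4(8) == 8   # multiples of 4 are unchanged
    assert torch.equal(
        tensor_pad4(torch.tensor([0, 1, 4, 5])),
        torch.tensor([0, 4, 4, 8]),
    )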
fbgemm_gpu/tbe/ssd/inference.py
@@ -13,7 +13,7 @@ import logging
  import os
  import tempfile
  from math import log2
- from typing import List, Optional, Tuple
+ from typing import Optional
  
  import torch  # usort:skip
  
@@ -42,15 +42,15 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
      Inference version, with FP32/FP16/FP8/INT8/INT4/INT2 supports
      """
  
-     embedding_specs: List[Tuple[str, int, int, SparseType]]
+     embedding_specs: list[tuple[str, int, int, SparseType]]
      _local_instance_index: int = -1
  
      def __init__(
          self,
-         embedding_specs: List[
-             Tuple[str, int, int, SparseType]
+         embedding_specs: list[
+             tuple[str, int, int, SparseType]
          ],  # tuple of (feature_names, rows, dims, SparseType)
-         feature_table_map: Optional[List[int]] = None,  # [T]
+         feature_table_map: Optional[list[int]] = None,  # [T]
          pooling_mode: PoolingMode = PoolingMode.SUM,
          output_dtype: SparseType = SparseType.FP16,
          row_alignment: Optional[int] = None,
@@ -73,7 +73,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
          ssd_uniform_init_lower: float = -0.01,
          ssd_uniform_init_upper: float = 0.01,
          # Parameter Server Configs
-         ps_hosts: Optional[Tuple[Tuple[str, int]]] = None,
+         ps_hosts: Optional[tuple[tuple[str, int]]] = None,
          ps_max_key_per_request: Optional[int] = None,
          ps_client_thread_num: Optional[int] = None,
          ps_max_local_index_length: Optional[int] = None,
@@ -99,7 +99,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
          self.current_device = torch.device(device)
          self.use_cpu: bool = self.current_device.type == "cpu"
  
-         self.feature_table_map: List[int] = (
+         self.feature_table_map: list[int] = (
              feature_table_map if feature_table_map is not None else list(range(T_))
          )
          T = len(self.feature_table_map)
@@ -112,9 +112,9 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
          self.output_dtype: int = output_dtype.as_int()
          # (feature_names, rows, dims, weights_tys) = zip(*embedding_specs)
          # Pyre workaround
-         rows: List[int] = [e[1] for e in embedding_specs]
-         dims: List[int] = [e[2] for e in embedding_specs]
-         weights_tys: List[SparseType] = [e[3] for e in embedding_specs]
+         rows: list[int] = [e[1] for e in embedding_specs]
+         dims: list[int] = [e[2] for e in embedding_specs]
+         weights_tys: list[SparseType] = [e[3] for e in embedding_specs]
  
          D_offsets = [dims[t] for t in self.feature_table_map]
          D_offsets = [0] + list(itertools.accumulate(D_offsets))
@@ -169,7 +169,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
              offsets.append(uvm_size)
              uvm_size += state_size
  
-         self.weights_physical_offsets: List[int] = offsets
+         self.weights_physical_offsets: list[int] = offsets
  
          weights_tys_int = [weights_tys[t].as_int() for t in self.feature_table_map]
          self.register_buffer(
@@ -306,7 +306,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
          )
  
          # pyre-fixme[20]: Argument `self` expected.
-         (low_priority, high_priority) = torch.cuda.Stream.priority_range()
+         low_priority, high_priority = torch.cuda.Stream.priority_range()
          self.ssd_stream = torch.cuda.Stream(priority=low_priority)
          self.ssd_set_start = torch.cuda.Event()
          self.ssd_set_end = torch.cuda.Event()
@@ -369,7 +369,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
  
      @torch.jit.export
      def prefetch(self, indices: Tensor, offsets: Tensor) -> Tensor:
-         (indices, offsets) = indices.long(), offsets.long()
+         indices, offsets = indices.long(), offsets.long()
          linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
              self.hash_size_cumsum,
              indices,
@@ -517,13 +517,13 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
      @torch.jit.export
      def split_embedding_weights(
          self, split_scale_shifts: bool = True
-     ) -> List[Tuple[Tensor, Optional[Tensor]]]:
+     ) -> list[tuple[Tensor, Optional[Tensor]]]:
          """
          Returns a list of weights, split by table.
  
          Testing only, very slow.
          """
-         splits: List[Tuple[Tensor, Optional[Tensor]]] = []
+         splits: list[tuple[Tensor, Optional[Tensor]]] = []
          rows_cumsum = 0
          for _, row, dim, weight_ty in self.embedding_specs:
              weights = torch.empty(
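The hunks in this file are annotation-only: typing.List and typing.Tuple are swapped for the built-in list and tuple generics (PEP 585, fully supported on the Python 3.11 interpreter this cp311 wheel targets), plus two cosmetic removals of parentheses around tuple assignments. A before/after sketch of the pattern, using a made-up embedding_specs value just so the snippet runs:

    from typing import List  # the old annotations needed this import

    embedding_specs = [("f0", 100, 64, "INT8"), ("f1", 200, 32, "INT4")]  # hypothetical

    rows_old: List[int] = [e[1] for e in embedding_specs]  # before: typing.List
    rows_new: list[int] = [e[1] for e in embedding_specs]  # after: built-in generic
    assert rows_old == rows_new == [100, 200]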