fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -8,14 +8,21 @@
 # pyre-strict

 import dataclasses
+import logging
+import re
 from enum import Enum

 import click
 import torch
 import yaml

-from .tbe_data_config import TBEDataConfig
-from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
+# fmt:skip
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
+    BatchParams,
+    IndicesParams,
+    PoolingParams,
+    TBEDataConfig,
+)


 @dataclasses.dataclass(frozen=True)
@@ -40,12 +47,16 @@ class TBEDataConfigHelperText(Enum):
     TBE_INDICES_HITTERS = "Heavy hitters for indices (comma-delimited list of floats)"
     TBE_INDICES_ZIPF = "Zipf distribution parameters for indices generation (q, s)"
     TBE_INDICES_DTYPE = "The dtype of the table indices (choices: '32', '64')"
-    TBE_OFFSETS_DTYPE = "The dtype of the table indices (choices: '32', '64')"
+    TBE_OFFSETS_DTYPE = "The dtype of the table offsets (choices: '32', '64')"

     # Pooling Parameters
     TBE_POOLING_SIZE = "Bag size / pooling factor (L)"
-    TBE_POOLING_VL_SIGMA = "Standard deviation of B for VBE"
-    TBE_POOLING_VL_DIST = "VBE distribution (choices: 'uniform', 'normal')"
+    TBE_POOLING_VL_SIGMA = "Standard deviation of L for variable bag size"
+    TBE_POOLING_VL_DIST = (
+        "Variable bag size distribution (choices: 'uniform', 'normal')"
+    )
+    TBE_EMBEDDING_SPECS = "Embedding Specs which is List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]"
+    TBE_FEATURE_TABLE_MAP = "Mapping of feature-table"


 class TBEDataConfigLoader:
@@ -73,12 +84,26 @@ class TBEDataConfigLoader:
                 default=int(1e5),
                 help=TBEDataConfigHelperText.TBE_NUM_EMBEDDINGS.value,
             ),
+            click.option(
+                "--tbe-num-embeddings-list",
+                type=str,
+                required=False,
+                default=None,
+                help="Comma-separated list of number of embeddings (Es)",
+            ),
             click.option(
                 "--tbe-embedding-dim",
                 type=int,
                 default=128,
                 help=TBEDataConfigHelperText.TBE_EMBEDDING_DIM.value,
             ),
+            click.option(
+                "--tbe-embedding-dim-list",
+                type=str,
+                required=False,
+                default=None,
+                help="Comma-separated list of number of Embedding dimensions (D)",
+            ),
             click.option(
                 "--tbe-mixed-dim",
                 is_flag=True,
@@ -91,6 +116,13 @@ class TBEDataConfigLoader:
                 default=False,
                 help=TBEDataConfigHelperText.TBE_WEIGHTED.value,
             ),
+            click.option(
+                "--tbe-max-indices",
+                type=int,
+                required=False,
+                default=None,
+                help="(Optional) Maximum number of indices, will be calculated if not provided",
+            ),
             # Batch Parameters
             click.option(
                 "--tbe-batch-size",
@@ -98,6 +130,13 @@ class TBEDataConfigLoader:
                 default=512,
                 help=TBEDataConfigHelperText.TBE_BATCH_SIZE.value,
             ),
+            click.option(
+                "--tbe-batch-sizes-list",
+                type=str,
+                required=False,
+                default=None,
+                help="List Batch sizes per feature (Bs)",
+            ),
             click.option(
                 "--tbe-batch-vbe-sigma",
                 type=int,
@@ -160,6 +199,18 @@ class TBEDataConfigLoader:
                 required=False,
                 help=TBEDataConfigHelperText.TBE_POOLING_VL_DIST.value,
            ),
+            click.option(
+                "--tbe-embedding-specs",
+                type=str,
+                required=False,
+                help=TBEDataConfigHelperText.TBE_EMBEDDING_SPECS.value,
+            ),
+            click.option(
+                "--tbe-feature-table-map",
+                type=str,
+                required=False,
+                help=TBEDataConfigHelperText.TBE_FEATURE_TABLE_MAP.value,
+            ),
         ]

         for option in reversed(options):
@@ -180,18 +231,62 @@ class TBEDataConfigLoader:
         params = context.params

         # Read table parameters
-        T = params["tbe_num_tables"]
-        E = params["tbe_num_embeddings"]
+        T = params["tbe_num_tables"]  # number of features
+        E = params["tbe_num_embeddings"]  # feature_rows
+        if params["tbe_num_embeddings_list"] is not None:
+            Es = [int(x) for x in params["tbe_num_embeddings_list"].split(",")]
+            T = len(Es)
+            E = sum(Es) // T  # average E
+        else:
+            Es = None
         D = params["tbe_embedding_dim"]
+        if params["tbe_embedding_dim_list"] is not None:
+            Ds = [int(x) for x in params["tbe_embedding_dim_list"].split(",")]
+            assert (
+                len(Ds) == T
+            ), f"Expected tbe_embedding_dim_list to have {T} elements, but got {len(Ds)}"
+            D = sum(Ds) // T  # average D
+        else:
+            Ds = None
+
         mixed_dim = params["tbe_mixed_dim"]
         weighted = params["tbe_weighted"]
+        if params["tbe_max_indices"] is not None:
+            max_indices = params["tbe_max_indices"]
+        else:
+            max_indices = None

         # Read batch parameters
         B = params["tbe_batch_size"]
         sigma_B = params["tbe_batch_vbe_sigma"]
         vbe_distribution = params["tbe_batch_vbe_dist"]
         vbe_num_ranks = params["tbe_batch_vbe_ranks"]
-        batch_params = BatchParams(B, sigma_B, vbe_distribution, vbe_num_ranks)
+        if params["tbe_batch_sizes_list"] is not None:
+            Bs = [int(x) for x in params["tbe_batch_sizes_list"].split(",")]
+            B = sum(Bs) // T  # average B
+        else:
+            B = params["tbe_batch_size"]
+            Bs = None
+        batch_params = BatchParams(B, sigma_B, vbe_distribution, vbe_num_ranks, Bs)
+
+        # Parse embedding_specs: "(E,D),(E,D),..." or "(E,D,loc,dev),(E,D,loc,dev),..."
+        # Only the first two values (E, D) are extracted.
+        embedding_specs = None
+        feature_table_map = None
+        if params["tbe_embedding_specs"] is not None:
+            try:
+                tuples = re.findall(r"\(([^)]+)\)", params["tbe_embedding_specs"])
+                if tuples:
+                    embedding_specs = [
+                        (int(t.split(",")[0].strip()), int(t.split(",")[1].strip()))
+                        for t in tuples
+                    ]
+            except (ValueError, IndexError):
+                logging.warning("Failed to parse embedding_specs. Setting to None.")
+        if params["tbe_feature_table_map"] is not None:
+            feature_table_map = [
+                int(x) for x in params["tbe_feature_table_map"].split(",")
+            ]

         # Read indices parameters
         heavy_hitters = (
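Note: the list-style options above override the scalar T/E/D/B values when provided. A standalone sketch of how those strings are interpreted, mirroring the parsing in this hunk (the example values are illustrative, not taken from the package):

import re

# --tbe-num-embeddings-list "1000,2000,4000" overrides T and the average E
Es = [int(x) for x in "1000,2000,4000".split(",")]
T = len(Es)       # 3 tables
E = sum(Es) // T  # 2333, the average row count

# --tbe-embedding-specs "(1000,128),(2000,256,loc,dev)" keeps only the first
# two values of each tuple, exactly like the regex + int() conversion above
spec_str = "(1000,128),(2000,256,loc,dev)"
tuples = re.findall(r"\(([^)]+)\)", spec_str)
embedding_specs = [
    (int(t.split(",")[0].strip()), int(t.split(",")[1].strip())) for t in tuples
]
# embedding_specs == [(1000, 128), (2000, 256)]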
@@ -226,6 +321,11 @@ class TBEDataConfigLoader:
             indices_params,
             pooling_params,
             not torch.cuda.is_available(),
+            Es,
+            Ds,
+            max_indices,
+            embedding_specs,
+            feature_table_map,
         ).validate()

     @classmethod
@@ -9,7 +9,7 @@

 import dataclasses
 import json
-from typing import Any, Dict, Optional
+from typing import Any, Optional

 import torch

@@ -40,7 +40,7 @@ class IndicesParams:

     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data: Dict[str, Any]):
+    def from_dict(cls, data: dict[str, Any]):
         if not isinstance(data["heavy_hitters"], torch.Tensor):
             data["heavy_hitters"] = torch.tensor(
                 data["heavy_hitters"], dtype=torch.float32
@@ -54,7 +54,7 @@ class IndicesParams:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))

-    def dict(self) -> Dict[str, Any]:
+    def dict(self) -> dict[str, Any]:
         # https://stackoverflow.com/questions/73735974/convert-dataclass-of-dataclass-to-json-string
         tmp = dataclasses.asdict(self)
         # Convert tensor to list for JSON serialization
@@ -98,10 +98,12 @@ class BatchParams:
     vbe_distribution: Optional[str] = "normal"
     # Number of ranks for variable batch size generation
     vbe_num_ranks: Optional[int] = None
+    # List of target batch sizes, i.e. number of batch lookups per feature
+    Bs: Optional[list[int]] = None

     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data: Dict[str, Any]):
+    def from_dict(cls, data: dict[str, Any]):
         return cls(**data)

     @classmethod
@@ -109,7 +111,7 @@ class BatchParams:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))

-    def dict(self) -> Dict[str, Any]:
+    def dict(self) -> dict[str, Any]:
         return dataclasses.asdict(self)

     def json(self, format: bool = False) -> str:
@@ -117,7 +119,10 @@ class BatchParams:

     # pyre-ignore [3]
     def validate(self):
-        assert self.B > 0, "B must be positive"
+        if self.Bs is not None:
+            assert all(b > 0 for b in self.Bs), "All elements in Bs must be positive"
+        else:
+            assert self.B > 0, "B must be positive"
         assert not self.sigma_B or self.sigma_B > 0, "sigma_B must be positive"
         assert (
             self.vbe_num_ranks is None or self.vbe_num_ranks > 0
@@ -137,10 +142,12 @@ class PoolingParams:
     sigma_L: Optional[int] = None
     # [Optional] Distribution of embedding sequence lengths (normal, uniform)
     length_distribution: Optional[str] = "normal"
+    # [Optional] List of target bag sizes, i.e. pooling factors per batch
+    Ls: Optional[list[float]] = None

     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data: Dict[str, Any]):
+    def from_dict(cls, data: dict[str, Any]):
         return cls(**data)

     @classmethod
@@ -148,7 +155,7 @@ class PoolingParams:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))

-    def dict(self) -> Dict[str, Any]:
+    def dict(self) -> dict[str, Any]:
         return dataclasses.asdict(self)

     def json(self, format: bool = False) -> str:
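A minimal sketch of how the new Bs field changes BatchParams validation, using the field names shown in the hunks above and the import path from the loader's import block (example values are illustrative):

from fbgemm_gpu.tbe.bench.tbe_data_config import BatchParams

Bs = [256, 512, 1024]          # per-feature batch sizes
params = BatchParams(
    B=sum(Bs) // len(Bs),      # 597, kept as the average, as the loader does
    sigma_B=None,
    vbe_distribution="normal",
    vbe_num_ranks=2,
    Bs=Bs,
)
params.validate()              # with Bs set, every entry must be positive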
@@ -6,15 +6,14 @@

 # pyre-strict

-import logging
+from typing import List, Tuple

 import numpy as np
 import torch

+# fmt:skip
 from fbgemm_gpu.split_embedding_configs import SparseType

-logging.basicConfig(level=logging.DEBUG)
-

 def fill_random_scale_bias(
     emb: torch.nn.Module,
@@ -23,9 +22,9 @@ def fill_random_scale_bias(
 ) -> None:
     for t in range(T):
         # pyre-fixme[29]: `Union[Module, Tensor]` is not a function.
-        (weights, scale_shift) = emb.split_embedding_weights()[t]
+        weights, scale_shift = emb.split_embedding_weights()[t]
         if scale_shift is not None:
-            (E, R) = scale_shift.shape
+            E, R = scale_shift.shape
             assert R == 4
             scales = None
             shifts = None
@@ -46,3 +45,128 @@
                     device=scale_shift.device,
                 )
             )
+
+
+def check_oom(
+    data_size: int,
+) -> Tuple[bool, str]:
+    free_memory, total_memory = torch.cuda.mem_get_info()
+    if data_size > free_memory:
+        warning = f"Expect to allocate {round(data_size / (1024 ** 3), 2)} GB, but available memory is {round(free_memory / (1024 ** 3), 2)} GB from {round(total_memory / (1024 ** 3), 2)} GB."
+        return (True, warning)
+    return (False, "")
+
+
+def generate_batch_size_per_feature_per_rank(
+    Bs: List[int], num_ranks: int
+) -> List[List[int]]:
+    """
+    Generate batch size per feature per rank for VBE, assuming the batch size
+    is evenly distributed across ranks.
+    Args:
+        Bs (List[int]): batch size per feature
+        num_ranks (int): number of ranks
+    Returns:
+        List[List[int]]: batch size per feature per rank
+    """
+    b_per_feature_per_rank = []
+    for B in Bs:
+        b_per_feature = []
+        for i in range(num_ranks):
+            if i != num_ranks - 1:
+                b_per_feature.append(int(B / num_ranks))
+            else:
+                b_per_feature.append(B - sum(b_per_feature))
+        b_per_feature_per_rank.append(b_per_feature)
+    return b_per_feature_per_rank
+
+
+def generate_merged_output_and_offsets(
+    Ds: List[int],
+    Bs: List[int],
+    output_dtype: torch.dtype,
+    device: torch.device,
+    num_ranks: int = 2,
+    num_tbe_ops: int = 2,
+) -> Tuple[List[List[int]], torch.Tensor, torch.Tensor]:
+    """
+    Generate merged vbe_output and vbe_output_offsets tensors for VBE.
+    The vbe_output is a tensor that will contain forward output from all VBE TBE ops.
+    The vbe_output_offsets is a tensor that will contain start offsets for the output to be written to.
+
+    Args:
+        Ds (List[int]): embedding dimension per feature
+        Bs (List[int]): batch size per feature
+        num_ranks (int): number of ranks
+        num_tbe_ops (int): number of TBE ops
+    Returns:
+        Tuple[List[List[int]], torch.Tensor, torch.Tensor]: batch_size_per_feature_per_rank, merged vbe_output and vbe_output_offsets tensors
+    """
+    # The first embedding ops is the embedding op created in the benchmark
+    emb_op = {}
+    emb_op[0] = {}
+    emb_op[0]["dim"] = Ds
+    emb_op[0]["Bs"] = Bs
+    emb_op[0]["output_size"] = sum([b * d for b, d in zip(Bs, Ds)])
+    emb_op[0]["batch_size_per_feature_per_rank"] = (
+        generate_batch_size_per_feature_per_rank(Bs, num_ranks)
+    )
+    num_features = len(Bs)
+    # create other embedding ops to allocate output and offsets tensors
+    # Using representative values for additional TBE ops in multi-op scenarios:
+    # - batch_size=32000: typical large batch size for production workloads
+    # - dim=512: common embedding dimension for large models
+    for i in range(1, num_tbe_ops):
+        emb_op[i] = {}
+        emb_op[i]["batch_size_per_feature_per_rank"] = (
+            generate_batch_size_per_feature_per_rank([32000], num_ranks)
+        )
+        emb_op[i]["Bs"] = [sum(B) for B in emb_op[i]["batch_size_per_feature_per_rank"]]
+        emb_op[i]["dim"] = [512]
+        emb_op[i]["output_size"] = sum(
+            [b * d for b, d in zip(emb_op[i]["Bs"], emb_op[i]["dim"])]
+        )
+    total_output = 0
+    ranks = [[] for _ in range(num_ranks)]
+    for e in emb_op.values():
+        b_per_rank_per_feature = list(zip(*e["batch_size_per_feature_per_rank"]))
+        assert len(b_per_rank_per_feature) == num_ranks
+        dims = e["dim"]
+        for r, b_r in enumerate(b_per_rank_per_feature):
+            for f, b in enumerate(b_r):
+                output_size_per_batch = b * dims[f]
+                ranks[r].append(output_size_per_batch)
+                total_output += output_size_per_batch
+    ranks[0].insert(0, 0)
+    offsets_ranks: List[List[int]] = [[] for _ in range(num_ranks)]
+    total_output_offsets = []
+    start = 0
+    for r in range(num_ranks):
+        offsets_ranks[r] = [
+            start + sum(ranks[r][: i + 1]) for i in range(len(ranks[r]))
+        ]
+        start = offsets_ranks[r][-1]
+        total_output_offsets.extend(offsets_ranks[r])
+    check_total_output_size = sum([e["output_size"] for e in emb_op.values()])
+    assert (
+        total_output == check_total_output_size
+    ), f"{total_output} != {check_total_output_size}{[e['output_size'] for e in emb_op.values()]}"
+    assert (
+        total_output == total_output_offsets[-1]
+    ), f"{total_output} != {total_output_offsets[-1]}"
+    out = torch.empty(total_output, dtype=output_dtype, device=device)
+    offsets = []
+    offsets.append(offsets_ranks[0][:num_features])
+    for r in range(1, num_ranks):
+        start = [offsets_ranks[r - 1][-1]]
+        the_rest = offsets_ranks[r][: num_features - 1] if num_features > 1 else []
+        start.extend(the_rest)
+        offsets.append(start)
+
+    out_offsets = torch.tensor(
+        offsets,
+        dtype=torch.int64,
+        device=device,
+    )
+    batch_size_per_feature_per_rank = emb_op[0]["batch_size_per_feature_per_rank"]
+    return (batch_size_per_feature_per_rank, out, out_offsets)
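A small worked example of the even-split helper added above; the last rank absorbs the remainder. The module path is inferred from the file listing (fbgemm_gpu/tbe/bench/utils.py):

from fbgemm_gpu.tbe.bench.utils import generate_batch_size_per_feature_per_rank

# Two features with batch sizes 100 and 7, split across 4 ranks
print(generate_batch_size_per_feature_per_rank([100, 7], num_ranks=4))
# [[25, 25, 25, 25], [1, 1, 1, 4]]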
@@ -10,7 +10,7 @@
 # pyre-ignore-all-errors[56]


-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union

 import torch  # usort:skip
 from torch import Tensor  # usort:skip
@@ -47,15 +47,15 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):

     def __init__(  # noqa C901
         self,
-        embedding_specs: List[
-            Tuple[str, int, int, SparseType, EmbeddingLocation]
+        embedding_specs: list[
+            tuple[str, int, int, SparseType, EmbeddingLocation]
         ],  # tuple of (feature_names, rows, dims, SparseType, EmbeddingLocation/placement)
-        feature_table_map: Optional[List[int]] = None,  # [T]
-        index_remapping: Optional[List[Tensor]] = None,
+        feature_table_map: Optional[list[int]] = None,  # [T]
+        index_remapping: Optional[list[Tensor]] = None,
         pooling_mode: PoolingMode = PoolingMode.SUM,
         device: Optional[Union[str, int, torch.device]] = None,
         bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
-        weight_lists: Optional[List[Tuple[Tensor, Optional[Tensor]]]] = None,
+        weight_lists: Optional[list[tuple[Tensor, Optional[Tensor]]]] = None,
         pruning_hash_load_factor: float = 0.5,
         use_array_for_index_remapping: bool = True,
         output_dtype: SparseType = SparseType.FP16,
@@ -74,8 +74,9 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         cacheline_alignment: bool = True,
         uvm_host_mapped: bool = False,  # True to use cudaHostAlloc; False to use cudaMallocManaged.
         reverse_qparam: bool = False,  # True to load qparams at end of each row; False to load qparam at begnning of each row.
-        feature_names_per_table: Optional[List[List[str]]] = None,
+        feature_names_per_table: Optional[list[list[str]]] = None,
         indices_dtype: torch.dtype = torch.int32,  # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+        embedding_cache_mode: bool = False,  # True for zero initialization, False for randomized initialization
     ) -> None:  # noqa C901  # tuple of (rows, dims,)
         super(KVEmbeddingInference, self).__init__(
             embedding_specs=embedding_specs,
@@ -114,17 +115,21 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         num_shards = 32
         uniform_init_lower: float = -0.01
         uniform_init_upper: float = 0.01
+
         # pyre-fixme[4]: Attribute must be annotated.
         self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards, uniform_init_lower, uniform_init_upper
+            num_shards,
+            uniform_init_lower,
+            uniform_init_upper,
+            embedding_cache_mode,  # in embedding_cache_mode, we disable random init
         )

-        self.specs: List[Tuple[int, int, int]] = [
+        self.specs: list[tuple[int, int, int]] = [
             (rows, dims, sparse_type.as_int())
             for (_, rows, dims, sparse_type, _) in self.embedding_specs
         ]
         # table shard offset if inference sharding is enabled, otherwise, should be all zeros
-        self.table_sharding_offset: List[int] = [0] * len(self.embedding_specs)
+        self.table_sharding_offset: list[int] = [0] * len(self.embedding_specs)
         self.kv_embedding_cache_initialized = False
         self.hash_size_cumsum: torch.Tensor = torch.zeros(
             0,
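A hedged sketch of constructing the inference op with the new flag. The spec tuple layout is taken from the __init__ signature above; the concrete SparseType/EmbeddingLocation values and import paths are assumptions and not verified against the package:

from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

tbe = KVEmbeddingInference(
    # (feature_name, rows, dims, SparseType, EmbeddingLocation)
    embedding_specs=[("t0", 10_000, 128, SparseType.INT8, EmbeddingLocation.HOST)],
    embedding_cache_mode=True,  # zero-initialize the DRAM KV cache instead of random init
)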
@@ -137,7 +142,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
             dtype=torch.int64,
         )

-    def construct_hash_size_cumsum(self) -> List[int]:
+    def construct_hash_size_cumsum(self) -> list[int]:
         hash_size_cumsum = [0]
         for spec in self.embedding_specs:
             rows = spec[1]
@@ -146,7 +151,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):

     def calculate_indices_and_weights_offsets(
         self, indices: Tensor, offsets: Tensor
-    ) -> Tuple[Tensor, Tensor]:
+    ) -> tuple[Tensor, Tensor]:
         if self.pooling_mode is not PoolingMode.NONE:
             T = self.weights_offsets.numel()
         else:
@@ -280,7 +285,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         self.weight_initialized = True

     @torch.jit.export
-    def init_tbe_config(self, table_sharding_offset: List[int]) -> None:
+    def init_tbe_config(self, table_sharding_offset: list[int]) -> None:
         """
         Initialize the dynamic TBE table configs, e.g. sharded table offsets, etc.
         Should be called before loading weights.
@@ -290,9 +295,9 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
     @torch.jit.export
     def embedding_inplace_update(
         self,
-        update_table_indices: List[int],
-        update_row_indices: List[List[int]],
-        update_weights: List[Tensor],
+        update_table_indices: list[int],
+        update_row_indices: list[list[int]],
+        update_weights: list[Tensor],
     ) -> None:
         # function is not used for now on the inference side
         for i in range(len(update_table_indices)):
@@ -355,9 +360,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         if not self.kv_embedding_cache_initialized:
             self.initialize_logical_weights_placements_and_offsets()

-            self.row_alignment = (
-                8 if self.use_cpu else self.row_alignment
-            )  # in order to use mempool implementation for kv embedding it needs to be divisible by 8
+            self.row_alignment = 8  # in order to use mempool implementation for kv embedding it needs to be divisible by 8

             hash_size_cumsum = self.construct_hash_size_cumsum()
             self.hash_size_cumsum = torch.tensor(
@@ -6,7 +6,7 @@

 # pyre-unsafe

-from typing import Optional, Tuple, Union
+from typing import Optional, Union

 import torch

@@ -17,13 +17,13 @@ def get_unique_indices_v2(
     compute_count: bool = False,
     compute_inverse_indices: bool = False,
 ) -> Union[
-    Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
-    Tuple[
+    tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
+    tuple[
         torch.Tensor,
         torch.Tensor,
         Optional[torch.Tensor],
     ],
-    Tuple[torch.Tensor, torch.Tensor],
+    tuple[torch.Tensor, torch.Tensor],
 ]:
     """
     A wrapper for get_unique_indices for overloading the return type
@@ -10,6 +10,7 @@

 import torch

+# fmt:skip
 from fbgemm_gpu.utils.loader import load_torch_module

 try:
@@ -13,7 +13,7 @@ import logging
 import os
 import tempfile
 from math import log2
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch  # usort:skip

@@ -42,15 +42,15 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
     Inference version, with FP32/FP16/FP8/INT8/INT4/INT2 supports
     """

-    embedding_specs: List[Tuple[str, int, int, SparseType]]
+    embedding_specs: list[tuple[str, int, int, SparseType]]
     _local_instance_index: int = -1

     def __init__(
         self,
-        embedding_specs: List[
-            Tuple[str, int, int, SparseType]
+        embedding_specs: list[
+            tuple[str, int, int, SparseType]
         ],  # tuple of (feature_names, rows, dims, SparseType)
-        feature_table_map: Optional[List[int]] = None,  # [T]
+        feature_table_map: Optional[list[int]] = None,  # [T]
         pooling_mode: PoolingMode = PoolingMode.SUM,
         output_dtype: SparseType = SparseType.FP16,
         row_alignment: Optional[int] = None,
@@ -73,7 +73,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
         ssd_uniform_init_lower: float = -0.01,
         ssd_uniform_init_upper: float = 0.01,
         # Parameter Server Configs
-        ps_hosts: Optional[Tuple[Tuple[str, int]]] = None,
+        ps_hosts: Optional[tuple[tuple[str, int]]] = None,
         ps_max_key_per_request: Optional[int] = None,
         ps_client_thread_num: Optional[int] = None,
         ps_max_local_index_length: Optional[int] = None,
@@ -99,7 +99,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
         self.current_device = torch.device(device)
         self.use_cpu: bool = self.current_device.type == "cpu"

-        self.feature_table_map: List[int] = (
+        self.feature_table_map: list[int] = (
             feature_table_map if feature_table_map is not None else list(range(T_))
         )
         T = len(self.feature_table_map)
@@ -112,9 +112,9 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
         self.output_dtype: int = output_dtype.as_int()
         # (feature_names, rows, dims, weights_tys) = zip(*embedding_specs)
         # Pyre workaround
-        rows: List[int] = [e[1] for e in embedding_specs]
-        dims: List[int] = [e[2] for e in embedding_specs]
-        weights_tys: List[SparseType] = [e[3] for e in embedding_specs]
+        rows: list[int] = [e[1] for e in embedding_specs]
+        dims: list[int] = [e[2] for e in embedding_specs]
+        weights_tys: list[SparseType] = [e[3] for e in embedding_specs]

         D_offsets = [dims[t] for t in self.feature_table_map]
         D_offsets = [0] + list(itertools.accumulate(D_offsets))
@@ -169,7 +169,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
             offsets.append(uvm_size)
             uvm_size += state_size

-        self.weights_physical_offsets: List[int] = offsets
+        self.weights_physical_offsets: list[int] = offsets

         weights_tys_int = [weights_tys[t].as_int() for t in self.feature_table_map]
         self.register_buffer(
@@ -306,7 +306,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
         )

         # pyre-fixme[20]: Argument `self` expected.
-        (low_priority, high_priority) = torch.cuda.Stream.priority_range()
+        low_priority, high_priority = torch.cuda.Stream.priority_range()
         self.ssd_stream = torch.cuda.Stream(priority=low_priority)
         self.ssd_set_start = torch.cuda.Event()
         self.ssd_set_end = torch.cuda.Event()
@@ -369,7 +369,7 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):

     @torch.jit.export
     def prefetch(self, indices: Tensor, offsets: Tensor) -> Tensor:
-        (indices, offsets) = indices.long(), offsets.long()
+        indices, offsets = indices.long(), offsets.long()
         linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
             self.hash_size_cumsum,
             indices,
@@ -517,13 +517,13 @@ class SSDIntNBitTableBatchedEmbeddingBags(nn.Module):
     @torch.jit.export
     def split_embedding_weights(
         self, split_scale_shifts: bool = True
-    ) -> List[Tuple[Tensor, Optional[Tensor]]]:
+    ) -> list[tuple[Tensor, Optional[Tensor]]]:
         """
         Returns a list of weights, split by table.

         Testing only, very slow.
         """
-        splits: List[Tuple[Tensor, Optional[Tensor]]] = []
+        splits: list[tuple[Tensor, Optional[Tensor]]] = []
         rows_cumsum = 0
         for _, row, dim, weight_ty in self.embedding_specs:
             weights = torch.empty(