fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -9,21 +9,18 @@
9
9
 
10
10
  import dataclasses
11
11
  import json
12
- from typing import Any, Dict, List, Optional, Tuple
12
+ import logging
13
+ from typing import Any, List, Optional, Tuple
13
14
 
14
- import numpy as np
15
15
  import torch
16
16
 
17
- from fbgemm_gpu.tbe.utils.common import get_device, round_up
18
- from fbgemm_gpu.tbe.utils.requests import (
19
- generate_batch_sizes_from_stats,
20
- generate_pooling_factors_from_stats,
21
- get_table_batched_offsets_from_dense,
22
- maybe_to_dtype,
23
- TBERequest,
24
- )
25
-
26
- from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
17
+ # fmt:skip
18
+ from fbgemm_gpu.tbe.utils.common import get_device
19
+ from .tbe_data_config_param_models import (
20
+ BatchParams,
21
+ IndicesParams,
22
+ PoolingParams,
23
+ ) # usort:skip
27
24
 
28
25
  try:
29
26
  torch.ops.load_library(
@@ -35,27 +32,87 @@ except Exception:
35
32
 
36
33
  @dataclasses.dataclass(frozen=True)
37
34
  class TBEDataConfig:
38
- # Number of tables
39
35
  T: int
40
- # Number of rows in the embedding table
41
36
  E: int
42
- # Target embedding dimension for a table (number of columns)
43
37
  D: int
44
- # Generate mixed dimensions if true
45
38
  mixed_dim: bool
46
- # Whether the table is weighted or not
47
39
  weighted: bool
48
- # Batch parameters
49
40
  batch_params: BatchParams
50
- # Indices parameters
51
41
  indices_params: IndicesParams
52
- # Pooling parameters
53
42
  pooling_params: PoolingParams
54
- # Force generated tensors to be on CPU
55
43
  use_cpu: bool = False
44
+ Es: Optional[list[int]] = None
45
+ Ds: Optional[list[int]] = None
46
+ max_indices: Optional[int] = None
47
+ embedding_specs: Optional[List[Tuple[int, int]]] = None
48
+ feature_table_map: Optional[List[int]] = None
49
+ """
50
+ Configuration for TBE (Table Batched Embedding) benchmark data collection and generation.
51
+
52
+ This dataclass holds parameters required to generate synthetic data for
53
+ TBE benchmarking, including table specifications, batch parameters, indices
54
+ distribution parameters, and pooling parameters.
55
+
56
+ Args:
57
+ T (int): Number of embedding tables (features). Must be positive.
58
+ E (int): Number of rows in the embedding table (feature). If T > 1, this
59
+ represents the averaged number of rows across all features.
60
+ D (int): Target embedding dimension for a table (feature), i.e., number of
61
+ columns. If T > 1, this represents the averaged dimension across
62
+ all features.
63
+ mixed_dim (bool): If True, generate embeddings with mixed dimensions
64
+ across tables (features). This is automatically set to True if D is provided
65
+ as a list with non-uniform values.
66
+ weighted (bool): If True, the lookup rows are weighted (per-sample
67
+ weights). The weights will be generated as FP32 tensors.
68
+ batch_params (BatchParams): Parameters controlling batch generation.
69
+ Contains:
70
+ (1) `B` = target batch size (number of batch lookups per features)
71
+ (2) `sigma_B` = optional standard deviation for variable batch size
72
+ (3) `vbe_distribution` = distribution type ("normal" or "uniform")
73
+ (4) `vbe_num_ranks` = number of ranks for variable batch size
74
+ (5) `Bs` = per-feature batch sizes
75
+ indices_params (IndicesParams): Parameters controlling index generation
76
+ following a Zipf distribution. Contains:
77
+ (1) `heavy_hitters` = probability density map for hot indices
78
+ (2) `zipf_q` = q parameter in Zipf distribution (x+q)^{-s}
79
+ (3) `zipf_s` = s parameter (alpha) in Zipf distribution
80
+ (4) `index_dtype` = optional dtype for indices tensor
81
+ (5) `offset_dtype` = optional dtype for offsets tensor
82
+ pooling_params (PoolingParams): Parameters controlling pooling behavior.
83
+ Contains:
84
+ (1) `L` = target bag size (pooling factor, indices per lookup)
85
+ (2) `sigma_L` = optional standard deviation for variable bag size
86
+ (3) `length_distribution` = distribution type ("normal" or "uniform")
87
+ (4) `Ls` = per-feature bag sizes
88
+ use_cpu (bool = False): If True, force generated tensors to be placed
89
+ on CPU instead of the default compute device.
90
+ Es (Optional[List[int]] = None): Number of embeddings (rows) for each
91
+ individual embedding feature. If provided, must have length equal
92
+ to T. All elements must be positive.
93
+ Ds (Optional[List[int]] = None): Target embedding dimension (columns)
94
+ for each individual feature. If provided, must have length equal
95
+ to T. All elements must be positive.
96
+ max_indices (Optional[int] = None): Maximum number of indices for
97
+ bounds checking. If Es is provided as a list and max_indices is
98
+ None, it is automatically computed as sum(Es) - 1.
99
+ embedding_specs (Optional[List[Tuple[int, int]]] = None): A list of
100
+ embedding specs consisting of a list of tuples of (num_rows, embedding_dim).
101
+ See https://fburl.com/tbe_embedding_specs for details.
102
+ feature_table_map (Optional[List[int]] = None): An optional list that
103
+ specifies feature-table mapping. feature_table_map[i] indicates the
104
+ physical embedding table that feature i maps to.
105
+ """
106
+
107
+ def __post_init__(self) -> None:
108
+ if isinstance(self.D, list):
109
+ object.__setattr__(self, "mixed_dim", len(set(self.D)) > 1)
110
+ if isinstance(self.E, list) and self.max_indices is None:
111
+ object.__setattr__(self, "max_indices", sum(self.E) - 1)
112
+ self.validate()
56
113
 
57
114
  @staticmethod
58
- def complex_fields() -> Dict[str, Any]:
115
+ def complex_fields() -> dict[str, Any]:
59
116
  return {
60
117
  "batch_params": BatchParams,
61
118
  "indices_params": IndicesParams,
@@ -64,7 +121,7 @@ class TBEDataConfig:
64
121
 
65
122
  @classmethod
66
123
  # pyre-ignore [3]
67
- def from_dict(cls, data: Dict[str, Any]):
124
+ def from_dict(cls, data: dict[str, Any]):
68
125
  for field, Type in cls.complex_fields().items():
69
126
  if not isinstance(data[field], Type):
70
127
  data[field] = Type.from_dict(data[field])
@@ -73,9 +130,22 @@ class TBEDataConfig:
73
130
  @classmethod
74
131
  # pyre-ignore [3]
75
132
  def from_json(cls, data: str):
76
- return cls.from_dict(json.loads(data))
133
+ raw = json.loads(data)
134
+ allowed = {f.name for f in dataclasses.fields(cls)}
135
+ existing_fields = {k: v for k, v in raw.items() if k in allowed}
136
+ missing_fields = allowed - set(existing_fields.keys())
137
+ unknown_fields = set(raw.keys()) - allowed
138
+ if missing_fields:
139
+ logging.warning(
140
+ f"TBEDataConfig.from_json: Missing expected fields not loaded: {sorted(missing_fields)}"
141
+ )
142
+ if unknown_fields:
143
+ logging.info(
144
+ f"TBEDataConfig.from_json: Ignored unknown fields from input: {sorted(unknown_fields)}"
145
+ )
146
+ return cls.from_dict(existing_fields)
77
147
 
78
- def dict(self) -> Dict[str, Any]:
148
+ def dict(self) -> dict[str, Any]:
79
149
  tmp = dataclasses.asdict(self)
80
150
  for field in TBEDataConfig.complex_fields().keys():
81
151
  tmp[field] = self.__dict__[field].dict()
@@ -89,10 +159,30 @@ class TBEDataConfig:
89
159
  # NOTE: Add validation logic here
90
160
  assert self.T > 0, "T must be positive"
91
161
  assert self.E > 0, "E must be positive"
162
+ if self.Es is not None:
163
+ assert all(e > 0 for e in self.Es), "All elements in Es must be positive"
92
164
  assert self.D > 0, "D must be positive"
165
+ if self.Ds is not None:
166
+ assert all(d > 0 for d in self.Ds), "All elements in Ds must be positive"
167
+ if isinstance(self.Es, list) and isinstance(self.Ds, list):
168
+ assert (
169
+ len(self.Es) == len(self.Ds) == self.T
170
+ ), "Lengths of Es, Lengths of Ds, and T must be equal"
171
+ if self.max_indices is not None:
172
+ assert self.max_indices == (
173
+ sum(self.Es) - 1
174
+ ), "max_indices must be equal to sum(Es) - 1"
93
175
  self.batch_params.validate()
176
+ if self.batch_params.Bs is not None:
177
+ assert (
178
+ len(self.batch_params.Bs) == self.T
179
+ ), f"Length of Bs must be equal to T. Expected: {self.T}, but got: {len(self.batch_params.Bs)}"
94
180
  self.indices_params.validate()
95
181
  self.pooling_params.validate()
182
+ if self.pooling_params.Ls is not None:
183
+ assert (
184
+ len(self.pooling_params.Ls) == self.T
185
+ ), f"Length of Ls must be equal to T. Expected: {self.T}, but got: {len(self.pooling_params.Ls)}"
96
186
  return self
97
187
 
98
188
  def variable_B(self) -> bool:
@@ -102,177 +192,5 @@ class TBEDataConfig:
102
192
  return self.pooling_params.sigma_L is not None
103
193
 
104
194
  def _new_weights(self, size: int) -> Optional[torch.Tensor]:
105
- # per sample weights will always be FP32
195
+ # Per-sample weights will always be FP32
106
196
  return None if not self.weighted else torch.randn(size, device=get_device())
107
-
108
- def _generate_batch_sizes(self) -> Tuple[List[int], Optional[List[List[int]]]]:
109
- if self.variable_B():
110
- assert (
111
- self.batch_params.vbe_num_ranks is not None
112
- ), "vbe_num_ranks must be set for varaible batch size generation"
113
- return generate_batch_sizes_from_stats(
114
- self.batch_params.B,
115
- self.T,
116
- # pyre-ignore [6]
117
- self.batch_params.sigma_B,
118
- self.batch_params.vbe_num_ranks,
119
- # pyre-ignore [6]
120
- self.batch_params.vbe_distribution,
121
- )
122
-
123
- else:
124
- return ([self.batch_params.B] * self.T, None)
125
-
126
- def _generate_pooling_info(self, iters: int, Bs: List[int]) -> torch.Tensor:
127
- if self.variable_L():
128
- # Generate L from stats
129
- _, L_offsets = generate_pooling_factors_from_stats(
130
- iters,
131
- Bs,
132
- self.pooling_params.L,
133
- # pyre-ignore [6]
134
- self.pooling_params.sigma_L,
135
- # pyre-ignore [6]
136
- self.pooling_params.length_distribution,
137
- )
138
-
139
- else:
140
- Ls = [self.pooling_params.L] * (sum(Bs) * iters)
141
- L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
142
-
143
- return L_offsets
144
-
145
- def _generate_indices(
146
- self,
147
- iters: int,
148
- Bs: List[int],
149
- L_offsets: torch.Tensor,
150
- ) -> torch.Tensor:
151
- total_B = sum(Bs)
152
- L_offsets_list = L_offsets.tolist()
153
- indices_list = []
154
- for it in range(iters):
155
- # L_offsets is defined over the entire set of batches for a single iteration
156
- start_offset = L_offsets_list[it * total_B]
157
- end_offset = L_offsets_list[(it + 1) * total_B]
158
-
159
- indices_list.append(
160
- torch.ops.fbgemm.tbe_generate_indices_from_distribution(
161
- self.indices_params.heavy_hitters,
162
- self.indices_params.zipf_q,
163
- self.indices_params.zipf_s,
164
- # max_index = dimensions of the embedding table
165
- self.E,
166
- # num_indices = number of indices to generate
167
- end_offset - start_offset,
168
- )
169
- )
170
-
171
- return torch.cat(indices_list)
172
-
173
- def _build_requests_jagged(
174
- self,
175
- iters: int,
176
- Bs: List[int],
177
- Bs_feature_rank: Optional[List[List[int]]],
178
- L_offsets: torch.Tensor,
179
- all_indices: torch.Tensor,
180
- ) -> List[TBERequest]:
181
- total_B = sum(Bs)
182
- all_indices = all_indices.flatten()
183
- requests = []
184
- for it in range(iters):
185
- start_offset = L_offsets[it * total_B]
186
- it_L_offsets = torch.concat(
187
- [
188
- torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
189
- L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
190
- ]
191
- )
192
- requests.append(
193
- TBERequest(
194
- maybe_to_dtype(
195
- all_indices[start_offset : L_offsets[(it + 1) * total_B]],
196
- self.indices_params.index_dtype,
197
- ),
198
- maybe_to_dtype(
199
- it_L_offsets.to(get_device()), self.indices_params.offset_dtype
200
- ),
201
- self._new_weights(int(it_L_offsets[-1].item())),
202
- Bs_feature_rank if self.variable_B() else None,
203
- )
204
- )
205
- return requests
206
-
207
- def _build_requests_dense(
208
- self, iters: int, all_indices: torch.Tensor
209
- ) -> List[TBERequest]:
210
- # NOTE: We're using existing code from requests.py to build the
211
- # requests, and since the existing code requires 2D view of all_indices,
212
- # the existing all_indices must be reshaped
213
- all_indices = all_indices.reshape(iters, -1)
214
-
215
- requests = []
216
- for it in range(iters):
217
- indices, offsets = get_table_batched_offsets_from_dense(
218
- all_indices[it].view(
219
- self.T, self.batch_params.B, self.pooling_params.L
220
- ),
221
- use_cpu=self.use_cpu,
222
- )
223
- requests.append(
224
- TBERequest(
225
- maybe_to_dtype(indices, self.indices_params.index_dtype),
226
- maybe_to_dtype(offsets, self.indices_params.offset_dtype),
227
- self._new_weights(
228
- self.T * self.batch_params.B * self.pooling_params.L
229
- ),
230
- )
231
- )
232
- return requests
233
-
234
- def generate_requests(
235
- self,
236
- iters: int = 1,
237
- ) -> List[TBERequest]:
238
- # Generate batch sizes
239
- Bs, Bs_feature_rank = self._generate_batch_sizes()
240
-
241
- # Generate pooling info
242
- L_offsets = self._generate_pooling_info(iters, Bs)
243
-
244
- # Generate indices
245
- all_indices = self._generate_indices(iters, Bs, L_offsets)
246
-
247
- # Build TBE requests
248
- if self.variable_B() or self.variable_L():
249
- return self._build_requests_jagged(
250
- iters, Bs, Bs_feature_rank, L_offsets, all_indices
251
- )
252
- else:
253
- return self._build_requests_dense(iters, all_indices)
254
-
255
- def generate_embedding_dims(self) -> Tuple[int, List[int]]:
256
- if self.mixed_dim:
257
- Ds = [
258
- round_up(
259
- np.random.randint(low=int(0.5 * self.D), high=int(1.5 * self.D)), 4
260
- )
261
- for _ in range(self.T)
262
- ]
263
- return (int(np.average(Ds)), Ds)
264
- else:
265
- return (self.D, [self.D] * self.T)
266
-
267
- def generate_feature_requires_grad(self, size: int) -> torch.Tensor:
268
- assert size <= self.T, "size of feature_requires_grad must be less than T"
269
- weighted_requires_grad_tables = np.random.choice(
270
- self.T, replace=False, size=(size,)
271
- ).tolist()
272
- return (
273
- torch.tensor(
274
- [1 if t in weighted_requires_grad_tables else 0 for t in range(self.T)]
275
- )
276
- .to(get_device())
277
- .int()
278
- )
@@ -0,0 +1,332 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import logging
11
+ from typing import Optional
12
+
13
+ import numpy as np
14
+ import torch
15
+
16
+ # fmt:skip
17
+ from fbgemm_gpu.tbe.bench.tbe_data_config import TBEDataConfig
18
+ from fbgemm_gpu.tbe.utils.common import get_device, round_up
19
+ from fbgemm_gpu.tbe.utils.requests import (
20
+ generate_batch_sizes_from_stats,
21
+ generate_pooling_factors_from_stats,
22
+ get_table_batched_offsets_from_dense,
23
+ maybe_to_dtype,
24
+ TBERequest,
25
+ )
26
+
27
+ try:
28
+ # pyre-ignore[21]
29
+ from fbgemm_gpu import open_source # noqa: F401
30
+ except Exception:
31
+ torch.ops.load_library(
32
+ "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_generator"
33
+ )
34
+
35
+
36
+ def _generate_batch_sizes(
37
+ tbe_data_config: TBEDataConfig,
38
+ ) -> tuple[list[int], Optional[list[list[int]]]]:
39
+ logging.info(
40
+ f"DEBUG_TBE: [_generate_batch_sizes] VBE tbe_data_config.variable_B()={tbe_data_config.variable_B()}"
41
+ )
42
+ if tbe_data_config.variable_B():
43
+ assert (
44
+ tbe_data_config.batch_params.vbe_num_ranks is not None
45
+ ), "vbe_num_ranks must be set for varaible batch size generation"
46
+ return generate_batch_sizes_from_stats(
47
+ tbe_data_config.batch_params.B,
48
+ tbe_data_config.T,
49
+ # pyre-ignore [6]
50
+ tbe_data_config.batch_params.sigma_B,
51
+ tbe_data_config.batch_params.vbe_num_ranks,
52
+ # pyre-ignore [6]
53
+ tbe_data_config.batch_params.vbe_distribution,
54
+ )
55
+ else:
56
+ return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
57
+
58
+
59
+ def _generate_pooling_info(
60
+ tbe_data_config: TBEDataConfig, iters: int, Bs: list[int]
61
+ ) -> torch.Tensor:
62
+ if tbe_data_config.variable_L():
63
+ # Generate L from stats
64
+ _, L_offsets = generate_pooling_factors_from_stats(
65
+ iters,
66
+ Bs,
67
+ tbe_data_config.pooling_params.L,
68
+ # pyre-ignore [6]
69
+ tbe_data_config.pooling_params.sigma_L,
70
+ # pyre-ignore [6]
71
+ tbe_data_config.pooling_params.length_distribution,
72
+ )
73
+ else:
74
+ Ls = [tbe_data_config.pooling_params.L] * (sum(Bs) * iters)
75
+ L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
76
+
77
+ return L_offsets
78
+
79
+
80
+ def _generate_indices(
81
+ tbe_data_config: TBEDataConfig,
82
+ iters: int,
83
+ Bs: list[int],
84
+ L_offsets: torch.Tensor,
85
+ ) -> torch.Tensor:
86
+
87
+ total_B = sum(Bs)
88
+ L_offsets_list = L_offsets.tolist()
89
+ indices_list = []
90
+ for it in range(iters):
91
+ # L_offsets is defined over the entire set of batches for a single iteration
92
+ start_offset = L_offsets_list[it * total_B]
93
+ end_offset = L_offsets_list[(it + 1) * total_B]
94
+
95
+ logging.info(f"DEBUG_TBE: _generate_indices E = {tbe_data_config.E=}")
96
+
97
+ indices_list.append(
98
+ torch.ops.fbgemm.tbe_generate_indices_from_distribution(
99
+ tbe_data_config.indices_params.heavy_hitters,
100
+ tbe_data_config.indices_params.zipf_q,
101
+ tbe_data_config.indices_params.zipf_s,
102
+ # max_index = dimensions of the embedding table
103
+ int(tbe_data_config.E),
104
+ # num_indices = number of indices to generate
105
+ end_offset - start_offset,
106
+ )
107
+ )
108
+
109
+ return torch.cat(indices_list)
110
+
111
+
112
+ def _build_requests_jagged(
113
+ tbe_data_config: TBEDataConfig,
114
+ iters: int,
115
+ Bs: list[int],
116
+ Bs_feature_rank: Optional[list[list[int]]],
117
+ L_offsets: torch.Tensor,
118
+ all_indices: torch.Tensor,
119
+ ) -> list[TBERequest]:
120
+ total_B = sum(Bs)
121
+ all_indices = all_indices.flatten()
122
+ requests = []
123
+ for it in range(iters):
124
+ start_offset = L_offsets[it * total_B]
125
+ it_L_offsets = torch.concat(
126
+ [
127
+ torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
128
+ L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
129
+ ]
130
+ )
131
+ requests.append(
132
+ TBERequest(
133
+ maybe_to_dtype(
134
+ all_indices[start_offset : L_offsets[(it + 1) * total_B]],
135
+ tbe_data_config.indices_params.index_dtype,
136
+ ),
137
+ maybe_to_dtype(
138
+ it_L_offsets.to(get_device()),
139
+ tbe_data_config.indices_params.offset_dtype,
140
+ ),
141
+ tbe_data_config._new_weights(int(it_L_offsets[-1].item())),
142
+ Bs_feature_rank if tbe_data_config.variable_B() else None,
143
+ )
144
+ )
145
+ return requests
146
+
147
+
148
+ def _build_requests_dense(
149
+ tbe_data_config: TBEDataConfig, iters: int, all_indices: torch.Tensor
150
+ ) -> list[TBERequest]:
151
+ # NOTE: We're using existing code from requests.py to build the
152
+ # requests, and since the existing code requires 2D view of all_indices,
153
+ # the existing all_indices must be reshaped
154
+ all_indices = all_indices.reshape(iters, -1)
155
+
156
+ requests = []
157
+ for it in range(iters):
158
+ indices, offsets = get_table_batched_offsets_from_dense(
159
+ all_indices[it].view(
160
+ tbe_data_config.T,
161
+ tbe_data_config.batch_params.B,
162
+ tbe_data_config.pooling_params.L,
163
+ ),
164
+ use_cpu=tbe_data_config.use_cpu,
165
+ )
166
+ requests.append(
167
+ TBERequest(
168
+ maybe_to_dtype(indices, tbe_data_config.indices_params.index_dtype),
169
+ maybe_to_dtype(offsets, tbe_data_config.indices_params.offset_dtype),
170
+ tbe_data_config._new_weights(
171
+ tbe_data_config.T
172
+ * tbe_data_config.batch_params.B
173
+ * tbe_data_config.pooling_params.L
174
+ ),
175
+ )
176
+ )
177
+ return requests
178
+
179
+
180
+ def generate_requests(
181
+ tbe_data_config: TBEDataConfig,
182
+ iters: int = 1,
183
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
184
+ ) -> list[TBERequest]:
185
+
186
+ # Generate batch sizes
187
+ if batch_size_per_feature_per_rank:
188
+ Bs = tbe_data_config.batch_params.Bs
189
+ else:
190
+ Bs, _ = _generate_batch_sizes(tbe_data_config)
191
+
192
+ logging.info(
193
+ f"DEBUG_TBE: VBE [generate_requests] batch_size_per_feature_per_rank={batch_size_per_feature_per_rank} Bs={Bs}"
194
+ )
195
+
196
+ assert Bs is not None, "Batch sizes (Bs) must be set"
197
+
198
+ # Generate pooling info
199
+ L_offsets = _generate_pooling_info(tbe_data_config, iters, Bs)
200
+
201
+ # Generate indices
202
+ all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
203
+ all_indices = all_indices.to(get_device())
204
+
205
+ # Build TBE requests
206
+ if tbe_data_config.variable_B() or tbe_data_config.variable_L():
207
+ if batch_size_per_feature_per_rank:
208
+ return _build_requests_jagged(
209
+ tbe_data_config,
210
+ iters,
211
+ Bs,
212
+ batch_size_per_feature_per_rank,
213
+ L_offsets,
214
+ all_indices,
215
+ )
216
+ else:
217
+ return _build_requests_jagged(
218
+ tbe_data_config,
219
+ iters,
220
+ Bs,
221
+ batch_size_per_feature_per_rank,
222
+ L_offsets,
223
+ all_indices,
224
+ )
225
+ else:
226
+ return _build_requests_dense(tbe_data_config, iters, all_indices)
227
+
228
+
229
+ def generate_requests_with_Llist(
230
+ tbe_data_config: TBEDataConfig,
231
+ L_list: torch.Tensor,
232
+ iters: int = 1,
233
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
234
+ ) -> list[TBERequest]:
235
+ """
236
+ Generate a list of TBERequest objects based on the provided TBE data configuration and L_list
237
+ This function generates batch sizes and pooling information from the input L_list,
238
+ simulates L distributions with Gaussian noise, and creates indices for embedding lookups.
239
+ It supports both variable batch sizes and sequence lengths, building either jagged or dense requests accordingly.
240
+ Args:
241
+ tbe_data_config (TBEDataConfig): Configuration object containing batch parameters and pooling parameters.
242
+ L_list (torch.Tensor): Tensor of base sequence lengths for each batch.
243
+ iters (int, optional): Number of iterations to repeat the generated requests. Defaults to 1.
244
+ batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Optional batch size specification per feature per rank. Defaults to None.
245
+ Returns:
246
+ List[TBERequest]: A list of TBERequest objects constructed according to the configuration and input parameters.
247
+ Raises:
248
+ AssertionError: If batch sizes (Bs) are not set in the tbe_data_config.
249
+ Example:
250
+ >>> requests = generate_requests_with_Llist(tbe_data_config, L_list=torch.tensor([10, 20]), iters=2)
251
+ >>> len(requests)
252
+ 2
253
+ """
254
+
255
+ # Generate batch sizes
256
+ Bs = tbe_data_config.batch_params.Bs
257
+ assert (
258
+ Bs is not None
259
+ ), "Batch sizes (Bs) must be set for generate_requests_with_Llist"
260
+
261
+ # Generate pooling info from L list
262
+ Ls_list = []
263
+ for i in range(len(Bs)):
264
+ L = L_list[i]
265
+ B = Bs[i]
266
+ Ls_iter = np.random.normal(
267
+ loc=L, scale=tbe_data_config.pooling_params.sigma_L, size=B
268
+ ).astype(int)
269
+ Ls_list.append(Ls_iter)
270
+ Ls = np.concatenate(Ls_list)
271
+ Ls[Ls < 0] = 0
272
+ # Use the same L distribution across iters
273
+ Ls = np.tile(Ls, iters)
274
+ L = Ls.max()
275
+ # Make it exclusive cumsum
276
+ L_offsets = torch.from_numpy(np.insert(Ls.cumsum(), 0, 0)).to(torch.long)
277
+
278
+ # Generate indices
279
+ all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
280
+ all_indices = all_indices.to(get_device())
281
+
282
+ # Build TBE requests
283
+ if tbe_data_config.variable_B() or tbe_data_config.variable_L():
284
+ return _build_requests_jagged(
285
+ tbe_data_config,
286
+ iters,
287
+ Bs,
288
+ batch_size_per_feature_per_rank,
289
+ L_offsets,
290
+ all_indices,
291
+ )
292
+ else:
293
+ return _build_requests_dense(tbe_data_config, iters, all_indices)
294
+
295
+
296
+ def generate_embedding_dims(tbe_data_config: TBEDataConfig) -> tuple[int, list[int]]:
297
+ if tbe_data_config.mixed_dim:
298
+ Ds = [
299
+ round_up(
300
+ int(
301
+ torch.randint(
302
+ low=int(0.5 * tbe_data_config.D),
303
+ high=int(1.5 * tbe_data_config.D),
304
+ size=(1,),
305
+ ).item()
306
+ ),
307
+ 4,
308
+ )
309
+ for _ in range(tbe_data_config.T)
310
+ ]
311
+ return (sum(Ds) // len(Ds), Ds)
312
+ else:
313
+ return (tbe_data_config.D, [tbe_data_config.D] * tbe_data_config.T)
314
+
315
+
316
+ def generate_feature_requires_grad(
317
+ tbe_data_config: TBEDataConfig, size: int
318
+ ) -> torch.Tensor:
319
+ assert (
320
+ size <= tbe_data_config.T
321
+ ), "size of feature_requires_grad must be less than T"
322
+ weighted_requires_grad_tables = torch.randperm(tbe_data_config.T)[:size].tolist()
323
+ return (
324
+ torch.tensor(
325
+ [
326
+ 1 if t in weighted_requires_grad_tables else 0
327
+ for t in range(tbe_data_config.T)
328
+ ]
329
+ )
330
+ .to(get_device())
331
+ .int()
332
+ )