fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of fbgemm-gpu-genai-nightly has been marked as a potentially problematic release.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
fbgemm_gpu/tbe/bench/tbe_data_config.py
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import dataclasses
+ import json
+ import logging
+ from typing import Any, Optional
+
+ import torch
+
+ from fbgemm_gpu.tbe.utils.common import get_device
+
+ from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
+
+ try:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_generator"
+     )
+ except Exception:
+     pass
+
+
+ @dataclasses.dataclass(frozen=True)
+ class TBEDataConfig:
+     # Number of tables
+     T: int
+     # Number of rows in the embedding table
+     E: int
+     # Target embedding dimension for a table (number of columns)
+     D: int
+     # Generate mixed dimensions if true
+     mixed_dim: bool
+     # Whether the lookup rows are weighted or not
+     weighted: bool
+     # Batch parameters
+     batch_params: BatchParams
+     # Indices parameters
+     indices_params: IndicesParams
+     # Pooling parameters
+     pooling_params: PoolingParams
+     # Force generated tensors to be on CPU
+     use_cpu: bool = False
+     # Number of embeddings in each embedding features (number of rows)
+     Es: Optional[list[int]] = None
+     # Target embedding dimension for each features (number of columns)
+     Ds: Optional[list[int]] = None
+     # Maximum number of indices
+     max_indices: Optional[int] = None  # Maximum number of indices
+
+     def __post_init__(self) -> None:
+         if isinstance(self.D, list):
+             object.__setattr__(self, "mixed_dim", len(set(self.D)) > 1)
+         if isinstance(self.E, list) and self.max_indices is None:
+             object.__setattr__(self, "max_indices", sum(self.E) - 1)
+         self.validate()
+
+     @staticmethod
+     def complex_fields() -> dict[str, Any]:
+         return {
+             "batch_params": BatchParams,
+             "indices_params": IndicesParams,
+             "pooling_params": PoolingParams,
+         }
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_dict(cls, data: dict[str, Any]):
+         for field, Type in cls.complex_fields().items():
+             if not isinstance(data[field], Type):
+                 data[field] = Type.from_dict(data[field])
+         return cls(**data)
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_json(cls, data: str):
+         raw = json.loads(data)
+         allowed = {f.name for f in dataclasses.fields(cls)}
+         existing_fields = {k: v for k, v in raw.items() if k in allowed}
+         missing_fields = allowed - set(existing_fields.keys())
+         unknown_fields = set(raw.keys()) - allowed
+         if missing_fields:
+             logging.warning(
+                 f"TBEDataConfig.from_json: Missing expected fields not loaded: {sorted(missing_fields)}"
+             )
+         if unknown_fields:
+             logging.info(
+                 f"TBEDataConfig.from_json: Ignored unknown fields from input: {sorted(unknown_fields)}"
+             )
+         return cls.from_dict(existing_fields)
+
+     def dict(self) -> dict[str, Any]:
+         tmp = dataclasses.asdict(self)
+         for field in TBEDataConfig.complex_fields().keys():
+             tmp[field] = self.__dict__[field].dict()
+         return tmp
+
+     def json(self, format: bool = False) -> str:
+         return json.dumps(self.dict(), indent=(2 if format else -1), sort_keys=True)
+
+     # pyre-ignore [3]
+     def validate(self):
+         # NOTE: Add validation logic here
+         assert self.T > 0, "T must be positive"
+         assert self.E > 0, "E must be positive"
+         if self.Es is not None:
+             assert all(e > 0 for e in self.Es), "All elements in Es must be positive"
+         assert self.D > 0, "D must be positive"
+         if self.Ds is not None:
+             assert all(d > 0 for d in self.Ds), "All elements in Ds must be positive"
+         if isinstance(self.E, list) and isinstance(self.D, list):
+             assert (
+                 len(self.E) == len(self.D) == self.T
+             ), "Lengths of Es, Lengths of Ds, and T must be equal"
+         if self.max_indices is not None:
+             assert self.max_indices == (
+                 sum(self.Es) - 1
+             ), "max_indices must be equal to sum(Es) - 1"
+         self.batch_params.validate()
+         self.indices_params.validate()
+         self.pooling_params.validate()
+         return self
+
+     def variable_B(self) -> bool:
+         return self.batch_params.sigma_B is not None
+
+     def variable_L(self) -> bool:
+         return self.pooling_params.sigma_L is not None
+
+     def _new_weights(self, size: int) -> Optional[torch.Tensor]:
+         # Per-sample weights will always be FP32
+         return None if not self.weighted else torch.randn(size, device=get_device())
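
To make the serialization contract in tbe_data_config.py concrete, here is a minimal, self-contained sketch of the same pattern: nested parameter objects are registered as "complex fields" and rebuilt from plain dicts on load, so a config survives a JSON round trip. The Params and Config classes below are simplified stand-ins invented for illustration; the real BatchParams, IndicesParams, and PoolingParams live in tbe_data_config_param_models.py (file 92 above) and are not shown in this diff.

# Minimal sketch (not part of the package): the nested-dataclass round-trip
# pattern used by TBEDataConfig above. "Params" stands in for the real
# BatchParams/IndicesParams/PoolingParams, whose fields are defined elsewhere.
import dataclasses
import json
from typing import Any


@dataclasses.dataclass(frozen=True)
class Params:
    B: int
    sigma_B: Any = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Params":
        return cls(**data)

    def dict(self) -> dict[str, Any]:
        return dataclasses.asdict(self)


@dataclasses.dataclass(frozen=True)
class Config:
    T: int
    batch_params: Params

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Config":
        # Rebuild the nested dataclass from a plain dict, as TBEDataConfig.from_dict does
        if not isinstance(data["batch_params"], Params):
            data["batch_params"] = Params.from_dict(data["batch_params"])
        return cls(**data)

    def json(self) -> str:
        return json.dumps(dataclasses.asdict(self), sort_keys=True)


cfg = Config(T=4, batch_params=Params(B=512))
assert Config.from_dict(json.loads(cfg.json())) == cfg  # lossless round trip

In the real class, from_json additionally filters unknown keys and warns on missing ones before delegating to from_dict.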
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py
@@ -0,0 +1,323 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ from typing import Optional
+
+ import numpy as np
+ import torch
+
+ from fbgemm_gpu.tbe.bench.tbe_data_config import TBEDataConfig
+ from fbgemm_gpu.tbe.utils.common import get_device, round_up
+
+ from fbgemm_gpu.tbe.utils.requests import (
+     generate_batch_sizes_from_stats,
+     generate_pooling_factors_from_stats,
+     get_table_batched_offsets_from_dense,
+     maybe_to_dtype,
+     TBERequest,
+ )
+
+ try:
+     # pyre-ignore[21]
+     from fbgemm_gpu import open_source  # noqa: F401
+ except Exception:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_generator"
+     )
+
+
+ def _generate_batch_sizes(
+     tbe_data_config: TBEDataConfig,
+ ) -> tuple[list[int], Optional[list[list[int]]]]:
+     if tbe_data_config.variable_B():
+         assert (
+             tbe_data_config.batch_params.vbe_num_ranks is not None
+         ), "vbe_num_ranks must be set for variable batch size generation"
+         return generate_batch_sizes_from_stats(
+             tbe_data_config.batch_params.B,
+             tbe_data_config.T,
+             # pyre-ignore [6]
+             tbe_data_config.batch_params.sigma_B,
+             tbe_data_config.batch_params.vbe_num_ranks,
+             # pyre-ignore [6]
+             tbe_data_config.batch_params.vbe_distribution,
+         )
+
+     else:
+         return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
+
+
+ def _generate_pooling_info(
+     tbe_data_config: TBEDataConfig, iters: int, Bs: list[int]
+ ) -> torch.Tensor:
+     if tbe_data_config.variable_L():
+         # Generate L from stats
+         _, L_offsets = generate_pooling_factors_from_stats(
+             iters,
+             Bs,
+             tbe_data_config.pooling_params.L,
+             # pyre-ignore [6]
+             tbe_data_config.pooling_params.sigma_L,
+             # pyre-ignore [6]
+             tbe_data_config.pooling_params.length_distribution,
+         )
+     else:
+         Ls = [tbe_data_config.pooling_params.L] * (sum(Bs) * iters)
+         L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
+
+     return L_offsets
+
+
+ def _generate_indices(
+     tbe_data_config: TBEDataConfig,
+     iters: int,
+     Bs: list[int],
+     L_offsets: torch.Tensor,
+ ) -> torch.Tensor:
+
+     total_B = sum(Bs)
+     L_offsets_list = L_offsets.tolist()
+     indices_list = []
+     for it in range(iters):
+         # L_offsets is defined over the entire set of batches for a single iteration
+         start_offset = L_offsets_list[it * total_B]
+         end_offset = L_offsets_list[(it + 1) * total_B]
+
+         indices_list.append(
+             torch.ops.fbgemm.tbe_generate_indices_from_distribution(
+                 tbe_data_config.indices_params.heavy_hitters,
+                 tbe_data_config.indices_params.zipf_q,
+                 tbe_data_config.indices_params.zipf_s,
+                 # max_index = dimensions of the embedding table
+                 tbe_data_config.E,
+                 # num_indices = number of indices to generate
+                 end_offset - start_offset,
+             )
+         )
+
+     return torch.cat(indices_list)
+
+
+ def _build_requests_jagged(
+     tbe_data_config: TBEDataConfig,
+     iters: int,
+     Bs: list[int],
+     Bs_feature_rank: Optional[list[list[int]]],
+     L_offsets: torch.Tensor,
+     all_indices: torch.Tensor,
+ ) -> list[TBERequest]:
+     total_B = sum(Bs)
+     all_indices = all_indices.flatten()
+     requests = []
+     for it in range(iters):
+         start_offset = L_offsets[it * total_B]
+         it_L_offsets = torch.concat(
+             [
+                 torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
+                 L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
+             ]
+         )
+         requests.append(
+             TBERequest(
+                 maybe_to_dtype(
+                     all_indices[start_offset : L_offsets[(it + 1) * total_B]],
+                     tbe_data_config.indices_params.index_dtype,
+                 ),
+                 maybe_to_dtype(
+                     it_L_offsets.to(get_device()),
+                     tbe_data_config.indices_params.offset_dtype,
+                 ),
+                 tbe_data_config._new_weights(int(it_L_offsets[-1].item())),
+                 Bs_feature_rank if tbe_data_config.variable_B() else None,
+             )
+         )
+     return requests
+
+
+ def _build_requests_dense(
+     tbe_data_config: TBEDataConfig, iters: int, all_indices: torch.Tensor
+ ) -> list[TBERequest]:
+     # NOTE: We're using existing code from requests.py to build the
+     # requests, and since the existing code requires 2D view of all_indices,
+     # the existing all_indices must be reshaped
+     all_indices = all_indices.reshape(iters, -1)
+
+     requests = []
+     for it in range(iters):
+         indices, offsets = get_table_batched_offsets_from_dense(
+             all_indices[it].view(
+                 tbe_data_config.T,
+                 tbe_data_config.batch_params.B,
+                 tbe_data_config.pooling_params.L,
+             ),
+             use_cpu=tbe_data_config.use_cpu,
+         )
+         requests.append(
+             TBERequest(
+                 maybe_to_dtype(indices, tbe_data_config.indices_params.index_dtype),
+                 maybe_to_dtype(offsets, tbe_data_config.indices_params.offset_dtype),
+                 tbe_data_config._new_weights(
+                     tbe_data_config.T
+                     * tbe_data_config.batch_params.B
+                     * tbe_data_config.pooling_params.L
+                 ),
+             )
+         )
+     return requests
+
+
+ def generate_requests(
+     tbe_data_config: TBEDataConfig,
+     iters: int = 1,
+     batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+ ) -> list[TBERequest]:
+
+     # Generate batch sizes
+     if batch_size_per_feature_per_rank:
+         Bs = tbe_data_config.batch_params.Bs
+     else:
+         Bs, _ = _generate_batch_sizes(tbe_data_config)
+
+     assert Bs is not None, "Batch sizes (Bs) must be set"
+
+     # Generate pooling info
+     L_offsets = _generate_pooling_info(tbe_data_config, iters, Bs)
+
+     # Generate indices
+     all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
+     all_indices = all_indices.to(get_device())
+
+     # Build TBE requests
+     if tbe_data_config.variable_B() or tbe_data_config.variable_L():
+         if batch_size_per_feature_per_rank:
+             return _build_requests_jagged(
+                 tbe_data_config,
+                 iters,
+                 Bs,
+                 batch_size_per_feature_per_rank,
+                 L_offsets,
+                 all_indices,
+             )
+         else:
+             return _build_requests_jagged(
+                 tbe_data_config,
+                 iters,
+                 Bs,
+                 batch_size_per_feature_per_rank,
+                 L_offsets,
+                 all_indices,
+             )
+     else:
+         return _build_requests_dense(tbe_data_config, iters, all_indices)
+
+
+ def generate_requests_with_Llist(
+     tbe_data_config: TBEDataConfig,
+     L_list: torch.Tensor,
+     iters: int = 1,
+     batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+ ) -> list[TBERequest]:
+     """
+     Generate a list of TBERequest objects based on the provided TBE data configuration and L_list
+     This function generates batch sizes and pooling information from the input L_list,
+     simulates L distributions with Gaussian noise, and creates indices for embedding lookups.
+     It supports both variable batch sizes and sequence lengths, building either jagged or dense requests accordingly.
+     Args:
+         tbe_data_config (TBEDataConfig): Configuration object containing batch parameters and pooling parameters.
+         L_list (torch.Tensor): Tensor of base sequence lengths for each batch.
+         iters (int, optional): Number of iterations to repeat the generated requests. Defaults to 1.
+         batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Optional batch size specification per feature per rank. Defaults to None.
+     Returns:
+         List[TBERequest]: A list of TBERequest objects constructed according to the configuration and input parameters.
+     Raises:
+         AssertionError: If batch sizes (Bs) are not set in the tbe_data_config.
+     Example:
+         >>> requests = generate_requests_with_Llist(tbe_data_config, L_list=torch.tensor([10, 20]), iters=2)
+         >>> len(requests)
+         2
+     """
+
+     # Generate batch sizes
+     Bs = tbe_data_config.batch_params.Bs
+     assert (
+         Bs is not None
+     ), "Batch sizes (Bs) must be set for generate_requests_with_Llist"
+
+     # Generate pooling info from L list
+     Ls_list = []
+     for i in range(len(Bs)):
+         L = L_list[i]
+         B = Bs[i]
+         Ls_iter = np.random.normal(
+             loc=L, scale=tbe_data_config.pooling_params.sigma_L, size=B
+         ).astype(int)
+         Ls_list.append(Ls_iter)
+     Ls = np.concatenate(Ls_list)
+     Ls[Ls < 0] = 0
+     # Use the same L distribution across iters
+     Ls = np.tile(Ls, iters)
+     L = Ls.max()
+     # Make it exclusive cumsum
+     L_offsets = torch.from_numpy(np.insert(Ls.cumsum(), 0, 0)).to(torch.long)
+
+     # Generate indices
+     all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
+     all_indices = all_indices.to(get_device())
+
+     # Build TBE requests
+     if tbe_data_config.variable_B() or tbe_data_config.variable_L():
+         return _build_requests_jagged(
+             tbe_data_config,
+             iters,
+             Bs,
+             batch_size_per_feature_per_rank,
+             L_offsets,
+             all_indices,
+         )
+     else:
+         return _build_requests_dense(tbe_data_config, iters, all_indices)
+
+
+ def generate_embedding_dims(tbe_data_config: TBEDataConfig) -> tuple[int, list[int]]:
+     if tbe_data_config.mixed_dim:
+         Ds = [
+             round_up(
+                 int(
+                     torch.randint(
+                         low=int(0.5 * tbe_data_config.D),
+                         high=int(1.5 * tbe_data_config.D),
+                         size=(1,),
+                     ).item()
+                 ),
+                 4,
+             )
+             for _ in range(tbe_data_config.T)
+         ]
+         return (sum(Ds) // len(Ds), Ds)
+     else:
+         return (tbe_data_config.D, [tbe_data_config.D] * tbe_data_config.T)
+
+
+ def generate_feature_requires_grad(
+     tbe_data_config: TBEDataConfig, size: int
+ ) -> torch.Tensor:
+     assert (
+         size <= tbe_data_config.T
+     ), "size of feature_requires_grad must be less than T"
+     weighted_requires_grad_tables = torch.randperm(tbe_data_config.T)[:size].tolist()
+     return (
+         torch.tensor(
+             [
+                 1 if t in weighted_requires_grad_tables else 0
+                 for t in range(tbe_data_config.T)
+             ]
+         )
+         .to(get_device())
+         .int()
+     )
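
All of the request builders above funnel per-bag pooling factors into a single exclusive-cumsum offsets tensor before any indices are generated. Below is a standalone sketch of that layout, mirroring the fixed-L branch of _generate_pooling_info and the numpy construction in generate_requests_with_Llist; the pooling factors here are made-up example values, not package data.

import numpy as np
import torch

# Made-up example: two tables (T=2) with batch size 3 each -> 6 bags per iteration
Ls = [2, 0, 3, 1, 4, 2]  # per-bag pooling factors

# Fixed-L branch style: prepend 0, then an inclusive cumsum yields exclusive offsets
L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
print(L_offsets)  # tensor([ 0,  2,  2,  5,  6, 10, 12])

# generate_requests_with_Llist style: numpy cumsum with a leading 0 gives the same layout
L_offsets_np = torch.from_numpy(np.insert(np.asarray(Ls).cumsum(), 0, 0)).to(torch.long)
assert torch.equal(L_offsets, L_offsets_np)

# Bag i owns indices[L_offsets[i] : L_offsets[i + 1]]; the last entry is the total
# number of indices the generator has to produce for this iteration.
total_indices = int(L_offsets[-1].item())  # 12

_generate_indices then asks the loaded indices_generator op for exactly that many indices per iteration (end_offset - start_offset), and the jagged and dense builders slice the result back into per-request tensors using the same offsets.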