PyPI - fbgemm-gpu-nightly-cpu - Versions diffs - 2026.1.22__cp312-cp312-manylinux_2_28_x86_64.whl → 2026.1.29__cp312-cp312-manylinux_2_28_x86_64.whl - Mend

fbgemm-gpu-nightly-cpu 2026.1.22__cp312-cp312-manylinux_2_28_x86_64.whl → 2026.1.29__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

fbgemm_gpu/docs/target.default.json.py CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-    "version": "2026.1.22",
+    "version": "2026.1.29",
     "target": "default",
     "variant": "cpu"
 }

fbgemm_gpu/fbgemm.so CHANGED Viewed

Binary file

fbgemm_gpu/fbgemm_gpu_config.so CHANGED Viewed

Binary file

fbgemm_gpu/fbgemm_gpu_py.so CHANGED Viewed

Binary file

fbgemm_gpu/fbgemm_gpu_tbe_cache.so CHANGED Viewed

Binary file

fbgemm_gpu/fbgemm_gpu_tbe_common.so CHANGED Viewed

Binary file

fbgemm_gpu/fbgemm_gpu_tbe_inference.so CHANGED Viewed

Binary file

fbgemm_gpu/split_table_batched_embeddings_ops_training.py CHANGED Viewed

@@ -970,7 +970,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         table_has_feature = [False] * T_
         for t in self.feature_table_map:
             table_has_feature[t] = True
-        assert all(table_has_feature), "Each table must have at least one feature!"
+        assert all(table_has_feature), (
+            "Each table must have at least one feature!"
+            + f"{[(i, x) for i, x in enumerate(table_has_feature)]}"
+        )
         feature_dims = [dims[t] for t in self.feature_table_map]
         D_offsets = [0] + list(accumulate(feature_dims))
@@ -1786,6 +1789,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         cache: int,
         total_static_sparse: int,
         ephemeral: int,
+        cache_weights: int = 0,
+        cache_aux: int = 0,
     ) -> None:
         """Report HBM memory breakdown to stats reporter."""
         stats_reporter.report_data_amount(
@@ -1809,6 +1814,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             embedding_id=self.logging_table_name,
             tbe_id=self.uuid,
         )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache_weights",
+            data_bytes=cache_weights,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache_aux",
+            data_bytes=cache_aux,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.hbm.total_static_sparse",
@@ -1832,6 +1851,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         cache: int,
         total_static_sparse: int,
         ephemeral: int,
+        cache_weights: int = 0,
+        cache_aux: int = 0,
     ) -> None:
         """Report UVM memory breakdown to stats reporter."""
         stats_reporter.report_data_amount(
@@ -1855,6 +1876,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             embedding_id=self.logging_table_name,
             tbe_id=self.uuid,
         )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache_weights",
+            data_bytes=cache_weights,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache_aux",
+            data_bytes=cache_aux,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.uvm.total_static_sparse",
@@ -1931,34 +1966,50 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             "momentum2_host",
             "momentum2_uvm",
         ]
-        cache_tensors = [
+        # Cache weights tensor (the actual cached embeddings in HBM)
+        cache_weight_tensors = [
             "lxu_cache_weights",
-            "lxu_cache_state",
-            "lxu_state",
-            "cache_hash_size_cumsum",
-            "cache_index_table_map",
-            "cache_miss_counter",
-            "lxu_cache_locking_counter",
+        ]
+        # Cache auxiliary state tensors (metadata for cache management, excluding weights)
+        # Sizes scale with hash_size or cache_slots (hash_size × clf)
+        # Excludes constant-size tensors: cache_hash_size_cumsum, cache_miss_counter, etc.
+        cache_aux_tensors = [
+            "cache_index_table_map",  # int32, 4B × hash_size
+            "lxu_cache_state",  # int64, 8B × cache_slots
+            "lxu_state",  # int64, 8B × cache_slots (LRU) or hash_size (LFU)
+            "lxu_cache_locking_counter",  # int32, 4B × cache_slots (only if prefetch_pipeline)
         ]
         # Calculate total memory for each component
         weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
         optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
-        cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
+        cache_weights_total = sum(
+            self._get_tensor_memory(t) for t in cache_weight_tensors
+        )
+        cache_aux_total = sum(self._get_tensor_memory(t) for t in cache_aux_tensors)
         # Categorize memory by location (HBM vs UVM)
         if self.use_cpu:
             weights_hbm, weights_uvm = 0, weights_total
             opt_hbm, opt_uvm = 0, optimizer_total
-            cache_hbm, cache_uvm = 0, cache_total
+            cache_weights_hbm, cache_weights_uvm = 0, cache_weights_total
+            cache_aux_hbm, cache_aux_uvm = 0, cache_aux_total
         else:
             weights_hbm, weights_uvm = self._categorize_memory_by_location(
                 weight_tensors
             )
             opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
-            cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
+            cache_weights_hbm, cache_weights_uvm = self._categorize_memory_by_location(
+                cache_weight_tensors
+            )
+            cache_aux_hbm, cache_aux_uvm = self._categorize_memory_by_location(
+                cache_aux_tensors
+            )
         # Calculate ephemeral memory split between HBM and UVM
+        # Total cache = cache weights + cache auxiliary state
+        cache_hbm = cache_weights_hbm + cache_aux_hbm
+        cache_uvm = cache_weights_uvm + cache_aux_uvm
         static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
         static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
         ephemeral_hbm = total_hbm_usage - static_sparse_hbm
@@ -1972,6 +2023,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             cache_hbm,
             static_sparse_hbm,
             ephemeral_hbm,
+            cache_weights_hbm,
+            cache_aux_hbm,
         )
         self._report_uvm_breakdown(
             stats_reporter,
@@ -1980,6 +2033,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             cache_uvm,
             static_sparse_uvm,
             ephemeral_uvm,
+            cache_weights_uvm,
+            cache_aux_uvm,
         )
     @torch.jit.ignore
@@ -2232,6 +2287,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 op_id=self.uuid,
                 per_sample_weights=per_sample_weights,
                 batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                embedding_specs=[(s[0], s[1]) for s in self.embedding_specs],
+                feature_table_map=self.feature_table_map,
             )
         if not is_torchdynamo_compiling():

fbgemm_gpu/tbe/bench/__init__.py CHANGED Viewed

@@ -41,7 +41,11 @@ from .tbe_data_config_param_models import (  # noqa F401
     IndicesParams,
     PoolingParams,
 )
-from .utils import fill_random_scale_bias  # noqa F401
+from .utils import (  # noqa F401
+    check_oom,
+    fill_random_scale_bias,
+    generate_merged_output_and_offsets,
+)
 try:
     torch.ops.load_library(

fbgemm_gpu/tbe/bench/tbe_data_config.py CHANGED Viewed

@@ -10,7 +10,7 @@
 import dataclasses
 import json
 import logging
-from typing import Any, Optional
+from typing import Any, List, Optional, Tuple
 import torch
@@ -32,30 +32,77 @@ except Exception:
 @dataclasses.dataclass(frozen=True)
 class TBEDataConfig:
-    # Number of tables
     T: int
-    # Number of rows in the embedding table
     E: int
-    # Target embedding dimension for a table (number of columns)
     D: int
-    # Generate mixed dimensions if true
     mixed_dim: bool
-    # Whether the lookup rows are weighted or not
     weighted: bool
-    # Batch parameters
     batch_params: BatchParams
-    # Indices parameters
     indices_params: IndicesParams
-    # Pooling parameters
     pooling_params: PoolingParams
-    # Force generated tensors to be on CPU
     use_cpu: bool = False
-    # Number of embeddings in each embedding features (number of rows)
     Es: Optional[list[int]] = None
-    # Target embedding dimension for each features (number of columns)
     Ds: Optional[list[int]] = None
-    # Maximum number of indices
-    max_indices: Optional[int] = None  # Maximum number of indices
+    max_indices: Optional[int] = None
+    embedding_specs: Optional[List[Tuple[int, int]]] = None
+    feature_table_map: Optional[List[int]] = None
+    """
+    Configuration for TBE (Table Batched Embedding) benchmark data collection and generation.
+    This dataclass holds parameters required to generate synthetic data for
+    TBE benchmarking, including table specifications, batch parameters, indices
+    distribution parameters, and pooling parameters.
+    Args:
+        T (int): Number of embedding tables (features). Must be positive.
+        E (int): Number of rows in the embedding table (feature). If T > 1, this
+            represents the averaged number of rows across all features.
+        D (int): Target embedding dimension for a table (feature), i.e., number of
+            columns. If T > 1, this represents the averaged dimension across
+            all features.
+        mixed_dim (bool): If True, generate embeddings with mixed dimensions
+            across tables (features). This is automatically set to True if D is provided
+            as a list with non-uniform values.
+        weighted (bool): If True, the lookup rows are weighted (per-sample
+            weights). The weights will be generated as FP32 tensors.
+        batch_params (BatchParams): Parameters controlling batch generation.
+            Contains:
+            (1) `B` = target batch size (number of batch lookups per features)
+            (2) `sigma_B` = optional standard deviation for variable batch size
+            (3) `vbe_distribution` = distribution type ("normal" or "uniform")
+            (4) `vbe_num_ranks` = number of ranks for variable batch size
+            (5) `Bs` = per-feature batch sizes
+        indices_params (IndicesParams): Parameters controlling index generation
+            following a Zipf distribution. Contains:
+            (1) `heavy_hitters` = probability density map for hot indices
+            (2) `zipf_q` = q parameter in Zipf distribution (x+q)^{-s}
+            (3) `zipf_s` = s parameter (alpha) in Zipf distribution
+            (4) `index_dtype` = optional dtype for indices tensor
+            (5) `offset_dtype` = optional dtype for offsets tensor
+        pooling_params (PoolingParams): Parameters controlling pooling behavior.
+            Contains:
+            (1) `L` = target bag size (pooling factor, indices per lookup)
+            (2) `sigma_L` = optional standard deviation for variable bag size
+            (3) `length_distribution` = distribution type ("normal" or "uniform")
+            (4) `Ls` = per-feature bag sizes
+        use_cpu (bool = False): If True, force generated tensors to be placed
+            on CPU instead of the default compute device.
+        Es (Optional[List[int]] = None): Number of embeddings (rows) for each
+            individual embedding feature. If provided, must have length equal
+            to T. All elements must be positive.
+        Ds (Optional[List[int]] = None): Target embedding dimension (columns)
+            for each individual feature. If provided, must have length equal
+            to T. All elements must be positive.
+        max_indices (Optional[int] = None): Maximum number of indices for
+            bounds checking. If Es is provided as a list and max_indices is
+            None, it is automatically computed as sum(Es) - 1.
+        embedding_specs (Optional[List[Tuple[int, int]]] = None): A list of
+            embedding specs consisting of a list of tuples of (num_rows, embedding_dim).
+            See https://fburl.com/tbe_embedding_specs for details.
+        feature_table_map (Optional[List[int]] = None): An optional list that
+            specifies feature-table mapping. feature_table_map[i] indicates the
+            physical embedding table that feature i maps to.
+    """
     def __post_init__(self) -> None:
         if isinstance(self.D, list):
@@ -117,17 +164,25 @@ class TBEDataConfig:
         assert self.D > 0, "D must be positive"
         if self.Ds is not None:
             assert all(d > 0 for d in self.Ds), "All elements in Ds must be positive"
-        if isinstance(self.E, list) and isinstance(self.D, list):
+        if isinstance(self.Es, list) and isinstance(self.Ds, list):
             assert (
-                len(self.E) == len(self.D) == self.T
+                len(self.Es) == len(self.Ds) == self.T
             ), "Lengths of Es, Lengths of Ds, and T must be equal"
             if self.max_indices is not None:
                 assert self.max_indices == (
                     sum(self.Es) - 1
                 ), "max_indices must be equal to sum(Es) - 1"
         self.batch_params.validate()
+        if self.batch_params.Bs is not None:
+            assert (
+                len(self.batch_params.Bs) == self.T
+            ), f"Length of Bs must be equal to T. Expected: {self.T}, but got: {len(self.batch_params.Bs)}"
         self.indices_params.validate()
         self.pooling_params.validate()
+        if self.pooling_params.Ls is not None:
+            assert (
+                len(self.pooling_params.Ls) == self.T
+            ), f"Length of Ls must be equal to T. Expected: {self.T}, but got: {len(self.pooling_params.Ls)}"
         return self
     def variable_B(self) -> bool:

fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py CHANGED Viewed

@@ -7,6 +7,7 @@
 # pyre-strict
+import logging
 from typing import Optional
 import numpy as np
@@ -35,6 +36,9 @@ except Exception:
 def _generate_batch_sizes(
     tbe_data_config: TBEDataConfig,
 ) -> tuple[list[int], Optional[list[list[int]]]]:
+    logging.info(
+        f"DEBUG_TBE: [_generate_batch_sizes] VBE tbe_data_config.variable_B()={tbe_data_config.variable_B()}"
+    )
     if tbe_data_config.variable_B():
         assert (
             tbe_data_config.batch_params.vbe_num_ranks is not None
@@ -48,7 +52,6 @@ def _generate_batch_sizes(
             # pyre-ignore [6]
             tbe_data_config.batch_params.vbe_distribution,
         )
     else:
         return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
@@ -89,13 +92,15 @@ def _generate_indices(
         start_offset = L_offsets_list[it * total_B]
         end_offset = L_offsets_list[(it + 1) * total_B]
+        logging.info(f"DEBUG_TBE: _generate_indices E = {tbe_data_config.E=}")
         indices_list.append(
             torch.ops.fbgemm.tbe_generate_indices_from_distribution(
                 tbe_data_config.indices_params.heavy_hitters,
                 tbe_data_config.indices_params.zipf_q,
                 tbe_data_config.indices_params.zipf_s,
                 # max_index = dimensions of the embedding table
-                tbe_data_config.E,
+                int(tbe_data_config.E),
                 # num_indices = number of indices to generate
                 end_offset - start_offset,
             )
@@ -184,6 +189,10 @@ def generate_requests(
     else:
         Bs, _ = _generate_batch_sizes(tbe_data_config)
+    logging.info(
+        f"DEBUG_TBE: VBE [generate_requests] batch_size_per_feature_per_rank={batch_size_per_feature_per_rank} Bs={Bs}"
+    )
     assert Bs is not None, "Batch sizes (Bs) must be set"
     # Generate pooling info

fbgemm_gpu/tbe/bench/tbe_data_config_loader.py CHANGED Viewed

@@ -8,6 +8,8 @@
 # pyre-strict
 import dataclasses
+import logging
+import re
 from enum import Enum
 import click
@@ -45,12 +47,16 @@ class TBEDataConfigHelperText(Enum):
     TBE_INDICES_HITTERS = "Heavy hitters for indices (comma-delimited list of floats)"
     TBE_INDICES_ZIPF = "Zipf distribution parameters for indices generation (q, s)"
     TBE_INDICES_DTYPE = "The dtype of the table indices (choices: '32', '64')"
-    TBE_OFFSETS_DTYPE = "The dtype of the table indices (choices: '32', '64')"
+    TBE_OFFSETS_DTYPE = "The dtype of the table offsets (choices: '32', '64')"
     # Pooling Parameters
     TBE_POOLING_SIZE = "Bag size / pooling factor (L)"
-    TBE_POOLING_VL_SIGMA = "Standard deviation of B for VBE"
-    TBE_POOLING_VL_DIST = "VBE distribution (choices: 'uniform', 'normal')"
+    TBE_POOLING_VL_SIGMA = "Standard deviation of L for variable bag size"
+    TBE_POOLING_VL_DIST = (
+        "Variable bag size distribution (choices: 'uniform', 'normal')"
+    )
+    TBE_EMBEDDING_SPECS = "Embedding Specs which is List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]"
+    TBE_FEATURE_TABLE_MAP = "Mapping of feature-table"
 class TBEDataConfigLoader:
@@ -193,6 +199,18 @@ class TBEDataConfigLoader:
                 required=False,
                 help=TBEDataConfigHelperText.TBE_POOLING_VL_DIST.value,
             ),
+            click.option(
+                "--tbe-embedding-specs",
+                type=str,
+                required=False,
+                help=TBEDataConfigHelperText.TBE_EMBEDDING_SPECS.value,
+            ),
+            click.option(
+                "--tbe-feature-table-map",
+                type=str,
+                required=False,
+                help=TBEDataConfigHelperText.TBE_FEATURE_TABLE_MAP.value,
+            ),
         ]
         for option in reversed(options):
@@ -213,15 +231,21 @@ class TBEDataConfigLoader:
         params = context.params
         # Read table parameters
-        T = params["tbe_num_tables"]
-        E = params["tbe_num_embeddings"]
+        T = params["tbe_num_tables"]  # number of features
+        E = params["tbe_num_embeddings"]  # feature_rows
         if params["tbe_num_embeddings_list"] is not None:
             Es = [int(x) for x in params["tbe_num_embeddings_list"].split(",")]
+            T = len(Es)
+            E = sum(Es) // T  # average E
         else:
             Es = None
         D = params["tbe_embedding_dim"]
         if params["tbe_embedding_dim_list"] is not None:
             Ds = [int(x) for x in params["tbe_embedding_dim_list"].split(",")]
+            assert (
+                len(Ds) == T
+            ), f"Expected tbe_embedding_dim_list to have {T} elements, but got {len(Ds)}"
+            D = sum(Ds) // T  # average D
         else:
             Ds = None
@@ -239,10 +263,31 @@ class TBEDataConfigLoader:
         vbe_num_ranks = params["tbe_batch_vbe_ranks"]
         if params["tbe_batch_sizes_list"] is not None:
             Bs = [int(x) for x in params["tbe_batch_sizes_list"].split(",")]
+            B = sum(Bs) // T  # average B
         else:
+            B = params["tbe_batch_size"]
             Bs = None
         batch_params = BatchParams(B, sigma_B, vbe_distribution, vbe_num_ranks, Bs)
+        # Parse embedding_specs: "(E,D),(E,D),..." or "(E,D,loc,dev),(E,D,loc,dev),..."
+        # Only the first two values (E, D) are extracted.
+        embedding_specs = None
+        feature_table_map = None
+        if params["tbe_embedding_specs"] is not None:
+            try:
+                tuples = re.findall(r"\(([^)]+)\)", params["tbe_embedding_specs"])
+                if tuples:
+                    embedding_specs = [
+                        (int(t.split(",")[0].strip()), int(t.split(",")[1].strip()))
+                        for t in tuples
+                    ]
+            except (ValueError, IndexError):
+                logging.warning("Failed to parse embedding_specs. Setting to None.")
+        if params["tbe_feature_table_map"] is not None:
+            feature_table_map = [
+                int(x) for x in params["tbe_feature_table_map"].split(",")
+            ]
         # Read indices parameters
         heavy_hitters = (
             torch.tensor([float(x) for x in params["tbe_indices_hitters"].split(",")])
@@ -279,6 +324,8 @@ class TBEDataConfigLoader:
             Es,
             Ds,
             max_indices,
+            embedding_specs,
+            feature_table_map,
         ).validate()
     @classmethod

fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py CHANGED Viewed

@@ -98,7 +98,7 @@ class BatchParams:
     vbe_distribution: Optional[str] = "normal"
     # Number of ranks for variable batch size generation
     vbe_num_ranks: Optional[int] = None
-    # List of target batch sizes, i.e. number of batch lookups per table
+    # List of target batch sizes, i.e. number of batch lookups per feature
     Bs: Optional[list[int]] = None
     @classmethod
@@ -142,6 +142,8 @@ class PoolingParams:
     sigma_L: Optional[int] = None
     # [Optional] Distribution of embedding sequence lengths (normal, uniform)
     length_distribution: Optional[str] = "normal"
+    # [Optional] List of target bag sizes, i.e. pooling factors per batch
+    Ls: Optional[list[float]] = None
     @classmethod
     # pyre-ignore [3]

fbgemm_gpu/tbe/bench/utils.py CHANGED Viewed

@@ -6,7 +6,7 @@
 # pyre-strict
-import logging
+from typing import List, Tuple
 import numpy as np
 import torch
@@ -14,8 +14,6 @@ import torch
 # fmt:skip
 from fbgemm_gpu.split_embedding_configs import SparseType
-logging.basicConfig(level=logging.DEBUG)
 def fill_random_scale_bias(
     emb: torch.nn.Module,
@@ -47,3 +45,128 @@ def fill_random_scale_bias(
                     device=scale_shift.device,
                 )
             )
+def check_oom(
+    data_size: int,
+) -> Tuple[bool, str]:
+    free_memory, total_memory = torch.cuda.mem_get_info()
+    if data_size > free_memory:
+        warning = f"Expect to allocate {round(data_size / (1024 ** 3), 2)} GB, but available memory is {round(free_memory / (1024 ** 3), 2)} GB from {round(total_memory / (1024 ** 3), 2)} GB."
+        return (True, warning)
+    return (False, "")
+def generate_batch_size_per_feature_per_rank(
+    Bs: List[int], num_ranks: int
+) -> List[List[int]]:
+    """
+    Generate batch size per feature per rank for VBE, assuming the batch size
+    is evenly distributed across ranks.
+    Args:
+        Bs (List[int]): batch size per feature
+        num_ranks (int): number of ranks
+    Returns:
+        List[List[int]]: batch size per feature per rank
+    """
+    b_per_feature_per_rank = []
+    for B in Bs:
+        b_per_feature = []
+        for i in range(num_ranks):
+            if i != num_ranks - 1:
+                b_per_feature.append(int(B / num_ranks))
+            else:
+                b_per_feature.append(B - sum(b_per_feature))
+        b_per_feature_per_rank.append(b_per_feature)
+    return b_per_feature_per_rank
+def generate_merged_output_and_offsets(
+    Ds: List[int],
+    Bs: List[int],
+    output_dtype: torch.dtype,
+    device: torch.device,
+    num_ranks: int = 2,
+    num_tbe_ops: int = 2,
+) -> Tuple[List[List[int]], torch.Tensor, torch.Tensor]:
+    """
+    Generate merged vbe_output and vbe_output_offsets tensors for VBE.
+    The vbe_output is a tensor that will contain forward output from all VBE TBE ops.
+    The vbe_output_offsets is a tensor that will contain start offsets for the output to be written to.
+    Args:
+        Ds (List[int]): embedding dimension per feature
+        Bs (List[int]): batch size per feature
+        num_ranks (int): number of ranks
+        num_tbe_ops (int): number of TBE ops
+    Returns:
+        Tuple[List[List[int]], torch.Tensor, torch.Tensor]: batch_size_per_feature_per_rank, merged vbe_output and vbe_output_offsets tensors
+    """
+    # The first embedding ops is the embedding op created in the benchmark
+    emb_op = {}
+    emb_op[0] = {}
+    emb_op[0]["dim"] = Ds
+    emb_op[0]["Bs"] = Bs
+    emb_op[0]["output_size"] = sum([b * d for b, d in zip(Bs, Ds)])
+    emb_op[0]["batch_size_per_feature_per_rank"] = (
+        generate_batch_size_per_feature_per_rank(Bs, num_ranks)
+    )
+    num_features = len(Bs)
+    # create other embedding ops to allocate output and offsets tensors
+    # Using representative values for additional TBE ops in multi-op scenarios:
+    # - batch_size=32000: typical large batch size for production workloads
+    # - dim=512: common embedding dimension for large models
+    for i in range(1, num_tbe_ops):
+        emb_op[i] = {}
+        emb_op[i]["batch_size_per_feature_per_rank"] = (
+            generate_batch_size_per_feature_per_rank([32000], num_ranks)
+        )
+        emb_op[i]["Bs"] = [sum(B) for B in emb_op[i]["batch_size_per_feature_per_rank"]]
+        emb_op[i]["dim"] = [512]
+        emb_op[i]["output_size"] = sum(
+            [b * d for b, d in zip(emb_op[i]["Bs"], emb_op[i]["dim"])]
+        )
+    total_output = 0
+    ranks = [[] for _ in range(num_ranks)]
+    for e in emb_op.values():
+        b_per_rank_per_feature = list(zip(*e["batch_size_per_feature_per_rank"]))
+        assert len(b_per_rank_per_feature) == num_ranks
+        dims = e["dim"]
+        for r, b_r in enumerate(b_per_rank_per_feature):
+            for f, b in enumerate(b_r):
+                output_size_per_batch = b * dims[f]
+                ranks[r].append(output_size_per_batch)
+                total_output += output_size_per_batch
+    ranks[0].insert(0, 0)
+    offsets_ranks: List[List[int]] = [[] for _ in range(num_ranks)]
+    total_output_offsets = []
+    start = 0
+    for r in range(num_ranks):
+        offsets_ranks[r] = [
+            start + sum(ranks[r][: i + 1]) for i in range(len(ranks[r]))
+        ]
+        start = offsets_ranks[r][-1]
+        total_output_offsets.extend(offsets_ranks[r])
+    check_total_output_size = sum([e["output_size"] for e in emb_op.values()])
+    assert (
+        total_output == check_total_output_size
+    ), f"{total_output} != {check_total_output_size}{[e['output_size'] for e in emb_op.values()]}"
+    assert (
+        total_output == total_output_offsets[-1]
+    ), f"{total_output} != {total_output_offsets[-1]}"
+    out = torch.empty(total_output, dtype=output_dtype, device=device)
+    offsets = []
+    offsets.append(offsets_ranks[0][:num_features])
+    for r in range(1, num_ranks):
+        start = [offsets_ranks[r - 1][-1]]
+        the_rest = offsets_ranks[r][: num_features - 1] if num_features > 1 else []
+        start.extend(the_rest)
+        offsets.append(start)
+    out_offsets = torch.tensor(
+        offsets,
+        dtype=torch.int64,
+        device=device,
+    )
+    batch_size_per_feature_per_rank = emb_op[0]["batch_size_per_feature_per_rank"]
+    return (batch_size_per_feature_per_rank, out, out_offsets)

fbgemm_gpu/tbe/stats/bench_params_reporter.py CHANGED Viewed

@@ -11,7 +11,7 @@ import io
 import json
 import logging
 import os
-from typing import Optional
+from typing import List, Optional, Tuple
 import fbgemm_gpu  # noqa F401
 import torch  # usort:skip
@@ -137,6 +137,20 @@ class TBEBenchmarkParamsReporter:
             path_prefix=path_prefix,
         )
+    def extract_Ls(
+        self,
+        bag_sizes: List[int],
+        Bs: List[int],
+    ) -> List[float]:
+        Ls = []
+        start = 0
+        for b in Bs:
+            end = start + b
+            avg_L = sum(bag_sizes[start:end]) / b if b > 0 else 0
+            start = end
+            Ls.append(avg_L)
+        return Ls
     def extract_params(
         self,
         feature_rows: torch.Tensor,
@@ -144,7 +158,11 @@ class TBEBenchmarkParamsReporter:
         indices: torch.Tensor,
         offsets: torch.Tensor,
         per_sample_weights: Optional[torch.Tensor] = None,
-        batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        Es: Optional[List[int]] = None,
+        Ds: Optional[List[int]] = None,
+        embedding_specs: Optional[List[Tuple[int, int]]] = None,
+        feature_table_map: Optional[List[int]] = None,
     ) -> TBEDataConfig:
         """
         Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
@@ -201,8 +219,14 @@ class TBEBenchmarkParamsReporter:
         )
         # Compute batch parameters
+        B = int((offsets.numel() - 1) // T)
+        Bs = (
+            [sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
+            if batch_size_per_feature_per_rank
+            else [B] * T
+        )
         batch_params = BatchParams(
-            B=int((offsets.numel() - 1) // T),
+            B=B,
             sigma_B=(
                 int(
                     torch.ceil(
@@ -226,10 +250,14 @@ class TBEBenchmarkParamsReporter:
                 if batch_size_per_feature_per_rank
                 else None
             ),
+            Bs=Bs,
         )
         # Compute pooling parameters
         bag_sizes = offsets[1:] - offsets[:-1]
+        if batch_size_per_feature_per_rank is None:
+            _B = int(bag_sizes.numel() // T)
+            assert _B == Bs[0], f"Expected constant batch size {Bs[0]} but got {_B}"
         mixed_bag_sizes = len(set(bag_sizes)) > 1
         pooling_params = PoolingParams(
             L=(
@@ -243,6 +271,7 @@ class TBEBenchmarkParamsReporter:
                 else None
             ),
             length_distribution=("normal" if mixed_bag_sizes else None),
+            Ls=self.extract_Ls(bag_sizes.tolist(), Bs),
         )
         return TBEDataConfig(
@@ -255,6 +284,10 @@ class TBEBenchmarkParamsReporter:
             indices_params=indices_params,
             pooling_params=pooling_params,
             use_cpu=(not torch.cuda.is_available()),
+            Es=Es,
+            Ds=Ds,
+            embedding_specs=embedding_specs,
+            feature_table_map=feature_table_map,
         )
     def report_stats(
@@ -266,7 +299,9 @@ class TBEBenchmarkParamsReporter:
         offsets: torch.Tensor,
         op_id: str = "",
         per_sample_weights: Optional[torch.Tensor] = None,
-        batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        embedding_specs: Optional[List[Tuple[int, int]]] = None,
+        feature_table_map: Optional[List[int]] = None,
     ) -> None:
         """
         Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
@@ -280,6 +315,8 @@ class TBEBenchmarkParamsReporter:
             op_id (str, optional): The operation identifier. Defaults to an empty string.
             per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
             batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
+            embedding_specs (Optional[List[Tuple[int, int]]]): Embedding specs. Defaults to None.
+            feature_table_map (Optional[List[int]], optional): Feature table map. Defaults to None.
         """
         if (
             (iteration - self.report_iter_start) % self.report_interval == 0
@@ -299,41 +336,14 @@ class TBEBenchmarkParamsReporter:
                 offsets=offsets,
                 per_sample_weights=per_sample_weights,
                 batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                Es=feature_rows.tolist(),
+                Ds=feature_dims.tolist(),
+                embedding_specs=embedding_specs,
+                feature_table_map=feature_table_map,
             )
-            # Ad-hoc fix for adding Es and Ds to JSON output
-            # TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
-            adhoc_config = config.dict()
-            adhoc_config["Es"] = feature_rows.tolist()
-            adhoc_config["Ds"] = feature_dims.tolist()
-            if batch_size_per_feature_per_rank:
-                adhoc_config["Bs"] = [
-                    sum(batch_size_per_feature_per_rank[f])
-                    for f in range(len(adhoc_config["Es"]))
-                ]
-            bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
-            adhoc_config["Ls"] = []
-            pointer_counter = 0
-            if batch_size_per_feature_per_rank:
-                for batchs_size in adhoc_config["Bs"]:
-                    current_L = 0
-                    for _i in range(batchs_size):
-                        current_L += bag_sizes[pointer_counter]
-                        pointer_counter += 1
-                    adhoc_config["Ls"].append(current_L / batchs_size)
-            else:
-                batch_size = int(len(bag_sizes) // len(adhoc_config["Es"]))
-                for _j in range(len(adhoc_config["Es"])):
-                    current_L = 0
-                    for _i in range(batch_size):
-                        current_L += bag_sizes[pointer_counter]
-                        pointer_counter += 1
-                    adhoc_config["Ls"].append(current_L / batch_size)
             # Write the TBE config to FileStore
             self.filestore.write(
                 f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
-                io.BytesIO(json.dumps(adhoc_config, indent=2).encode()),
+                io.BytesIO(json.dumps(config.dict(), indent=2).encode()),
             )

{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_nightly-cpu
-Version: 2026.1.22
+Version: 2026.1.29
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org

{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/RECORD RENAMED Viewed

@@ -2,15 +2,15 @@ fbgemm_gpu/__init__.py,sha256=JrSxUgY_diRl9kXapbyq3iteiB32D02CPan3stEFiAM,6434
 fbgemm_gpu/asmjit.so,sha256=DNnFdMXB8IW_9ulBAn7I5EMmAJ5y-yT0-YRDBszOqXA,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=pZqqUfvPIsaIo1CWX-_W087WQg-YEZuS0GNGoKFO_9c,2915
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=eX7OHRRmF7Bg-bXIriBsr5_Z9XfpNBcKnlR0tWCSyzk,5675384
-fbgemm_gpu/fbgemm_gpu_config.so,sha256=xfspmI7ouPXTm7GZnZsf_wT3oMl7lHVIUGdC2cHa9JM,51176
+fbgemm_gpu/fbgemm.so,sha256=lkRdKMi7O083a2gbKCuNsytkb9gHIgcJvNwPRm6RP8U,5659000
+fbgemm_gpu/fbgemm_gpu_config.so,sha256=ksQXaC-3-l3Agi0wxeE9o_2wAG3W4-4X84eplwPr5E8,47080
 fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so,sha256=BGXKjkImBO3W6weT6SasOOCUB2w4x1v_DMAadGbqciQ,88032
-fbgemm_gpu/fbgemm_gpu_py.so,sha256=SI2UZFDfMtfDSTb_S0DBT77gQOKnbafTcM4lUegJP78,4913424
+fbgemm_gpu/fbgemm_gpu_py.so,sha256=xD0-0Xnb8kDGbh8rPWnqEYNhKZeTm3BJ1hp24XWaxlo,4938000
 fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so,sha256=o7Vne9VVYq2VKz_jOLw97erI0RbiAMPNLk2oODNQvEA,79840
-fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=-uedOtg-UanyeCGB9jObYLAzDc5_JgS7X9Yddt4Feng,260080
-fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=BJl4Gn2tf-UNaaTzP95deLW0XsATiOWj_mh4CIDVTb0,395232
+fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=b_Dxjg-U5ilu1tyi-JmfQ8c13ABEpH1-QrVDYUYoXPI,284656
+fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=opZDhNxRVqEBDwDDZmLp2TLmLYKxmLE4I11DhR4rd_A,387040
 fbgemm_gpu/fbgemm_gpu_tbe_index_select.so,sha256=uGRcOuVR_O4bxaEfuq5ve-xXMMV2HKhHfTm_LnKJ_aw,333800
-fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=AWYlslvwCrOxOPHemDYsuTe6E8UYEk4je2ioAoCl2mU,605000
+fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=HeMj3Q_KhYBKi2MQ-jBLGgE82TGb_75twp0hVeMqWgA,605000
 fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so,sha256=QmRzU2tSEZTxarB3xNSDFKWdRKfKHrmowMiws_f3zmc,13760
 fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so,sha256=2hARnWgefCIXwYNwFwNsGa2kei8fEY_gqia8NxqFclA,1278144
 fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so,sha256=N569f2bc0y_CyBJagS8RlsmrHVWpwQMY14kFdpF6T_Y,13760
@@ -34,7 +34,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=_uUplpcyQOQuxqv8-HV94VUM5lG8e3aGWltXhOgICQc,19294
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=6pgmu4v5qdrrSzxRyzgg4GQsrLAha0br__GbT854UxI,189015
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=uCPngWxxC5OQhJv7o6aGs8xf3WlRSrdRHbpCBlPbIuE,191511
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=2TTKsF5yaROTaI69YdCIt8hr_v2TDEo8EraZ0QXNBxc,717
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
@@ -49,7 +49,7 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.default.json.py,sha256=nW6QTSVVz4u9JpGEqWxc-4oaolSZb-Bj_4nxBJA2hn8,79
+fbgemm_gpu/docs/target.default.json.py,sha256=_BcuMA1hCJ_Jtf08E7O8t-R8A5HiRXHH3Z9rpgCq66U,79
 fbgemm_gpu/quantize/__init__.py,sha256=yPUCmLhNdahHFireHPQMmmiRp3g6W2dkIl5MB51M6SU,942
 fbgemm_gpu/quantize/quantize_ops.py,sha256=C3SN79GcL7fczzoFkxUojm6cGkvvI4iWttkGN4LFQcM,2239
 fbgemm_gpu/sll/__init__.py,sha256=nLFeTiRed6A5STRi_EgHCyNoik0zhXUk2db5kTmMUNU,4221
@@ -87,7 +87,7 @@ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py,sha256=N32H1lUb
 fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py,sha256=xWSmk56JgoYfO8eiiK4BP9Brbhixs4tUAMeWp5TPZ30,956
 fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py,sha256=bjrbKAypa-FnOIVKH-IUnWP1Jhlu0lk1SopZ0KLFVdo,6623
 fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
-fbgemm_gpu/tbe/bench/__init__.py,sha256=wgPBmxtQMmbA39cbQ2nO4PGAk5lXjFGjG8-9FoAXg34,1589
+fbgemm_gpu/tbe/bench/__init__.py,sha256=TyUVsIH4p-RtFaXAKppYoaWbf9UTjCTUpnIV7RD_O5E,1653
 fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
 fbgemm_gpu/tbe/bench/bench_runs.py,sha256=K4HRUcsX4BWqtrYwinZSXjnjNDFkvpoEdQmv-6rz7Tk,23518
 fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=ofcGsiTUj3_Ml7JSsqg_LcMw3CV-6ypmlRWAUmT_cjc,6941
@@ -95,11 +95,11 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=B3QOZhtycMDwHMG3dFKnlFuWOqYRCF3RCozEQfrqv
 fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=zdL_ve1Ga6ziU5LjfnzJXOBOIqtCjLlhSrlGfa42H9w,4978
 fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
 fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
-fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=KTRIZWJIeNgoc6H68iPS45uVEQ3S96IvjLHvBS4nTyQ,4835
-fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=2lseM16Ky12FBY0E5ChfWOM6KJbi4iXWAOwkHbE2YeM,10933
-fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=i6DY6DdKSeQ5gE_MUqHY3a04MGy18Vd_lg8ht-qEEyY,10018
-fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
-fbgemm_gpu/tbe/bench/utils.py,sha256=IOPMnzTC7TUWVGyDzNPvP6r8BekWgO-TzxOQW21brj4,1728
+fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=d724L4Is3Bo2D5reglgsBs7H6ezLFDrQUbTP5tsnPEQ,8509
+fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=c-IwLbx04Qbqxzfcn9N4U9Eo9QnmgbBN6HxJYAJwvMw,11311
+fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=fSdtEAnKu6r56mHMtMJIHo-S6m3vC4cPRyXJKKUevzc,11996
+fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=I9dozlJAW_XzuopyJapJ4gmDkLU0YSUz2znugiLZRMg,6203
+fbgemm_gpu/tbe/bench/utils.py,sha256=C0GTTomJO3r9LVfbpzlkudxoA_3QyeMdM-7zM-YOAHA,6716
 fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
 fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
 fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
@@ -110,7 +110,7 @@ fbgemm_gpu/tbe/ssd/training.py,sha256=2CFA4KmA9IfcpX14K4MlzBuSRPD9h5NM1M7TqepH6v
 fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
 fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
 fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
-fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=PMcaf27LpnflA7LMsuj1OpqTN3mPqddDoSeUnzKxLCs,13040
+fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=_lA4peKXI0GCWsZHJ7IUKlUHU98CA-gVoOc-uhRfcoY,13233
 fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
 fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
 fbgemm_gpu/tbe/utils/offsets.py,sha256=DDWwGaQsVZbhaEZ_fRxxeY8ndLc7IORPZrx61eOqwJc,1904
@@ -129,7 +129,7 @@ fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg
 fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
-fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/METADATA,sha256=XYY34QMx8MqgZZ-kaizQphklulqwMnwgI2EvtFwPwGo,2654
-fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
-fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/RECORD,,
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/METADATA,sha256=sqUYIVBwodRVxysq3jEToUNFX12vtC4tZenZnKnynjo,2654
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD,,

{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL RENAMED Viewed

File without changes

{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/top_level.txt RENAMED Viewed

File without changes