fbgemm-gpu-nightly-cpu 2026.1.22__cp312-cp312-manylinux_2_28_x86_64.whl → 2026.1.29__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
 
2
2
  {
3
- "version": "2026.1.22",
3
+ "version": "2026.1.29",
4
4
  "target": "default",
5
5
  "variant": "cpu"
6
6
  }
fbgemm_gpu/fbgemm.so CHANGED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -970,7 +970,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
970
970
  table_has_feature = [False] * T_
971
971
  for t in self.feature_table_map:
972
972
  table_has_feature[t] = True
973
- assert all(table_has_feature), "Each table must have at least one feature!"
973
+ assert all(table_has_feature), (
974
+ "Each table must have at least one feature!"
975
+ + f"{[(i, x) for i, x in enumerate(table_has_feature)]}"
976
+ )
974
977
 
975
978
  feature_dims = [dims[t] for t in self.feature_table_map]
976
979
  D_offsets = [0] + list(accumulate(feature_dims))
@@ -1786,6 +1789,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1786
1789
  cache: int,
1787
1790
  total_static_sparse: int,
1788
1791
  ephemeral: int,
1792
+ cache_weights: int = 0,
1793
+ cache_aux: int = 0,
1789
1794
  ) -> None:
1790
1795
  """Report HBM memory breakdown to stats reporter."""
1791
1796
  stats_reporter.report_data_amount(
@@ -1809,6 +1814,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1809
1814
  embedding_id=self.logging_table_name,
1810
1815
  tbe_id=self.uuid,
1811
1816
  )
1817
+ stats_reporter.report_data_amount(
1818
+ iteration_step=self.step,
1819
+ event_name="tbe.hbm.cache_weights",
1820
+ data_bytes=cache_weights,
1821
+ embedding_id=self.logging_table_name,
1822
+ tbe_id=self.uuid,
1823
+ )
1824
+ stats_reporter.report_data_amount(
1825
+ iteration_step=self.step,
1826
+ event_name="tbe.hbm.cache_aux",
1827
+ data_bytes=cache_aux,
1828
+ embedding_id=self.logging_table_name,
1829
+ tbe_id=self.uuid,
1830
+ )
1812
1831
  stats_reporter.report_data_amount(
1813
1832
  iteration_step=self.step,
1814
1833
  event_name="tbe.hbm.total_static_sparse",
@@ -1832,6 +1851,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1832
1851
  cache: int,
1833
1852
  total_static_sparse: int,
1834
1853
  ephemeral: int,
1854
+ cache_weights: int = 0,
1855
+ cache_aux: int = 0,
1835
1856
  ) -> None:
1836
1857
  """Report UVM memory breakdown to stats reporter."""
1837
1858
  stats_reporter.report_data_amount(
@@ -1855,6 +1876,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1855
1876
  embedding_id=self.logging_table_name,
1856
1877
  tbe_id=self.uuid,
1857
1878
  )
1879
+ stats_reporter.report_data_amount(
1880
+ iteration_step=self.step,
1881
+ event_name="tbe.uvm.cache_weights",
1882
+ data_bytes=cache_weights,
1883
+ embedding_id=self.logging_table_name,
1884
+ tbe_id=self.uuid,
1885
+ )
1886
+ stats_reporter.report_data_amount(
1887
+ iteration_step=self.step,
1888
+ event_name="tbe.uvm.cache_aux",
1889
+ data_bytes=cache_aux,
1890
+ embedding_id=self.logging_table_name,
1891
+ tbe_id=self.uuid,
1892
+ )
1858
1893
  stats_reporter.report_data_amount(
1859
1894
  iteration_step=self.step,
1860
1895
  event_name="tbe.uvm.total_static_sparse",
@@ -1931,34 +1966,50 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1931
1966
  "momentum2_host",
1932
1967
  "momentum2_uvm",
1933
1968
  ]
1934
- cache_tensors = [
1969
+ # Cache weights tensor (the actual cached embeddings in HBM)
1970
+ cache_weight_tensors = [
1935
1971
  "lxu_cache_weights",
1936
- "lxu_cache_state",
1937
- "lxu_state",
1938
- "cache_hash_size_cumsum",
1939
- "cache_index_table_map",
1940
- "cache_miss_counter",
1941
- "lxu_cache_locking_counter",
1972
+ ]
1973
+ # Cache auxiliary state tensors (metadata for cache management, excluding weights)
1974
+ # Sizes scale with hash_size or cache_slots (hash_size × clf)
1975
+ # Excludes constant-size tensors: cache_hash_size_cumsum, cache_miss_counter, etc.
1976
+ cache_aux_tensors = [
1977
+ "cache_index_table_map", # int32, 4B × hash_size
1978
+ "lxu_cache_state", # int64, 8B × cache_slots
1979
+ "lxu_state", # int64, 8B × cache_slots (LRU) or hash_size (LFU)
1980
+ "lxu_cache_locking_counter", # int32, 4B × cache_slots (only if prefetch_pipeline)
1942
1981
  ]
1943
1982
 
1944
1983
  # Calculate total memory for each component
1945
1984
  weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
1946
1985
  optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
1947
- cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
1986
+ cache_weights_total = sum(
1987
+ self._get_tensor_memory(t) for t in cache_weight_tensors
1988
+ )
1989
+ cache_aux_total = sum(self._get_tensor_memory(t) for t in cache_aux_tensors)
1948
1990
 
1949
1991
  # Categorize memory by location (HBM vs UVM)
1950
1992
  if self.use_cpu:
1951
1993
  weights_hbm, weights_uvm = 0, weights_total
1952
1994
  opt_hbm, opt_uvm = 0, optimizer_total
1953
- cache_hbm, cache_uvm = 0, cache_total
1995
+ cache_weights_hbm, cache_weights_uvm = 0, cache_weights_total
1996
+ cache_aux_hbm, cache_aux_uvm = 0, cache_aux_total
1954
1997
  else:
1955
1998
  weights_hbm, weights_uvm = self._categorize_memory_by_location(
1956
1999
  weight_tensors
1957
2000
  )
1958
2001
  opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
1959
- cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
2002
+ cache_weights_hbm, cache_weights_uvm = self._categorize_memory_by_location(
2003
+ cache_weight_tensors
2004
+ )
2005
+ cache_aux_hbm, cache_aux_uvm = self._categorize_memory_by_location(
2006
+ cache_aux_tensors
2007
+ )
1960
2008
 
1961
2009
  # Calculate ephemeral memory split between HBM and UVM
2010
+ # Total cache = cache weights + cache auxiliary state
2011
+ cache_hbm = cache_weights_hbm + cache_aux_hbm
2012
+ cache_uvm = cache_weights_uvm + cache_aux_uvm
1962
2013
  static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
1963
2014
  static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
1964
2015
  ephemeral_hbm = total_hbm_usage - static_sparse_hbm
@@ -1972,6 +2023,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1972
2023
  cache_hbm,
1973
2024
  static_sparse_hbm,
1974
2025
  ephemeral_hbm,
2026
+ cache_weights_hbm,
2027
+ cache_aux_hbm,
1975
2028
  )
1976
2029
  self._report_uvm_breakdown(
1977
2030
  stats_reporter,
@@ -1980,6 +2033,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1980
2033
  cache_uvm,
1981
2034
  static_sparse_uvm,
1982
2035
  ephemeral_uvm,
2036
+ cache_weights_uvm,
2037
+ cache_aux_uvm,
1983
2038
  )
1984
2039
 
1985
2040
  @torch.jit.ignore
@@ -2232,6 +2287,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
2232
2287
  op_id=self.uuid,
2233
2288
  per_sample_weights=per_sample_weights,
2234
2289
  batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
2290
+ embedding_specs=[(s[0], s[1]) for s in self.embedding_specs],
2291
+ feature_table_map=self.feature_table_map,
2235
2292
  )
2236
2293
 
2237
2294
  if not is_torchdynamo_compiling():
@@ -41,7 +41,11 @@ from .tbe_data_config_param_models import ( # noqa F401
41
41
  IndicesParams,
42
42
  PoolingParams,
43
43
  )
44
- from .utils import fill_random_scale_bias # noqa F401
44
+ from .utils import ( # noqa F401
45
+ check_oom,
46
+ fill_random_scale_bias,
47
+ generate_merged_output_and_offsets,
48
+ )
45
49
 
46
50
  try:
47
51
  torch.ops.load_library(
@@ -10,7 +10,7 @@
10
10
  import dataclasses
11
11
  import json
12
12
  import logging
13
- from typing import Any, Optional
13
+ from typing import Any, List, Optional, Tuple
14
14
 
15
15
  import torch
16
16
 
@@ -32,30 +32,77 @@ except Exception:
32
32
 
33
33
  @dataclasses.dataclass(frozen=True)
34
34
  class TBEDataConfig:
35
- # Number of tables
36
35
  T: int
37
- # Number of rows in the embedding table
38
36
  E: int
39
- # Target embedding dimension for a table (number of columns)
40
37
  D: int
41
- # Generate mixed dimensions if true
42
38
  mixed_dim: bool
43
- # Whether the lookup rows are weighted or not
44
39
  weighted: bool
45
- # Batch parameters
46
40
  batch_params: BatchParams
47
- # Indices parameters
48
41
  indices_params: IndicesParams
49
- # Pooling parameters
50
42
  pooling_params: PoolingParams
51
- # Force generated tensors to be on CPU
52
43
  use_cpu: bool = False
53
- # Number of embeddings in each embedding features (number of rows)
54
44
  Es: Optional[list[int]] = None
55
- # Target embedding dimension for each features (number of columns)
56
45
  Ds: Optional[list[int]] = None
57
- # Maximum number of indices
58
- max_indices: Optional[int] = None # Maximum number of indices
46
+ max_indices: Optional[int] = None
47
+ embedding_specs: Optional[List[Tuple[int, int]]] = None
48
+ feature_table_map: Optional[List[int]] = None
49
+ """
50
+ Configuration for TBE (Table Batched Embedding) benchmark data collection and generation.
51
+
52
+ This dataclass holds parameters required to generate synthetic data for
53
+ TBE benchmarking, including table specifications, batch parameters, indices
54
+ distribution parameters, and pooling parameters.
55
+
56
+ Args:
57
+ T (int): Number of embedding tables (features). Must be positive.
58
+ E (int): Number of rows in the embedding table (feature). If T > 1, this
59
+ represents the averaged number of rows across all features.
60
+ D (int): Target embedding dimension for a table (feature), i.e., number of
61
+ columns. If T > 1, this represents the averaged dimension across
62
+ all features.
63
+ mixed_dim (bool): If True, generate embeddings with mixed dimensions
64
+ across tables (features). This is automatically set to True if D is provided
65
+ as a list with non-uniform values.
66
+ weighted (bool): If True, the lookup rows are weighted (per-sample
67
+ weights). The weights will be generated as FP32 tensors.
68
+ batch_params (BatchParams): Parameters controlling batch generation.
69
+ Contains:
70
+ (1) `B` = target batch size (number of batch lookups per features)
71
+ (2) `sigma_B` = optional standard deviation for variable batch size
72
+ (3) `vbe_distribution` = distribution type ("normal" or "uniform")
73
+ (4) `vbe_num_ranks` = number of ranks for variable batch size
74
+ (5) `Bs` = per-feature batch sizes
75
+ indices_params (IndicesParams): Parameters controlling index generation
76
+ following a Zipf distribution. Contains:
77
+ (1) `heavy_hitters` = probability density map for hot indices
78
+ (2) `zipf_q` = q parameter in Zipf distribution (x+q)^{-s}
79
+ (3) `zipf_s` = s parameter (alpha) in Zipf distribution
80
+ (4) `index_dtype` = optional dtype for indices tensor
81
+ (5) `offset_dtype` = optional dtype for offsets tensor
82
+ pooling_params (PoolingParams): Parameters controlling pooling behavior.
83
+ Contains:
84
+ (1) `L` = target bag size (pooling factor, indices per lookup)
85
+ (2) `sigma_L` = optional standard deviation for variable bag size
86
+ (3) `length_distribution` = distribution type ("normal" or "uniform")
87
+ (4) `Ls` = per-feature bag sizes
88
+ use_cpu (bool = False): If True, force generated tensors to be placed
89
+ on CPU instead of the default compute device.
90
+ Es (Optional[List[int]] = None): Number of embeddings (rows) for each
91
+ individual embedding feature. If provided, must have length equal
92
+ to T. All elements must be positive.
93
+ Ds (Optional[List[int]] = None): Target embedding dimension (columns)
94
+ for each individual feature. If provided, must have length equal
95
+ to T. All elements must be positive.
96
+ max_indices (Optional[int] = None): Maximum number of indices for
97
+ bounds checking. If Es is provided as a list and max_indices is
98
+ None, it is automatically computed as sum(Es) - 1.
99
+ embedding_specs (Optional[List[Tuple[int, int]]] = None): A list of
100
+ embedding specs consisting of a list of tuples of (num_rows, embedding_dim).
101
+ See https://fburl.com/tbe_embedding_specs for details.
102
+ feature_table_map (Optional[List[int]] = None): An optional list that
103
+ specifies feature-table mapping. feature_table_map[i] indicates the
104
+ physical embedding table that feature i maps to.
105
+ """
59
106
 
60
107
  def __post_init__(self) -> None:
61
108
  if isinstance(self.D, list):
@@ -117,17 +164,25 @@ class TBEDataConfig:
117
164
  assert self.D > 0, "D must be positive"
118
165
  if self.Ds is not None:
119
166
  assert all(d > 0 for d in self.Ds), "All elements in Ds must be positive"
120
- if isinstance(self.E, list) and isinstance(self.D, list):
167
+ if isinstance(self.Es, list) and isinstance(self.Ds, list):
121
168
  assert (
122
- len(self.E) == len(self.D) == self.T
169
+ len(self.Es) == len(self.Ds) == self.T
123
170
  ), "Lengths of Es, Lengths of Ds, and T must be equal"
124
171
  if self.max_indices is not None:
125
172
  assert self.max_indices == (
126
173
  sum(self.Es) - 1
127
174
  ), "max_indices must be equal to sum(Es) - 1"
128
175
  self.batch_params.validate()
176
+ if self.batch_params.Bs is not None:
177
+ assert (
178
+ len(self.batch_params.Bs) == self.T
179
+ ), f"Length of Bs must be equal to T. Expected: {self.T}, but got: {len(self.batch_params.Bs)}"
129
180
  self.indices_params.validate()
130
181
  self.pooling_params.validate()
182
+ if self.pooling_params.Ls is not None:
183
+ assert (
184
+ len(self.pooling_params.Ls) == self.T
185
+ ), f"Length of Ls must be equal to T. Expected: {self.T}, but got: {len(self.pooling_params.Ls)}"
131
186
  return self
132
187
 
133
188
  def variable_B(self) -> bool:
@@ -7,6 +7,7 @@
7
7
 
8
8
  # pyre-strict
9
9
 
10
+ import logging
10
11
  from typing import Optional
11
12
 
12
13
  import numpy as np
@@ -35,6 +36,9 @@ except Exception:
35
36
  def _generate_batch_sizes(
36
37
  tbe_data_config: TBEDataConfig,
37
38
  ) -> tuple[list[int], Optional[list[list[int]]]]:
39
+ logging.info(
40
+ f"DEBUG_TBE: [_generate_batch_sizes] VBE tbe_data_config.variable_B()={tbe_data_config.variable_B()}"
41
+ )
38
42
  if tbe_data_config.variable_B():
39
43
  assert (
40
44
  tbe_data_config.batch_params.vbe_num_ranks is not None
@@ -48,7 +52,6 @@ def _generate_batch_sizes(
48
52
  # pyre-ignore [6]
49
53
  tbe_data_config.batch_params.vbe_distribution,
50
54
  )
51
-
52
55
  else:
53
56
  return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
54
57
 
@@ -89,13 +92,15 @@ def _generate_indices(
89
92
  start_offset = L_offsets_list[it * total_B]
90
93
  end_offset = L_offsets_list[(it + 1) * total_B]
91
94
 
95
+ logging.info(f"DEBUG_TBE: _generate_indices E = {tbe_data_config.E=}")
96
+
92
97
  indices_list.append(
93
98
  torch.ops.fbgemm.tbe_generate_indices_from_distribution(
94
99
  tbe_data_config.indices_params.heavy_hitters,
95
100
  tbe_data_config.indices_params.zipf_q,
96
101
  tbe_data_config.indices_params.zipf_s,
97
102
  # max_index = dimensions of the embedding table
98
- tbe_data_config.E,
103
+ int(tbe_data_config.E),
99
104
  # num_indices = number of indices to generate
100
105
  end_offset - start_offset,
101
106
  )
@@ -184,6 +189,10 @@ def generate_requests(
184
189
  else:
185
190
  Bs, _ = _generate_batch_sizes(tbe_data_config)
186
191
 
192
+ logging.info(
193
+ f"DEBUG_TBE: VBE [generate_requests] batch_size_per_feature_per_rank={batch_size_per_feature_per_rank} Bs={Bs}"
194
+ )
195
+
187
196
  assert Bs is not None, "Batch sizes (Bs) must be set"
188
197
 
189
198
  # Generate pooling info
@@ -8,6 +8,8 @@
8
8
  # pyre-strict
9
9
 
10
10
  import dataclasses
11
+ import logging
12
+ import re
11
13
  from enum import Enum
12
14
 
13
15
  import click
@@ -45,12 +47,16 @@ class TBEDataConfigHelperText(Enum):
45
47
  TBE_INDICES_HITTERS = "Heavy hitters for indices (comma-delimited list of floats)"
46
48
  TBE_INDICES_ZIPF = "Zipf distribution parameters for indices generation (q, s)"
47
49
  TBE_INDICES_DTYPE = "The dtype of the table indices (choices: '32', '64')"
48
- TBE_OFFSETS_DTYPE = "The dtype of the table indices (choices: '32', '64')"
50
+ TBE_OFFSETS_DTYPE = "The dtype of the table offsets (choices: '32', '64')"
49
51
 
50
52
  # Pooling Parameters
51
53
  TBE_POOLING_SIZE = "Bag size / pooling factor (L)"
52
- TBE_POOLING_VL_SIGMA = "Standard deviation of B for VBE"
53
- TBE_POOLING_VL_DIST = "VBE distribution (choices: 'uniform', 'normal')"
54
+ TBE_POOLING_VL_SIGMA = "Standard deviation of L for variable bag size"
55
+ TBE_POOLING_VL_DIST = (
56
+ "Variable bag size distribution (choices: 'uniform', 'normal')"
57
+ )
58
+ TBE_EMBEDDING_SPECS = "Embedding Specs which is List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]"
59
+ TBE_FEATURE_TABLE_MAP = "Mapping of feature-table"
54
60
 
55
61
 
56
62
  class TBEDataConfigLoader:
@@ -193,6 +199,18 @@ class TBEDataConfigLoader:
193
199
  required=False,
194
200
  help=TBEDataConfigHelperText.TBE_POOLING_VL_DIST.value,
195
201
  ),
202
+ click.option(
203
+ "--tbe-embedding-specs",
204
+ type=str,
205
+ required=False,
206
+ help=TBEDataConfigHelperText.TBE_EMBEDDING_SPECS.value,
207
+ ),
208
+ click.option(
209
+ "--tbe-feature-table-map",
210
+ type=str,
211
+ required=False,
212
+ help=TBEDataConfigHelperText.TBE_FEATURE_TABLE_MAP.value,
213
+ ),
196
214
  ]
197
215
 
198
216
  for option in reversed(options):
@@ -213,15 +231,21 @@ class TBEDataConfigLoader:
213
231
  params = context.params
214
232
 
215
233
  # Read table parameters
216
- T = params["tbe_num_tables"]
217
- E = params["tbe_num_embeddings"]
234
+ T = params["tbe_num_tables"] # number of features
235
+ E = params["tbe_num_embeddings"] # feature_rows
218
236
  if params["tbe_num_embeddings_list"] is not None:
219
237
  Es = [int(x) for x in params["tbe_num_embeddings_list"].split(",")]
238
+ T = len(Es)
239
+ E = sum(Es) // T # average E
220
240
  else:
221
241
  Es = None
222
242
  D = params["tbe_embedding_dim"]
223
243
  if params["tbe_embedding_dim_list"] is not None:
224
244
  Ds = [int(x) for x in params["tbe_embedding_dim_list"].split(",")]
245
+ assert (
246
+ len(Ds) == T
247
+ ), f"Expected tbe_embedding_dim_list to have {T} elements, but got {len(Ds)}"
248
+ D = sum(Ds) // T # average D
225
249
  else:
226
250
  Ds = None
227
251
 
@@ -239,10 +263,31 @@ class TBEDataConfigLoader:
239
263
  vbe_num_ranks = params["tbe_batch_vbe_ranks"]
240
264
  if params["tbe_batch_sizes_list"] is not None:
241
265
  Bs = [int(x) for x in params["tbe_batch_sizes_list"].split(",")]
266
+ B = sum(Bs) // T # average B
242
267
  else:
268
+ B = params["tbe_batch_size"]
243
269
  Bs = None
244
270
  batch_params = BatchParams(B, sigma_B, vbe_distribution, vbe_num_ranks, Bs)
245
271
 
272
+ # Parse embedding_specs: "(E,D),(E,D),..." or "(E,D,loc,dev),(E,D,loc,dev),..."
273
+ # Only the first two values (E, D) are extracted.
274
+ embedding_specs = None
275
+ feature_table_map = None
276
+ if params["tbe_embedding_specs"] is not None:
277
+ try:
278
+ tuples = re.findall(r"\(([^)]+)\)", params["tbe_embedding_specs"])
279
+ if tuples:
280
+ embedding_specs = [
281
+ (int(t.split(",")[0].strip()), int(t.split(",")[1].strip()))
282
+ for t in tuples
283
+ ]
284
+ except (ValueError, IndexError):
285
+ logging.warning("Failed to parse embedding_specs. Setting to None.")
286
+ if params["tbe_feature_table_map"] is not None:
287
+ feature_table_map = [
288
+ int(x) for x in params["tbe_feature_table_map"].split(",")
289
+ ]
290
+
246
291
  # Read indices parameters
247
292
  heavy_hitters = (
248
293
  torch.tensor([float(x) for x in params["tbe_indices_hitters"].split(",")])
@@ -279,6 +324,8 @@ class TBEDataConfigLoader:
279
324
  Es,
280
325
  Ds,
281
326
  max_indices,
327
+ embedding_specs,
328
+ feature_table_map,
282
329
  ).validate()
283
330
 
284
331
  @classmethod
@@ -98,7 +98,7 @@ class BatchParams:
98
98
  vbe_distribution: Optional[str] = "normal"
99
99
  # Number of ranks for variable batch size generation
100
100
  vbe_num_ranks: Optional[int] = None
101
- # List of target batch sizes, i.e. number of batch lookups per table
101
+ # List of target batch sizes, i.e. number of batch lookups per feature
102
102
  Bs: Optional[list[int]] = None
103
103
 
104
104
  @classmethod
@@ -142,6 +142,8 @@ class PoolingParams:
142
142
  sigma_L: Optional[int] = None
143
143
  # [Optional] Distribution of embedding sequence lengths (normal, uniform)
144
144
  length_distribution: Optional[str] = "normal"
145
+ # [Optional] List of target bag sizes, i.e. pooling factors per feature
146
+ Ls: Optional[list[float]] = None
145
147
 
146
148
  @classmethod
147
149
  # pyre-ignore [3]
@@ -6,7 +6,7 @@
6
6
 
7
7
  # pyre-strict
8
8
 
9
- import logging
9
+ from typing import List, Tuple
10
10
 
11
11
  import numpy as np
12
12
  import torch
@@ -14,8 +14,6 @@ import torch
14
14
  # fmt:skip
15
15
  from fbgemm_gpu.split_embedding_configs import SparseType
16
16
 
17
- logging.basicConfig(level=logging.DEBUG)
18
-
19
17
 
20
18
  def fill_random_scale_bias(
21
19
  emb: torch.nn.Module,
@@ -47,3 +45,128 @@ def fill_random_scale_bias(
47
45
  device=scale_shift.device,
48
46
  )
49
47
  )
48
+
49
+
50
def check_oom(
    data_size: int,
) -> Tuple[bool, str]:
    """
    Check whether an allocation of `data_size` bytes exceeds the free memory
    reported by `torch.cuda.mem_get_info()`.

    Args:
        data_size (int): Number of bytes the caller expects to allocate
    Returns:
        Tuple[bool, str]: `(True, warning)` if `data_size` is larger than the
        reported free memory, where `warning` describes the requested, free and
        total sizes in GB; `(False, "")` otherwise, including when no CUDA
        device is available to query.
    """
    # `torch.cuda.mem_get_info()` needs an initialized CUDA device. Without
    # one (e.g. CPU-only builds) there is no device memory to compare against,
    # so report "no OOM" instead of letting the query fail.
    if not torch.cuda.is_available():
        return (False, "")

    free_memory, total_memory = torch.cuda.mem_get_info()
    if data_size <= free_memory:
        return (False, "")

    warning = f"Expect to allocate {round(data_size / (1024 ** 3), 2)} GB, but available memory is {round(free_memory / (1024 ** 3), 2)} GB from {round(total_memory / (1024 ** 3), 2)} GB."
    return (True, warning)
58
+
59
+
60
def generate_batch_size_per_feature_per_rank(
    Bs: List[int], num_ranks: int
) -> List[List[int]]:
    """
    Generate batch size per feature per rank for VBE, assuming the batch size
    is evenly distributed across ranks.

    Every rank except the last receives `B // num_ranks` samples; the last rank
    receives whatever is left so that each per-feature list sums to `B`.

    Args:
        Bs (List[int]): batch size per feature (expected to be non-negative)
        num_ranks (int): number of ranks
    Returns:
        List[List[int]]: batch size per feature per rank. If `num_ranks` is not
        positive, each feature maps to an empty list.
    """
    b_per_feature_per_rank = []
    for B in Bs:
        if num_ranks <= 0:
            # Nothing to distribute over; keep one (empty) entry per feature
            b_per_feature_per_rank.append([])
            continue
        # Floor division stays in exact integer arithmetic. Going through
        # float true division (`int(B / num_ranks)`) silently loses precision
        # once the quotient no longer fits in a double's 53-bit mantissa.
        b_per_rank = B // num_ranks
        b_per_feature = [b_per_rank] * (num_ranks - 1)
        # The last rank absorbs the remainder
        b_per_feature.append(B - b_per_rank * (num_ranks - 1))
        b_per_feature_per_rank.append(b_per_feature)
    return b_per_feature_per_rank
82
+
83
+
84
def generate_merged_output_and_offsets(
    Ds: List[int],
    Bs: List[int],
    output_dtype: torch.dtype,
    device: torch.device,
    num_ranks: int = 2,
    num_tbe_ops: int = 2,
) -> Tuple[List[List[int]], torch.Tensor, torch.Tensor]:
    """
    Generate merged vbe_output and vbe_output_offsets tensors for VBE.
    The vbe_output is a tensor that will contain forward output from all VBE TBE ops.
    The vbe_output_offsets is a tensor that will contain start offsets for the output to be written to.

    The output buffer is laid out rank by rank; within a rank, the features of
    the first TBE op come first, followed by one entry per additional TBE op.

    Args:
        Ds (List[int]): embedding dimension per feature
        Bs (List[int]): batch size per feature
        output_dtype (torch.dtype): dtype of the allocated vbe_output tensor
        device (torch.device): device on which both tensors are allocated
        num_ranks (int): number of ranks
        num_tbe_ops (int): number of TBE ops
    Returns:
        Tuple[List[List[int]], torch.Tensor, torch.Tensor]:
        batch_size_per_feature_per_rank of the first TBE op, the merged
        (uninitialized) vbe_output tensor, and the int64 vbe_output_offsets
        tensor whose row `r` holds the start offset of each feature of the
        first TBE op for rank `r`.
    """
    # Function-scope import so this block does not depend on file-level imports
    from itertools import accumulate

    # Representative values for additional TBE ops in multi-op scenarios:
    # - batch_size=32000: typical large batch size for production workloads
    # - dim=512: common embedding dimension for large models
    extra_op_batch_size = 32000
    extra_op_dim = 512

    # The first embedding op is the embedding op created in the benchmark
    emb_op = {}
    emb_op[0] = {
        "dim": Ds,
        "Bs": Bs,
        "output_size": sum(b * d for b, d in zip(Bs, Ds)),
        "batch_size_per_feature_per_rank": generate_batch_size_per_feature_per_rank(
            Bs, num_ranks
        ),
    }
    num_features = len(Bs)

    # Create the other embedding ops, only to size the output and offsets tensors
    for i in range(1, num_tbe_ops):
        extra_b_per_feature_per_rank = generate_batch_size_per_feature_per_rank(
            [extra_op_batch_size], num_ranks
        )
        extra_Bs = [sum(B) for B in extra_b_per_feature_per_rank]
        extra_dims = [extra_op_dim]
        emb_op[i] = {
            "batch_size_per_feature_per_rank": extra_b_per_feature_per_rank,
            "Bs": extra_Bs,
            "dim": extra_dims,
            "output_size": sum(b * d for b, d in zip(extra_Bs, extra_dims)),
        }

    # Collect, per rank, the output size of every (op, feature) chunk
    total_output = 0
    ranks: List[List[int]] = [[] for _ in range(num_ranks)]
    for e in emb_op.values():
        # Transpose [feature][rank] -> [rank][feature]
        b_per_rank_per_feature = list(zip(*e["batch_size_per_feature_per_rank"]))
        assert len(b_per_rank_per_feature) == num_ranks
        dims = e["dim"]
        for r, b_r in enumerate(b_per_rank_per_feature):
            for f, b in enumerate(b_r):
                output_size_per_batch = b * dims[f]
                ranks[r].append(output_size_per_batch)
                total_output += output_size_per_batch

    # Leading zero so that rank 0's running sums begin with the start offset 0
    ranks[0].insert(0, 0)

    # Running sums across ranks. `accumulate` yields each prefix sum in a
    # single pass instead of re-summing the prefix for every element.
    offsets_ranks: List[List[int]] = [[] for _ in range(num_ranks)]
    total_output_offsets = []
    start = 0
    for r in range(num_ranks):
        offsets_ranks[r] = [start + partial for partial in accumulate(ranks[r])]
        start = offsets_ranks[r][-1]
        total_output_offsets.extend(offsets_ranks[r])

    # Sanity checks: both accountings of the total output size must agree
    check_total_output_size = sum([e["output_size"] for e in emb_op.values()])
    assert (
        total_output == check_total_output_size
    ), f"{total_output} != {check_total_output_size}{[e['output_size'] for e in emb_op.values()]}"
    assert (
        total_output == total_output_offsets[-1]
    ), f"{total_output} != {total_output_offsets[-1]}"

    out = torch.empty(total_output, dtype=output_dtype, device=device)

    # Rank 0 already starts with offset 0. For later ranks the running sums are
    # chunk end offsets, so the first start offset is the previous rank's last
    # end offset, followed by the first `num_features - 1` end offsets.
    offsets = [offsets_ranks[0][:num_features]]
    for r in range(1, num_ranks):
        rank_offsets = [offsets_ranks[r - 1][-1]]
        if num_features > 1:
            rank_offsets.extend(offsets_ranks[r][: num_features - 1])
        offsets.append(rank_offsets)

    out_offsets = torch.tensor(
        offsets,
        dtype=torch.int64,
        device=device,
    )
    batch_size_per_feature_per_rank = emb_op[0]["batch_size_per_feature_per_rank"]
    return (batch_size_per_feature_per_rank, out, out_offsets)
@@ -11,7 +11,7 @@ import io
11
11
  import json
12
12
  import logging
13
13
  import os
14
- from typing import Optional
14
+ from typing import List, Optional, Tuple
15
15
 
16
16
  import fbgemm_gpu # noqa F401
17
17
  import torch # usort:skip
@@ -137,6 +137,20 @@ class TBEBenchmarkParamsReporter:
137
137
  path_prefix=path_prefix,
138
138
  )
139
139
 
140
def extract_Ls(
    self,
    bag_sizes: List[int],
    Bs: List[int],
) -> List[float]:
    """
    Compute the average bag size for each consecutive segment of `bag_sizes`.

    `bag_sizes` is consumed front to back: the first `Bs[0]` entries form the
    first segment, the next `Bs[1]` entries the second one, and so on.

    Args:
        bag_sizes (List[int]): flat list of bag sizes
        Bs (List[int]): number of `bag_sizes` entries belonging to each segment
            (the caller passes per-feature batch sizes)
    Returns:
        List[float]: average bag size per segment; `0.0` for segments whose
        batch size is not positive.
    """
    Ls: List[float] = []
    start = 0
    for b in Bs:
        end = start + b
        # Use a float for empty segments too, so the result is homogeneous and
        # matches the declared return type
        Ls.append(sum(bag_sizes[start:end]) / b if b > 0 else 0.0)
        start = end
    return Ls
153
+
140
154
  def extract_params(
141
155
  self,
142
156
  feature_rows: torch.Tensor,
@@ -144,7 +158,11 @@ class TBEBenchmarkParamsReporter:
144
158
  indices: torch.Tensor,
145
159
  offsets: torch.Tensor,
146
160
  per_sample_weights: Optional[torch.Tensor] = None,
147
- batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
161
+ batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
162
+ Es: Optional[List[int]] = None,
163
+ Ds: Optional[List[int]] = None,
164
+ embedding_specs: Optional[List[Tuple[int, int]]] = None,
165
+ feature_table_map: Optional[List[int]] = None,
148
166
  ) -> TBEDataConfig:
149
167
  """
150
168
  Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
@@ -201,8 +219,14 @@ class TBEBenchmarkParamsReporter:
201
219
  )
202
220
 
203
221
  # Compute batch parameters
222
+ B = int((offsets.numel() - 1) // T)
223
+ Bs = (
224
+ [sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
225
+ if batch_size_per_feature_per_rank
226
+ else [B] * T
227
+ )
204
228
  batch_params = BatchParams(
205
- B=int((offsets.numel() - 1) // T),
229
+ B=B,
206
230
  sigma_B=(
207
231
  int(
208
232
  torch.ceil(
@@ -226,10 +250,14 @@ class TBEBenchmarkParamsReporter:
226
250
  if batch_size_per_feature_per_rank
227
251
  else None
228
252
  ),
253
+ Bs=Bs,
229
254
  )
230
255
 
231
256
  # Compute pooling parameters
232
257
  bag_sizes = offsets[1:] - offsets[:-1]
258
+ if batch_size_per_feature_per_rank is None:
259
+ _B = int(bag_sizes.numel() // T)
260
+ assert _B == Bs[0], f"Expected constant batch size {Bs[0]} but got {_B}"
233
261
  mixed_bag_sizes = len(set(bag_sizes)) > 1
234
262
  pooling_params = PoolingParams(
235
263
  L=(
@@ -243,6 +271,7 @@ class TBEBenchmarkParamsReporter:
243
271
  else None
244
272
  ),
245
273
  length_distribution=("normal" if mixed_bag_sizes else None),
274
+ Ls=self.extract_Ls(bag_sizes.tolist(), Bs),
246
275
  )
247
276
 
248
277
  return TBEDataConfig(
@@ -255,6 +284,10 @@ class TBEBenchmarkParamsReporter:
255
284
  indices_params=indices_params,
256
285
  pooling_params=pooling_params,
257
286
  use_cpu=(not torch.cuda.is_available()),
287
+ Es=Es,
288
+ Ds=Ds,
289
+ embedding_specs=embedding_specs,
290
+ feature_table_map=feature_table_map,
258
291
  )
259
292
 
260
293
  def report_stats(
@@ -266,7 +299,9 @@ class TBEBenchmarkParamsReporter:
266
299
  offsets: torch.Tensor,
267
300
  op_id: str = "",
268
301
  per_sample_weights: Optional[torch.Tensor] = None,
269
- batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
302
+ batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
303
+ embedding_specs: Optional[List[Tuple[int, int]]] = None,
304
+ feature_table_map: Optional[List[int]] = None,
270
305
  ) -> None:
271
306
  """
272
307
  Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
@@ -280,6 +315,8 @@ class TBEBenchmarkParamsReporter:
280
315
  op_id (str, optional): The operation identifier. Defaults to an empty string.
281
316
  per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
282
317
  batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
318
+ embedding_specs (Optional[List[Tuple[int, int]]]): Embedding specs. Defaults to None.
319
+ feature_table_map (Optional[List[int]], optional): Feature table map. Defaults to None.
283
320
  """
284
321
  if (
285
322
  (iteration - self.report_iter_start) % self.report_interval == 0
@@ -299,41 +336,14 @@ class TBEBenchmarkParamsReporter:
299
336
  offsets=offsets,
300
337
  per_sample_weights=per_sample_weights,
301
338
  batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
339
+ Es=feature_rows.tolist(),
340
+ Ds=feature_dims.tolist(),
341
+ embedding_specs=embedding_specs,
342
+ feature_table_map=feature_table_map,
302
343
  )
303
344
 
304
- # Ad-hoc fix for adding Es and Ds to JSON output
305
- # TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
306
- adhoc_config = config.dict()
307
- adhoc_config["Es"] = feature_rows.tolist()
308
- adhoc_config["Ds"] = feature_dims.tolist()
309
- if batch_size_per_feature_per_rank:
310
- adhoc_config["Bs"] = [
311
- sum(batch_size_per_feature_per_rank[f])
312
- for f in range(len(adhoc_config["Es"]))
313
- ]
314
-
315
- bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
316
- adhoc_config["Ls"] = []
317
- pointer_counter = 0
318
- if batch_size_per_feature_per_rank:
319
- for batchs_size in adhoc_config["Bs"]:
320
- current_L = 0
321
- for _i in range(batchs_size):
322
- current_L += bag_sizes[pointer_counter]
323
- pointer_counter += 1
324
- adhoc_config["Ls"].append(current_L / batchs_size)
325
- else:
326
- batch_size = int(len(bag_sizes) // len(adhoc_config["Es"]))
327
-
328
- for _j in range(len(adhoc_config["Es"])):
329
- current_L = 0
330
- for _i in range(batch_size):
331
- current_L += bag_sizes[pointer_counter]
332
- pointer_counter += 1
333
- adhoc_config["Ls"].append(current_L / batch_size)
334
-
335
345
  # Write the TBE config to FileStore
336
346
  self.filestore.write(
337
347
  f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
338
- io.BytesIO(json.dumps(adhoc_config, indent=2).encode()),
348
+ io.BytesIO(json.dumps(config.dict(), indent=2).encode()),
339
349
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fbgemm_gpu_nightly-cpu
3
- Version: 2026.1.22
3
+ Version: 2026.1.29
4
4
  Home-page: https://github.com/pytorch/fbgemm
5
5
  Author: FBGEMM Team
6
6
  Author-email: packages@pytorch.org
@@ -2,15 +2,15 @@ fbgemm_gpu/__init__.py,sha256=JrSxUgY_diRl9kXapbyq3iteiB32D02CPan3stEFiAM,6434
2
2
  fbgemm_gpu/asmjit.so,sha256=DNnFdMXB8IW_9ulBAn7I5EMmAJ5y-yT0-YRDBszOqXA,501728
3
3
  fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=pZqqUfvPIsaIo1CWX-_W087WQg-YEZuS0GNGoKFO_9c,2915
4
4
  fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
5
- fbgemm_gpu/fbgemm.so,sha256=eX7OHRRmF7Bg-bXIriBsr5_Z9XfpNBcKnlR0tWCSyzk,5675384
6
- fbgemm_gpu/fbgemm_gpu_config.so,sha256=xfspmI7ouPXTm7GZnZsf_wT3oMl7lHVIUGdC2cHa9JM,51176
5
+ fbgemm_gpu/fbgemm.so,sha256=lkRdKMi7O083a2gbKCuNsytkb9gHIgcJvNwPRm6RP8U,5659000
6
+ fbgemm_gpu/fbgemm_gpu_config.so,sha256=ksQXaC-3-l3Agi0wxeE9o_2wAG3W4-4X84eplwPr5E8,47080
7
7
  fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so,sha256=BGXKjkImBO3W6weT6SasOOCUB2w4x1v_DMAadGbqciQ,88032
8
- fbgemm_gpu/fbgemm_gpu_py.so,sha256=SI2UZFDfMtfDSTb_S0DBT77gQOKnbafTcM4lUegJP78,4913424
8
+ fbgemm_gpu/fbgemm_gpu_py.so,sha256=xD0-0Xnb8kDGbh8rPWnqEYNhKZeTm3BJ1hp24XWaxlo,4938000
9
9
  fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so,sha256=o7Vne9VVYq2VKz_jOLw97erI0RbiAMPNLk2oODNQvEA,79840
10
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=-uedOtg-UanyeCGB9jObYLAzDc5_JgS7X9Yddt4Feng,260080
11
- fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=BJl4Gn2tf-UNaaTzP95deLW0XsATiOWj_mh4CIDVTb0,395232
10
+ fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=b_Dxjg-U5ilu1tyi-JmfQ8c13ABEpH1-QrVDYUYoXPI,284656
11
+ fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=opZDhNxRVqEBDwDDZmLp2TLmLYKxmLE4I11DhR4rd_A,387040
12
12
  fbgemm_gpu/fbgemm_gpu_tbe_index_select.so,sha256=uGRcOuVR_O4bxaEfuq5ve-xXMMV2HKhHfTm_LnKJ_aw,333800
13
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=AWYlslvwCrOxOPHemDYsuTe6E8UYEk4je2ioAoCl2mU,605000
13
+ fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=HeMj3Q_KhYBKi2MQ-jBLGgE82TGb_75twp0hVeMqWgA,605000
14
14
  fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so,sha256=QmRzU2tSEZTxarB3xNSDFKWdRKfKHrmowMiws_f3zmc,13760
15
15
  fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so,sha256=2hARnWgefCIXwYNwFwNsGa2kei8fEY_gqia8NxqFclA,1278144
16
16
  fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so,sha256=N569f2bc0y_CyBJagS8RlsmrHVWpwQMY14kFdpF6T_Y,13760
@@ -34,7 +34,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
34
34
  fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
35
35
  fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=_uUplpcyQOQuxqv8-HV94VUM5lG8e3aGWltXhOgICQc,19294
36
36
  fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
37
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=6pgmu4v5qdrrSzxRyzgg4GQsrLAha0br__GbT854UxI,189015
37
+ fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=uCPngWxxC5OQhJv7o6aGs8xf3WlRSrdRHbpCBlPbIuE,191511
38
38
  fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
39
39
  fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=2TTKsF5yaROTaI69YdCIt8hr_v2TDEo8EraZ0QXNBxc,717
40
40
  fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
@@ -49,7 +49,7 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
49
49
  fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
50
50
  fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
51
51
  fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
52
- fbgemm_gpu/docs/target.default.json.py,sha256=nW6QTSVVz4u9JpGEqWxc-4oaolSZb-Bj_4nxBJA2hn8,79
52
+ fbgemm_gpu/docs/target.default.json.py,sha256=_BcuMA1hCJ_Jtf08E7O8t-R8A5HiRXHH3Z9rpgCq66U,79
53
53
  fbgemm_gpu/quantize/__init__.py,sha256=yPUCmLhNdahHFireHPQMmmiRp3g6W2dkIl5MB51M6SU,942
54
54
  fbgemm_gpu/quantize/quantize_ops.py,sha256=C3SN79GcL7fczzoFkxUojm6cGkvvI4iWttkGN4LFQcM,2239
55
55
  fbgemm_gpu/sll/__init__.py,sha256=nLFeTiRed6A5STRi_EgHCyNoik0zhXUk2db5kTmMUNU,4221
@@ -87,7 +87,7 @@ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py,sha256=N32H1lUb
87
87
  fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py,sha256=xWSmk56JgoYfO8eiiK4BP9Brbhixs4tUAMeWp5TPZ30,956
88
88
  fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py,sha256=bjrbKAypa-FnOIVKH-IUnWP1Jhlu0lk1SopZ0KLFVdo,6623
89
89
  fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
90
- fbgemm_gpu/tbe/bench/__init__.py,sha256=wgPBmxtQMmbA39cbQ2nO4PGAk5lXjFGjG8-9FoAXg34,1589
90
+ fbgemm_gpu/tbe/bench/__init__.py,sha256=TyUVsIH4p-RtFaXAKppYoaWbf9UTjCTUpnIV7RD_O5E,1653
91
91
  fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
92
92
  fbgemm_gpu/tbe/bench/bench_runs.py,sha256=K4HRUcsX4BWqtrYwinZSXjnjNDFkvpoEdQmv-6rz7Tk,23518
93
93
  fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=ofcGsiTUj3_Ml7JSsqg_LcMw3CV-6ypmlRWAUmT_cjc,6941
@@ -95,11 +95,11 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=B3QOZhtycMDwHMG3dFKnlFuWOqYRCF3RCozEQfrqv
95
95
  fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=zdL_ve1Ga6ziU5LjfnzJXOBOIqtCjLlhSrlGfa42H9w,4978
96
96
  fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
97
97
  fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
98
- fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=KTRIZWJIeNgoc6H68iPS45uVEQ3S96IvjLHvBS4nTyQ,4835
99
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=2lseM16Ky12FBY0E5ChfWOM6KJbi4iXWAOwkHbE2YeM,10933
100
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=i6DY6DdKSeQ5gE_MUqHY3a04MGy18Vd_lg8ht-qEEyY,10018
101
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
102
- fbgemm_gpu/tbe/bench/utils.py,sha256=IOPMnzTC7TUWVGyDzNPvP6r8BekWgO-TzxOQW21brj4,1728
98
+ fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=d724L4Is3Bo2D5reglgsBs7H6ezLFDrQUbTP5tsnPEQ,8509
99
+ fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=c-IwLbx04Qbqxzfcn9N4U9Eo9QnmgbBN6HxJYAJwvMw,11311
100
+ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=fSdtEAnKu6r56mHMtMJIHo-S6m3vC4cPRyXJKKUevzc,11996
101
+ fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=I9dozlJAW_XzuopyJapJ4gmDkLU0YSUz2znugiLZRMg,6203
102
+ fbgemm_gpu/tbe/bench/utils.py,sha256=C0GTTomJO3r9LVfbpzlkudxoA_3QyeMdM-7zM-YOAHA,6716
103
103
  fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
104
104
  fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
105
105
  fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
@@ -110,7 +110,7 @@ fbgemm_gpu/tbe/ssd/training.py,sha256=2CFA4KmA9IfcpX14K4MlzBuSRPD9h5NM1M7TqepH6v
110
110
  fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
111
111
  fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
112
112
  fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
113
- fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=PMcaf27LpnflA7LMsuj1OpqTN3mPqddDoSeUnzKxLCs,13040
113
+ fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=_lA4peKXI0GCWsZHJ7IUKlUHU98CA-gVoOc-uhRfcoY,13233
114
114
  fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
115
115
  fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
116
116
  fbgemm_gpu/tbe/utils/offsets.py,sha256=DDWwGaQsVZbhaEZ_fRxxeY8ndLc7IORPZrx61eOqwJc,1904
@@ -129,7 +129,7 @@ fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg
129
129
  fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
130
130
  list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
131
131
  list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
132
- fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/METADATA,sha256=XYY34QMx8MqgZZ-kaizQphklulqwMnwgI2EvtFwPwGo,2654
133
- fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
134
- fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
135
- fbgemm_gpu_nightly_cpu-2026.1.22.dist-info/RECORD,,
132
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/METADATA,sha256=sqUYIVBwodRVxysq3jEToUNFX12vtC4tZenZnKnynjo,2654
133
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
134
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
135
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD,,