fbgemm-gpu-hstu-nightly 2025.6.17__cp312-cp312-manylinux_2_28_x86_64.whl → 2025.6.18__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,6 +60,12 @@ class FeatureGateName(Enum):
60
60
  # Enable bounds_check_indices_v2
61
61
  BOUNDS_CHECK_INDICES_V2 = auto()
62
62
 
63
+ # Disable FP8 quantization vectorization
64
+ DISABLE_FP8_QUANT_VECTORIZATION = auto()
65
+
66
+ # Enable TBE input parameters extraction
67
+ TBE_REPORT_INPUT_PARAMS = auto()
68
+
63
69
  def is_enabled(self) -> bool:
64
70
  return FeatureGate.is_enabled(self)
65
71
 
@@ -6,6 +6,6 @@
6
6
  # This source code is licensed under the BSD-style license found in the
7
7
  # LICENSE file in the root directory of this source tree.
8
8
 
9
- __version__: str = "2025.6.17"
9
+ __version__: str = "2025.6.18"
10
10
  __target__: str = "hstu"
11
11
  __variant__: str = "cuda"
@@ -11,7 +11,7 @@
11
11
 
12
12
  import enum
13
13
  from dataclasses import dataclass
14
- from typing import List, NamedTuple, Tuple
14
+ from typing import List, NamedTuple, Optional, Tuple
15
15
 
16
16
  import torch
17
17
  from torch import Tensor
@@ -60,6 +60,43 @@ class EmbeddingLocation(enum.IntEnum):
60
60
  raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
61
61
 
62
62
 
63
+ class EvictionPolicy(NamedTuple):
64
+ eviction_trigger_mode: int = (
65
+ 0 # disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
66
+ )
67
+ eviction_strategy: int = (
68
+ 0 # 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
69
+ )
70
+ eviction_step_intervals: Optional[int] = (
71
+ None # trigger_step_interval if trigger mode is iteration
72
+ )
73
+ eviction_mem_threshold_gb: Optional[int] = (
74
+ None # eviction trigger condition if trigger mode is mem_util
75
+ )
76
+ counter_thresholds: Optional[List[int]] = (
77
+ None # count_thresholds for each table if eviction strategy is feature score
78
+ )
79
+ ttls_in_mins: Optional[List[int]] = (
80
+ None # ttls_in_mins for each table if eviction strategy is timestamp
81
+ )
82
+ counter_decay_rates: Optional[List[float]] = (
83
+ None # count_decay_rates for each table if eviction strategy is feature score
84
+ )
85
+ l2_weight_thresholds: Optional[List[float]] = (
86
+ None # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
87
+ )
88
+ interval_for_insufficient_eviction_s: int = (
89
+ # wait at least this many seconds before triggering the next round of eviction if the last finished eviction was insufficient
90
+ # insufficient means we didn't evict enough rows, so we want to wait longer to
91
+ # avoid another insufficient eviction
92
+ 600
93
+ )
94
+ interval_for_sufficient_eviction_s: int = (
95
+ # wait at least # seconds before trigger next round of eviction, if last finished eviction is sufficient
96
+ 60
97
+ )
98
+
99
+
63
100
  class KVZCHParams(NamedTuple):
64
101
  # global bucket id start and global bucket id end offsets for each logical table,
65
102
  # where start offset is inclusive and end offset is exclusive
@@ -69,6 +106,7 @@ class KVZCHParams(NamedTuple):
69
106
  bucket_sizes: List[int] = []
70
107
  # enable optimizer offloading or not
71
108
  enable_optimizer_offloading: bool = False
109
+ eviction_policy: Optional[EvictionPolicy] = None
72
110
 
73
111
  def validate(self) -> None:
74
112
  assert len(self.bucket_offsets) == len(self.bucket_sizes), (
@@ -51,6 +51,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
51
51
  generate_vbe_metadata,
52
52
  is_torchdynamo_compiling,
53
53
  )
54
+ from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter
54
55
  from fbgemm_gpu.tbe_input_multiplexer import (
55
56
  TBEInfo,
56
57
  TBEInputInfo,
@@ -1441,6 +1442,11 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1441
1442
  self._debug_print_input_stats_factory()
1442
1443
  )
1443
1444
 
1445
+ # Get a reporter function pointer
1446
+ self._report_input_params: Callable[..., None] = (
1447
+ self.__report_input_params_factory()
1448
+ )
1449
+
1444
1450
  if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
1445
1451
  # Register writeback hook for Exact_SGD optimizer
1446
1452
  self.log(
@@ -1953,6 +1959,18 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
1953
1959
  # Print input stats if enable (for debugging purpose only)
1954
1960
  self._debug_print_input_stats(indices, offsets, per_sample_weights)
1955
1961
 
1962
+ # Extract and Write input stats if enable
1963
+ self._report_input_params(
1964
+ feature_rows=self.rows_per_table,
1965
+ feature_dims=self.feature_dims,
1966
+ iteration=self.iter.item() if hasattr(self, "iter") else 0,
1967
+ indices=indices,
1968
+ offsets=offsets,
1969
+ op_id=self.uuid,
1970
+ per_sample_weights=per_sample_weights,
1971
+ batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
1972
+ )
1973
+
1956
1974
  if not is_torchdynamo_compiling():
1957
1975
  # Mutations of nn.Module attr forces dynamo restart of Analysis which increases compilation time
1958
1976
 
@@ -3804,6 +3822,39 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
3804
3822
  return _debug_print_input_stats_factory_impl
3805
3823
  return _debug_print_input_stats_factory_null
3806
3824
 
3825
+ @torch.jit.ignore
3826
+ def __report_input_params_factory(
3827
+ self,
3828
+ ) -> Callable[..., None]:
3829
+ """
3830
+ This function returns a function pointer based on the `TBE_REPORT_INPUT_PARAMS` feature gate.
3831
+
3832
+ If the `TBE_REPORT_INPUT_PARAMS` feature gate is enabled, it returns the `report_stats` method of a reporter built by `TBEBenchmarkParamsReporter.create()`, a function pointer that:
3833
+ - Reports input parameters (TBEDataConfig).
3834
+ - Writes the output as a JSON file.
3835
+
3836
+ If the feature gate is not enabled, it returns a dummy function pointer that performs no action.
3837
+ """
3838
+
3839
+ @torch.jit.ignore
3840
+ def __report_input_params_factory_null(
3841
+ feature_rows: Tensor,
3842
+ feature_dims: Tensor,
3843
+ iteration: int,
3844
+ indices: Tensor,
3845
+ offsets: Tensor,
3846
+ op_id: Optional[str] = None,
3847
+ per_sample_weights: Optional[Tensor] = None,
3848
+ batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
3849
+ ) -> None:
3850
+ pass
3851
+
3852
+ if self._feature_is_enabled(FeatureGateName.TBE_REPORT_INPUT_PARAMS):
3853
+
3854
+ reporter = TBEBenchmarkParamsReporter.create()
3855
+ return reporter.report_stats
3856
+ return __report_input_params_factory_null
3857
+
3807
3858
 
3808
3859
  class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
3809
3860
  """
@@ -9,19 +9,11 @@
9
9
 
10
10
  import dataclasses
11
11
  import json
12
- from typing import Any, Dict, List, Optional, Tuple
12
+ from typing import Any, Dict, Optional
13
13
 
14
- import numpy as np
15
14
  import torch
16
15
 
17
- from fbgemm_gpu.tbe.utils.common import get_device, round_up
18
- from fbgemm_gpu.tbe.utils.requests import (
19
- generate_batch_sizes_from_stats,
20
- generate_pooling_factors_from_stats,
21
- get_table_batched_offsets_from_dense,
22
- maybe_to_dtype,
23
- TBERequest,
24
- )
16
+ from fbgemm_gpu.tbe.utils.common import get_device
25
17
 
26
18
  from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
27
19
 
@@ -104,175 +96,3 @@ class TBEDataConfig:
104
96
  def _new_weights(self, size: int) -> Optional[torch.Tensor]:
105
97
  # Per-sample weights will always be FP32
106
98
  return None if not self.weighted else torch.randn(size, device=get_device())
107
-
108
- def _generate_batch_sizes(self) -> Tuple[List[int], Optional[List[List[int]]]]:
109
- if self.variable_B():
110
- assert (
111
- self.batch_params.vbe_num_ranks is not None
112
- ), "vbe_num_ranks must be set for varaible batch size generation"
113
- return generate_batch_sizes_from_stats(
114
- self.batch_params.B,
115
- self.T,
116
- # pyre-ignore [6]
117
- self.batch_params.sigma_B,
118
- self.batch_params.vbe_num_ranks,
119
- # pyre-ignore [6]
120
- self.batch_params.vbe_distribution,
121
- )
122
-
123
- else:
124
- return ([self.batch_params.B] * self.T, None)
125
-
126
- def _generate_pooling_info(self, iters: int, Bs: List[int]) -> torch.Tensor:
127
- if self.variable_L():
128
- # Generate L from stats
129
- _, L_offsets = generate_pooling_factors_from_stats(
130
- iters,
131
- Bs,
132
- self.pooling_params.L,
133
- # pyre-ignore [6]
134
- self.pooling_params.sigma_L,
135
- # pyre-ignore [6]
136
- self.pooling_params.length_distribution,
137
- )
138
-
139
- else:
140
- Ls = [self.pooling_params.L] * (sum(Bs) * iters)
141
- L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
142
-
143
- return L_offsets
144
-
145
- def _generate_indices(
146
- self,
147
- iters: int,
148
- Bs: List[int],
149
- L_offsets: torch.Tensor,
150
- ) -> torch.Tensor:
151
- total_B = sum(Bs)
152
- L_offsets_list = L_offsets.tolist()
153
- indices_list = []
154
- for it in range(iters):
155
- # L_offsets is defined over the entire set of batches for a single iteration
156
- start_offset = L_offsets_list[it * total_B]
157
- end_offset = L_offsets_list[(it + 1) * total_B]
158
-
159
- indices_list.append(
160
- torch.ops.fbgemm.tbe_generate_indices_from_distribution(
161
- self.indices_params.heavy_hitters,
162
- self.indices_params.zipf_q,
163
- self.indices_params.zipf_s,
164
- # max_index = dimensions of the embedding table
165
- self.E,
166
- # num_indices = number of indices to generate
167
- end_offset - start_offset,
168
- )
169
- )
170
-
171
- return torch.cat(indices_list)
172
-
173
- def _build_requests_jagged(
174
- self,
175
- iters: int,
176
- Bs: List[int],
177
- Bs_feature_rank: Optional[List[List[int]]],
178
- L_offsets: torch.Tensor,
179
- all_indices: torch.Tensor,
180
- ) -> List[TBERequest]:
181
- total_B = sum(Bs)
182
- all_indices = all_indices.flatten()
183
- requests = []
184
- for it in range(iters):
185
- start_offset = L_offsets[it * total_B]
186
- it_L_offsets = torch.concat(
187
- [
188
- torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
189
- L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
190
- ]
191
- )
192
- requests.append(
193
- TBERequest(
194
- maybe_to_dtype(
195
- all_indices[start_offset : L_offsets[(it + 1) * total_B]],
196
- self.indices_params.index_dtype,
197
- ),
198
- maybe_to_dtype(
199
- it_L_offsets.to(get_device()), self.indices_params.offset_dtype
200
- ),
201
- self._new_weights(int(it_L_offsets[-1].item())),
202
- Bs_feature_rank if self.variable_B() else None,
203
- )
204
- )
205
- return requests
206
-
207
- def _build_requests_dense(
208
- self, iters: int, all_indices: torch.Tensor
209
- ) -> List[TBERequest]:
210
- # NOTE: We're using existing code from requests.py to build the
211
- # requests, and since the existing code requires 2D view of all_indices,
212
- # the existing all_indices must be reshaped
213
- all_indices = all_indices.reshape(iters, -1)
214
-
215
- requests = []
216
- for it in range(iters):
217
- indices, offsets = get_table_batched_offsets_from_dense(
218
- all_indices[it].view(
219
- self.T, self.batch_params.B, self.pooling_params.L
220
- ),
221
- use_cpu=self.use_cpu,
222
- )
223
- requests.append(
224
- TBERequest(
225
- maybe_to_dtype(indices, self.indices_params.index_dtype),
226
- maybe_to_dtype(offsets, self.indices_params.offset_dtype),
227
- self._new_weights(
228
- self.T * self.batch_params.B * self.pooling_params.L
229
- ),
230
- )
231
- )
232
- return requests
233
-
234
- def generate_requests(
235
- self,
236
- iters: int = 1,
237
- ) -> List[TBERequest]:
238
- # Generate batch sizes
239
- Bs, Bs_feature_rank = self._generate_batch_sizes()
240
-
241
- # Generate pooling info
242
- L_offsets = self._generate_pooling_info(iters, Bs)
243
-
244
- # Generate indices
245
- all_indices = self._generate_indices(iters, Bs, L_offsets)
246
-
247
- # Build TBE requests
248
- if self.variable_B() or self.variable_L():
249
- return self._build_requests_jagged(
250
- iters, Bs, Bs_feature_rank, L_offsets, all_indices
251
- )
252
- else:
253
- return self._build_requests_dense(iters, all_indices)
254
-
255
- def generate_embedding_dims(self) -> Tuple[int, List[int]]:
256
- if self.mixed_dim:
257
- Ds = [
258
- round_up(
259
- np.random.randint(low=int(0.5 * self.D), high=int(1.5 * self.D)), 4
260
- )
261
- for _ in range(self.T)
262
- ]
263
- return (int(np.average(Ds)), Ds)
264
- else:
265
- return (self.D, [self.D] * self.T)
266
-
267
- def generate_feature_requires_grad(self, size: int) -> torch.Tensor:
268
- assert size <= self.T, "size of feature_requires_grad must be less than T"
269
- weighted_requires_grad_tables = np.random.choice(
270
- self.T, replace=False, size=(size,)
271
- ).tolist()
272
- return (
273
- torch.tensor(
274
- [1 if t in weighted_requires_grad_tables else 0 for t in range(self.T)]
275
- )
276
- .to(get_device())
277
- .int()
278
- )
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ from typing import List, Optional, Tuple
11
+
12
+ import torch
13
+
14
+ from fbgemm_gpu.tbe.bench import TBEDataConfig
15
+ from fbgemm_gpu.tbe.utils.common import get_device, round_up
16
+
17
+ from fbgemm_gpu.tbe.utils.requests import (
18
+ generate_batch_sizes_from_stats,
19
+ generate_pooling_factors_from_stats,
20
+ get_table_batched_offsets_from_dense,
21
+ maybe_to_dtype,
22
+ TBERequest,
23
+ )
24
+
25
+
26
+ def _generate_batch_sizes(
27
+ tbe_data_config: TBEDataConfig,
28
+ ) -> Tuple[List[int], Optional[List[List[int]]]]:
29
+ if tbe_data_config.variable_B():
30
+ assert (
31
+ tbe_data_config.batch_params.vbe_num_ranks is not None
32
+ ), "vbe_num_ranks must be set for varaible batch size generation"
33
+ return generate_batch_sizes_from_stats(
34
+ tbe_data_config.batch_params.B,
35
+ tbe_data_config.T,
36
+ # pyre-ignore [6]
37
+ tbe_data_config.batch_params.sigma_B,
38
+ tbe_data_config.batch_params.vbe_num_ranks,
39
+ # pyre-ignore [6]
40
+ tbe_data_config.batch_params.vbe_distribution,
41
+ )
42
+
43
+ else:
44
+ return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
45
+
46
+
47
+ def _generate_pooling_info(
48
+ tbe_data_config: TBEDataConfig, iters: int, Bs: List[int]
49
+ ) -> torch.Tensor:
50
+ if tbe_data_config.variable_L():
51
+ # Generate L from stats
52
+ _, L_offsets = generate_pooling_factors_from_stats(
53
+ iters,
54
+ Bs,
55
+ tbe_data_config.pooling_params.L,
56
+ # pyre-ignore [6]
57
+ tbe_data_config.pooling_params.sigma_L,
58
+ # pyre-ignore [6]
59
+ tbe_data_config.pooling_params.length_distribution,
60
+ )
61
+ else:
62
+ Ls = [tbe_data_config.pooling_params.L] * (sum(Bs) * iters)
63
+ L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
64
+
65
+ return L_offsets
66
+
67
+
68
+ def _generate_indices(
69
+ tbe_data_config: TBEDataConfig,
70
+ iters: int,
71
+ Bs: List[int],
72
+ L_offsets: torch.Tensor,
73
+ ) -> torch.Tensor:
74
+ total_B = sum(Bs)
75
+ L_offsets_list = L_offsets.tolist()
76
+ indices_list = []
77
+ for it in range(iters):
78
+ # L_offsets is defined over the entire set of batches for a single iteration
79
+ start_offset = L_offsets_list[it * total_B]
80
+ end_offset = L_offsets_list[(it + 1) * total_B]
81
+
82
+ indices_list.append(
83
+ torch.ops.fbgemm.tbe_generate_indices_from_distribution(
84
+ tbe_data_config.indices_params.heavy_hitters,
85
+ tbe_data_config.indices_params.zipf_q,
86
+ tbe_data_config.indices_params.zipf_s,
87
+ # max_index = dimensions of the embedding table
88
+ tbe_data_config.E,
89
+ # num_indices = number of indices to generate
90
+ end_offset - start_offset,
91
+ )
92
+ )
93
+
94
+ return torch.cat(indices_list)
95
+
96
+
97
+ def _build_requests_jagged(
98
+ tbe_data_config: TBEDataConfig,
99
+ iters: int,
100
+ Bs: List[int],
101
+ Bs_feature_rank: Optional[List[List[int]]],
102
+ L_offsets: torch.Tensor,
103
+ all_indices: torch.Tensor,
104
+ ) -> List[TBERequest]:
105
+ total_B = sum(Bs)
106
+ all_indices = all_indices.flatten()
107
+ requests = []
108
+ for it in range(iters):
109
+ start_offset = L_offsets[it * total_B]
110
+ it_L_offsets = torch.concat(
111
+ [
112
+ torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
113
+ L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
114
+ ]
115
+ )
116
+ requests.append(
117
+ TBERequest(
118
+ maybe_to_dtype(
119
+ all_indices[start_offset : L_offsets[(it + 1) * total_B]],
120
+ tbe_data_config.indices_params.index_dtype,
121
+ ),
122
+ maybe_to_dtype(
123
+ it_L_offsets.to(get_device()),
124
+ tbe_data_config.indices_params.offset_dtype,
125
+ ),
126
+ tbe_data_config._new_weights(int(it_L_offsets[-1].item())),
127
+ Bs_feature_rank if tbe_data_config.variable_B() else None,
128
+ )
129
+ )
130
+ return requests
131
+
132
+
133
+ def _build_requests_dense(
134
+ tbe_data_config: TBEDataConfig, iters: int, all_indices: torch.Tensor
135
+ ) -> List[TBERequest]:
136
+ # NOTE: We're using existing code from requests.py to build the
137
+ # requests, and since the existing code requires 2D view of all_indices,
138
+ # the existing all_indices must be reshaped
139
+ all_indices = all_indices.reshape(iters, -1)
140
+
141
+ requests = []
142
+ for it in range(iters):
143
+ indices, offsets = get_table_batched_offsets_from_dense(
144
+ all_indices[it].view(
145
+ tbe_data_config.T,
146
+ tbe_data_config.batch_params.B,
147
+ tbe_data_config.pooling_params.L,
148
+ ),
149
+ use_cpu=tbe_data_config.use_cpu,
150
+ )
151
+ requests.append(
152
+ TBERequest(
153
+ maybe_to_dtype(indices, tbe_data_config.indices_params.index_dtype),
154
+ maybe_to_dtype(offsets, tbe_data_config.indices_params.offset_dtype),
155
+ tbe_data_config._new_weights(
156
+ tbe_data_config.T
157
+ * tbe_data_config.batch_params.B
158
+ * tbe_data_config.pooling_params.L
159
+ ),
160
+ )
161
+ )
162
+ return requests
163
+
164
+
165
+ def generate_requests(
166
+ tbe_data_config: TBEDataConfig,
167
+ iters: int = 1,
168
+ ) -> List[TBERequest]:
169
+ # Generate batch sizes
170
+ Bs, Bs_feature_rank = _generate_batch_sizes(tbe_data_config)
171
+
172
+ # Generate pooling info
173
+ L_offsets = _generate_pooling_info(tbe_data_config, iters, Bs)
174
+
175
+ # Generate indices
176
+ all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
177
+
178
+ # Build TBE requests
179
+ if tbe_data_config.variable_B() or tbe_data_config.variable_L():
180
+ return _build_requests_jagged(
181
+ tbe_data_config, iters, Bs, Bs_feature_rank, L_offsets, all_indices
182
+ )
183
+ else:
184
+ return _build_requests_dense(tbe_data_config, iters, all_indices)
185
+
186
+
187
+ def generate_embedding_dims(tbe_data_config: TBEDataConfig) -> Tuple[int, List[int]]:
188
+ if tbe_data_config.mixed_dim:
189
+ Ds = [
190
+ round_up(
191
+ int(
192
+ torch.randint(
193
+ low=int(0.5 * tbe_data_config.D),
194
+ high=int(1.5 * tbe_data_config.D),
195
+ size=(1,),
196
+ ).item()
197
+ ),
198
+ 4,
199
+ )
200
+ for _ in range(tbe_data_config.T)
201
+ ]
202
+ return (sum(Ds) // len(Ds), Ds)
203
+ else:
204
+ return (tbe_data_config.D, [tbe_data_config.D] * tbe_data_config.T)
205
+
206
+
207
+ def generate_feature_requires_grad(
208
+ tbe_data_config: TBEDataConfig, size: int
209
+ ) -> torch.Tensor:
210
+ assert (
211
+ size <= tbe_data_config.T
212
+ ), "size of feature_requires_grad must be less than T"
213
+ weighted_requires_grad_tables = torch.randperm(tbe_data_config.T)[:size].tolist()
214
+ return (
215
+ torch.tensor(
216
+ [
217
+ 1 if t in weighted_requires_grad_tables else 0
218
+ for t in range(tbe_data_config.T)
219
+ ]
220
+ )
221
+ .to(get_device())
222
+ .int()
223
+ )
@@ -11,8 +11,12 @@ import click
11
11
  import torch
12
12
  import yaml
13
13
 
14
- from .tbe_data_config import TBEDataConfig
15
- from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
14
+ from fbgemm_gpu.tbe.bench.tbe_data_config import (
15
+ BatchParams,
16
+ IndicesParams,
17
+ PoolingParams,
18
+ TBEDataConfig,
19
+ )
16
20
 
17
21
 
18
22
  class TBEDataConfigLoader:
@@ -248,6 +248,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
248
248
  self.total_hash_size_bits: int = 0
249
249
  else:
250
250
  self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
251
+ self.register_buffer(
252
+ "table_hash_size_cumsum",
253
+ torch.tensor(
254
+ hash_size_cumsum, device=self.current_device, dtype=torch.int64
255
+ ),
256
+ )
251
257
  # The last element is to easily access # of rows of each table by
252
258
  self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
253
259
  self.total_hash_size: int = hash_size_cumsum[-1]
@@ -288,6 +294,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
288
294
  "feature_dims",
289
295
  torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
290
296
  )
297
+ self.register_buffer(
298
+ "table_dims",
299
+ torch.tensor(dims, device="cpu", dtype=torch.int64),
300
+ )
291
301
 
292
302
  (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
293
303
  self.D_offsets, # unused tensor
@@ -518,6 +528,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
518
528
  logging.warning("dist is not initialized, treating as single gpu cases")
519
529
  tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
520
530
  self.tbe_unique_id = tbe_unique_id
531
+ self.l2_cache_size = l2_cache_size
521
532
  logging.info(f"tbe_unique_id: {tbe_unique_id}")
522
533
  if self.backend_type == BackendType.SSD:
523
534
  logging.info(
@@ -564,12 +575,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
564
575
  self.res_params.table_offsets,
565
576
  self.res_params.table_sizes,
566
577
  (
567
- tensor_pad4(self.feature_dims.cpu())
578
+ tensor_pad4(self.table_dims)
568
579
  if self.enable_optimizer_offloading
569
580
  else None
570
581
  ),
571
582
  (
572
- self.hash_size_cumsum.cpu()
583
+ self.table_hash_size_cumsum.cpu()
573
584
  if self.enable_optimizer_offloading
574
585
  else None
575
586
  ),
@@ -609,28 +620,42 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
609
620
  f"feature_dims={self.feature_dims},"
610
621
  f"hash_size_cumsum={self.hash_size_cumsum}"
611
622
  )
623
+ table_dims = (
624
+ tensor_pad4(self.table_dims)
625
+ if self.enable_optimizer_offloading
626
+ else None
627
+ ) # table_dims
628
+ eviction_config = None
629
+ if self.kv_zch_params and self.kv_zch_params.eviction_policy:
630
+ eviction_mem_threshold_gb = (
631
+ self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
632
+ if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
633
+ else self.l2_cache_size
634
+ )
635
+ eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
636
+ self.kv_zch_params.eviction_policy.eviction_trigger_mode, # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
637
+ self.kv_zch_params.eviction_policy.eviction_strategy, # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
638
+ self.kv_zch_params.eviction_policy.eviction_step_intervals, # trigger_step_interval if trigger mode is iteration
639
+ eviction_mem_threshold_gb, # mem_util_threshold_in_GB if trigger mode is mem_util
640
+ self.kv_zch_params.eviction_policy.ttls_in_mins, # ttls_in_mins for each table if eviction strategy is timestamp
641
+ self.kv_zch_params.eviction_policy.counter_thresholds, # counter_thresholds for each table if eviction strategy is feature score
642
+ self.kv_zch_params.eviction_policy.counter_decay_rates, # counter_decay_rates for each table if eviction strategy is feature score
643
+ self.kv_zch_params.eviction_policy.l2_weight_thresholds, # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
644
+ table_dims.tolist() if table_dims is not None else None,
645
+ self.kv_zch_params.eviction_policy.interval_for_insufficient_eviction_s,
646
+ self.kv_zch_params.eviction_policy.interval_for_sufficient_eviction_s,
647
+ )
612
648
  self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
613
649
  self.cache_row_dim,
614
650
  ssd_uniform_init_lower,
615
651
  ssd_uniform_init_upper,
616
- 0, # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
617
- 0, # trigger_step_interval if trigger mode is iteration
618
- 0, # mem_util_threshold_in_GB if trigger mode is mem_util
619
- 0, # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
620
- None, # count_thresholds for each table if eviction strategy is feature score
621
- None, # ttls_in_mins for each table if eviction strategy is timestamp
622
- None, # count_decay_rates for each table if eviction strategy is feature score
623
- None, # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
652
+ eviction_config,
624
653
  ssd_rocksdb_shards, # num_shards
625
654
  ssd_rocksdb_shards, # num_threads
626
655
  weights_precision.bit_rate(), # row_storage_bitwidth
656
+ table_dims,
627
657
  (
628
- tensor_pad4(self.feature_dims.cpu())
629
- if self.enable_optimizer_offloading
630
- else None
631
- ), # table_dims
632
- (
633
- self.hash_size_cumsum.cpu()
658
+ self.table_hash_size_cumsum.cpu()
634
659
  if self.enable_optimizer_offloading
635
660
  else None
636
661
  ), # hash_size_cumsum
@@ -2434,6 +2459,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2434
2459
  f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
2435
2460
  )
2436
2461
  elif self.backend_type == BackendType.DRAM:
2462
+ # if there is any ongoing eviction, let's wait until eviction is finished before state_dict
2463
+ # so that we can reach consistent model state before/after state_dict
2464
+ evict_wait_start_time = time.time()
2465
+ self.ssd_db.wait_until_eviction_done()
2466
+ logging.info(
2467
+ f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
2468
+ )
2437
2469
  self.flush(force=should_flush)
2438
2470
  return snapshot_handle, checkpoint_handle
2439
2471
 
@@ -8,26 +8,28 @@
8
8
  # pyre-strict
9
9
 
10
10
  import io
11
+ import json
11
12
  import logging
12
13
  import os
13
14
  from typing import List, Optional
14
15
 
15
16
  import fbgemm_gpu # noqa F401
16
- import numpy as np # usort:skip
17
17
  import torch # usort:skip
18
18
 
19
- from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
20
- SplitTableBatchedEmbeddingBagsCodegen,
21
- )
22
- from fbgemm_gpu.tbe.bench import (
19
+ from fbgemm_gpu.tbe.bench.tbe_data_config import (
23
20
  BatchParams,
24
21
  IndicesParams,
25
22
  PoolingParams,
26
23
  TBEDataConfig,
27
24
  )
28
25
 
29
- # pyre-ignore[16]
30
- open_source: bool = getattr(fbgemm_gpu, "open_source", False)
26
+ open_source: bool = False
27
+ try:
28
+ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
29
+ if getattr(fbgemm_gpu, "open_source", False):
30
+ open_source = True
31
+ except Exception:
32
+ pass
31
33
 
32
34
  if open_source:
33
35
  from fbgemm_gpu.utils import FileStore
@@ -43,7 +45,8 @@ class TBEBenchmarkParamsReporter:
43
45
  def __init__(
44
46
  self,
45
47
  report_interval: int,
46
- report_once: bool = False,
48
+ report_iter_start: int = 0,
49
+ report_iter_end: int = -1,
47
50
  bucket: Optional[str] = None,
48
51
  path_prefix: Optional[str] = None,
49
52
  ) -> None:
@@ -52,13 +55,30 @@ class TBEBenchmarkParamsReporter:
52
55
 
53
56
  Args:
54
57
  report_interval (int): The interval at which reports are generated.
55
- report_once (bool, optional): If True, reporting occurs only once. Defaults to False.
58
+ report_iter_start (int): The start of the iteration range to capture. Defaults to 0.
59
+ report_iter_end (int): The end of the iteration range to capture. Defaults to -1 (last iteration).
56
60
  bucket (Optional[str], optional): The storage bucket for reports. Defaults to None.
57
61
  path_prefix (Optional[str], optional): The path prefix for report storage. Defaults to None.
58
62
  """
63
+ assert report_interval > 0, "report_interval must be greater than 0"
64
+ assert (
65
+ report_iter_start >= 0
66
+ ), "report_iter_start must be greater than or equal to 0"
67
+ assert (
68
+ report_iter_end >= -1
69
+ ), "report_iter_end must be greater than or equal to -1"
70
+ assert (
71
+ report_iter_end == -1 or report_iter_start <= report_iter_end
72
+ ), "report_iter_start must be less than or equal to report_iter_end"
73
+
59
74
  self.report_interval = report_interval
60
- self.report_once = report_once
61
- self.has_reported = False
75
+ self.report_iter_start = report_iter_start
76
+ self.report_iter_end = report_iter_end
77
+
78
+ if path_prefix is not None and path_prefix.endswith("/"):
79
+ path_prefix = path_prefix[:-1]
80
+
81
+ self.path_prefix = path_prefix
62
82
 
63
83
  default_bucket = "/tmp" if open_source else "tlparse_reports"
64
84
  bucket = (
@@ -68,22 +88,65 @@ class TBEBenchmarkParamsReporter:
68
88
  )
69
89
  self.filestore = FileStore(bucket)
70
90
 
91
+ if self.path_prefix is not None and not self.filestore.exists(self.path_prefix):
92
+ self.filestore.create_directory(self.path_prefix)
93
+
71
94
  self.logger: logging.Logger = logging.getLogger(__name__)
72
95
  self.logger.setLevel(logging.INFO)
73
96
 
97
+ @classmethod
98
+ def create(cls) -> "TBEBenchmarkParamsReporter":
99
+ """
100
+ This method returns an instance of TBEBenchmarkParamsReporter based on environment variables.
101
+
102
+ The `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` environment variable sets the report interval (defaults to 1 when unset; must be greater than 0). The created instance:
103
+ - Reports input parameters (TBEDataConfig).
104
+ - Writes the output as a JSON file.
105
+
106
+ Additionally, the following environment variables are considered:
107
+ - `FBGEMM_REPORT_INPUT_PARAMS_ITER_START`: Specifies the start of the iteration range to capture.
108
+ - `FBGEMM_REPORT_INPUT_PARAMS_ITER_END`: Specifies the end of the iteration range to capture.
109
+ - `FBGEMM_REPORT_INPUT_PARAMS_BUCKET`: Specifies the bucket for reporting.
110
+ - `FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX`: Specifies the path prefix for reporting.
111
+
112
+ Returns:
113
+ TBEBenchmarkParamsReporter: An instance configured based on the environment variables.
114
+ """
115
+ report_interval = int(
116
+ os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_INTERVAL", "1")
117
+ )
118
+ report_iter_start = int(
119
+ os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_START", "0")
120
+ )
121
+ report_iter_end = int(
122
+ os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_END", "-1")
123
+ )
124
+ bucket = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_BUCKET", "")
125
+ path_prefix = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX", "")
126
+
127
+ return cls(
128
+ report_interval=report_interval,
129
+ report_iter_start=report_iter_start,
130
+ report_iter_end=report_iter_end,
131
+ bucket=bucket,
132
+ path_prefix=path_prefix,
133
+ )
134
+
74
135
  def extract_params(
75
136
  self,
76
- embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
137
+ feature_rows: torch.Tensor,
138
+ feature_dims: torch.Tensor,
77
139
  indices: torch.Tensor,
78
140
  offsets: torch.Tensor,
79
141
  per_sample_weights: Optional[torch.Tensor] = None,
80
142
  batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
81
143
  ) -> TBEDataConfig:
82
144
  """
83
- Extracts parameters from the embedding operation, input indices and offsets to create a TBEDataConfig.
145
+ Extracts parameters from the feature rows and dims, input indices, and offsets to create a TBEDataConfig.
84
146
 
85
147
  Args:
86
- embedding_op (SplitTableBatchedEmbeddingBagsCodegen): The embedding operation.
148
+ feature_rows (torch.Tensor): Number of rows in each feature.
149
+ feature_dims (torch.Tensor): Number of dimensions in each feature.
87
150
  indices (torch.Tensor): The input indices tensor.
88
151
  offsets (torch.Tensor): The input offsets tensor.
89
152
  per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
@@ -92,24 +155,33 @@ class TBEBenchmarkParamsReporter:
92
155
  Returns:
93
156
  TBEDataConfig: The configuration data for TBE benchmarking.
94
157
  """
158
+
159
+ Es = feature_rows.tolist()
160
+ Ds = feature_dims.tolist()
161
+
162
+ assert len(Es) == len(
163
+ Ds
164
+ ), "feature_rows and feature_dims must have the same length"
165
+
95
166
  # Transfer indices back to CPU for EEG analysis
96
167
  indices_cpu = indices.cpu()
97
168
 
98
- # Extract embedding table specs
99
- embedding_specs = [
100
- embedding_op.embedding_specs[t] for t in embedding_op.feature_table_map
101
- ]
102
- rowcounts = [embedding_spec[0] for embedding_spec in embedding_specs]
103
- dims = [embedding_spec[1] for embedding_spec in embedding_specs]
104
-
105
169
  # Set T to be the number of features we are looking at
106
- T = len(embedding_op.feature_table_map)
170
+ T = len(Ds)
107
171
  # Set E to be the mean of the rowcounts to avoid biasing
108
- E = rowcounts[0] if len(set(rowcounts)) == 1 else np.ceil((np.mean(rowcounts)))
172
+ E = (
173
+ Es[0]
174
+ if len(set(Es)) == 1
175
+ else torch.ceil(torch.mean(torch.tensor(feature_rows)))
176
+ )
109
177
  # Set mixed_dim to be True if there are multiple dims
110
- mixed_dim = len(set(dims)) > 1
178
+ mixed_dim = len(set(Ds)) > 1
111
179
  # Set D to be the mean of the dims to avoid biasing
112
- D = dims[0] if not mixed_dim else np.ceil((np.mean(dims)))
180
+ D = (
181
+ Ds[0]
182
+ if not mixed_dim
183
+ else torch.ceil(torch.mean(torch.tensor(feature_dims)))
184
+ )
113
185
 
114
186
  # Compute indices distribution parameters
115
187
  heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(
@@ -123,8 +195,18 @@ class TBEBenchmarkParamsReporter:
123
195
  batch_params = BatchParams(
124
196
  B=((offsets.numel() - 1) // T),
125
197
  sigma_B=(
126
- np.ceil(
127
- np.std([b for bs in batch_size_per_feature_per_rank for b in bs])
198
+ int(
199
+ torch.ceil(
200
+ torch.std(
201
+ torch.tensor(
202
+ [
203
+ b
204
+ for bs in batch_size_per_feature_per_rank
205
+ for b in bs
206
+ ]
207
+ )
208
+ )
209
+ )
128
210
  )
129
211
  if batch_size_per_feature_per_rank
130
212
  else None
@@ -138,11 +220,19 @@ class TBEBenchmarkParamsReporter:
138
220
  )
139
221
 
140
222
  # Compute pooling parameters
141
- bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
223
+ bag_sizes = offsets[1:] - offsets[:-1]
142
224
  mixed_bag_sizes = len(set(bag_sizes)) > 1
143
225
  pooling_params = PoolingParams(
144
- L=np.ceil(np.mean(bag_sizes)) if mixed_bag_sizes else bag_sizes[0],
145
- sigma_L=(np.ceil(np.std(bag_sizes)) if mixed_bag_sizes else None),
226
+ L=(
227
+ int(torch.ceil(torch.mean(bag_sizes.float())))
228
+ if mixed_bag_sizes
229
+ else int(bag_sizes[0])
230
+ ),
231
+ sigma_L=(
232
+ int(torch.ceil(torch.std(bag_sizes.float())))
233
+ if mixed_bag_sizes
234
+ else None
235
+ ),
146
236
  length_distribution=("normal" if mixed_bag_sizes else None),
147
237
  )
148
238
 
@@ -160,34 +250,58 @@ class TBEBenchmarkParamsReporter:
160
250
 
161
251
  def report_stats(
162
252
  self,
163
- embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
253
+ feature_rows: torch.Tensor,
254
+ feature_dims: torch.Tensor,
255
+ iteration: int,
164
256
  indices: torch.Tensor,
165
257
  offsets: torch.Tensor,
258
+ op_id: str = "",
166
259
  per_sample_weights: Optional[torch.Tensor] = None,
167
260
  batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
168
261
  ) -> None:
169
262
  """
170
- Reports the configuration of the embedding operation and input data then writes the TBE configuration to the filestore.
263
+ Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
171
264
 
172
265
  Args:
173
- embedding_op (SplitTableBatchedEmbeddingBagsCodegen): The embedding operation.
266
+ feature_rows (torch.Tensor): Number of rows in each feature.
267
+ feature_dims (torch.Tensor): Number of dimensions in each feature.
268
+ iteration (int): The current iteration number.
174
269
  indices (torch.Tensor): The input indices tensor.
175
270
  offsets (torch.Tensor): The input offsets tensor.
271
+ op_id (str, optional): The operation identifier. Defaults to an empty string.
176
272
  per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
177
273
  batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
178
274
  """
179
- if embedding_op.iter.item() % self.report_interval == 0 and (
180
- not self.report_once or (self.report_once and not self.has_reported)
275
+ if (
276
+ (iteration - self.report_iter_start) % self.report_interval == 0
277
+ and (iteration >= self.report_iter_start)
278
+ and (self.report_iter_end == -1 or iteration <= self.report_iter_end)
181
279
  ):
182
280
  # Extract TBE config
183
281
  config = self.extract_params(
184
- embedding_op, indices, offsets, per_sample_weights
282
+ feature_rows=feature_rows,
283
+ feature_dims=feature_dims,
284
+ indices=indices,
285
+ offsets=offsets,
286
+ per_sample_weights=per_sample_weights,
287
+ batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
185
288
  )
186
289
 
290
+ config.json()
291
+
292
+ # Ad-hoc fix for adding Es and Ds to JSON output
293
+ # TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
294
+ adhoc_config = config.dict()
295
+ adhoc_config["Es"] = feature_rows.tolist()
296
+ adhoc_config["Ds"] = feature_dims.tolist()
297
+ if batch_size_per_feature_per_rank:
298
+ adhoc_config["Bs"] = [
299
+ sum(batch_size_per_feature_per_rank[f])
300
+ for f in range(len(adhoc_config["Es"]))
301
+ ]
302
+
187
303
  # Write the TBE config to FileStore
188
304
  self.filestore.write(
189
- f"tbe-{embedding_op.uuid}-config-estimation-{embedding_op.iter.item()}.json",
190
- io.BytesIO(config.json(format=True).encode()),
305
+ f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
306
+ io.BytesIO(json.dumps(adhoc_config, indent=2).encode()),
191
307
  )
192
-
193
- self.has_reported = True
@@ -14,9 +14,6 @@ import numpy as np
14
14
  import numpy.typing as npt
15
15
  import torch
16
16
 
17
- # pyre-fixme[21]: Could not find name `default_rng` in `numpy.random` (stubbed).
18
- from numpy.random import default_rng
19
-
20
17
  from .common import get_device
21
18
  from .offsets import get_table_batched_offsets_from_dense
22
19
 
@@ -309,11 +306,9 @@ def generate_indices_zipf(
309
306
  indices, torch.tensor([0, L], dtype=torch.long), True
310
307
  )
311
308
  if deterministic_output:
312
- rng = default_rng(12345)
313
- else:
314
- rng = default_rng()
309
+ np.random.seed(12345)
315
310
  permutation = torch.as_tensor(
316
- rng.choice(E, size=indices.max().item() + 1, replace=False)
311
+ np.random.choice(E, size=indices.max().item() + 1, replace=False)
317
312
  )
318
313
  indices = permutation.gather(0, indices.flatten())
319
314
  indices = indices.to(get_device()).int()
@@ -11,7 +11,6 @@
11
11
  import io
12
12
  import logging
13
13
  import os
14
- import shutil
15
14
  from dataclasses import dataclass
16
15
  from pathlib import Path
17
16
  from typing import BinaryIO, Union
@@ -76,7 +75,12 @@ class FileStore:
76
75
  elif isinstance(raw_input, Path):
77
76
  if not os.path.exists(raw_input):
78
77
  raise FileNotFoundError(f"File {raw_input} does not exist")
79
- shutil.copyfile(raw_input, filepath)
78
+ # Open the source file and destination file, and copy the contents
79
+ with open(raw_input, "rb") as src_file, open(
80
+ filepath, "wb"
81
+ ) as dst_file:
82
+ while chunk := src_file.read(4096): # Read 4 KB at a time
83
+ dst_file.write(chunk)
80
84
 
81
85
  elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
82
86
  with open(filepath, "wb") as file:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fbgemm_gpu_hstu_nightly
3
- Version: 2025.6.17
3
+ Version: 2025.6.18
4
4
  Home-page: https://github.com/pytorch/fbgemm
5
5
  Author: FBGEMM Team
6
6
  Author-email: packages@pytorch.org
@@ -15,15 +15,15 @@ fbgemm_gpu/split_embedding_inference_converter.py,sha256=ilVVowkTiY0WDpOYorj917T
15
15
  fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
16
16
  fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
17
17
  fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
18
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=qglNRKKuHkrKiTw90ACjZpMzcjHKXKV7ME3a8QHfQt4,8237
18
+ fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=qbc1n-PPWKc75H0lXxK5kuCCprh4xEMS8A0TiE5fbHs,9906
19
19
  fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=bUDWa6IR0vGLDThgB3nmD1yfYa8_HD34B0dtLnd7thw,81692
20
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=GY3Wm5X2utjIXcjIcHwByzI38hLa9NlShtHg58wIOOU,164383
20
+ fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=IrGhStc8TSwvxjgPtwIVDmfjsXbThmh64pVulNhMR9M,166355
21
21
  fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=ktC10-nakOBpcmJNCOGQsxuBCP8XTwXJ2WeEgIg91tc,5455
22
22
  fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
23
23
  fbgemm_gpu/tbe_input_multiplexer.py,sha256=DjU7dPHgAT1avXGvgi8SFfw2Pq7yT8S_7IH8qCXoptA,3069
24
24
  fbgemm_gpu/uvm.py,sha256=-cZunsuvnAKUEQptIwdYVar_3hUE99FbQUsyfBVeXPE,925
25
25
  fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
26
- fbgemm_gpu/config/feature_list.py,sha256=kBXRV_3Hc-eC2zy5YGo_viS0t7awojQzmkHE7AgATuY,2267
26
+ fbgemm_gpu/config/feature_list.py,sha256=04l_k0t6nkLRxnvSeO4ZjkGj_If9KQGl8PTl-HmxOIQ,2441
27
27
  fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
28
28
  fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
29
29
  fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,10 +32,10 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
32
32
  fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
33
33
  fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
34
34
  fbgemm_gpu/docs/sparse_ops.py,sha256=NTcTm0q9h8W2B8PKPoic2fHsAaCbCYunSa_EYK0LtHQ,21382
35
- fbgemm_gpu/docs/version.py,sha256=mYY8Au7MNpxkPW4e6-KmEl1zCys0U_I92tLFTvoN8Oc,315
35
+ fbgemm_gpu/docs/version.py,sha256=gWwQLocgNkScd-zRubMAconDahzJwrllhvezB_jXyQs,315
36
36
  fbgemm_gpu/experimental/hstu/__init__.py,sha256=KNisP6qDMwgjgxkGlqUZRNjJ_8o8R-cTmm3HxF7pSqI,1564
37
37
  fbgemm_gpu/experimental/hstu/cuda_hstu_attention.py,sha256=5425GRjJuzpXQC-TowgQOCFjZmOwv_EK0lKbURhHBTQ,9920
38
- fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=m0lq55NT281t05PPW3dH2uBVT9AtUSH5uHGsL_Mut7g,352696288
38
+ fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=svpW3UvcmqdzJMNC1rSNKDKtTBWT3Ht8LVdGZZRve1U,352696288
39
39
  fbgemm_gpu/quantize/__init__.py,sha256=pftciXHE7csekDFkl7Ui1AWglVMMnSrOO04mREnUdb0,921
40
40
  fbgemm_gpu/quantize/quantize_ops.py,sha256=25AIOv9n2UoxamMUaI6EK1Ur4gSHxbZIReHBtgOjjCs,2228
41
41
  fbgemm_gpu/sll/__init__.py,sha256=rgXh35-OFUE54E9gGBq3NGxouGvgMv2ccY2bWUTxONY,4191
@@ -64,8 +64,9 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=T8Wa1PeRyFZ0Ge-SErHQEYDY8LvHVoCV_qQlE_6kE
64
64
  fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=mdG3JZwgclp6DiVwQSKl5jrirLSId4OuM64knj9TkEk,4973
65
65
  fbgemm_gpu/tbe/bench/eval_compression.py,sha256=bINVERk42VJDSdenQHKWApmRMrW8rhkevOgE0hDR-S8,3499
66
66
  fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
67
- fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=uvepBrbzERALBB-RPZVGFra4a8ALCqsOe9X6iWpqAyU,9413
68
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=eSzD7JP4oIo3LP6nVhqwfxfeDARSy-TS48ue-5duodE,7519
67
+ fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=0NxlQtvBb4BBeBiK8DaMVByyJjgzFFgrAsGQt-EFqgM,2913
68
+ fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=uIFdxCBgrz3_l6C9fmE2bMmULhK1eX5ZfB78Pz7tjkw,7312
69
+ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=ajDmXjxNLxtHu8728CsSZQmuT6nra82jTb9uJJE3yzI,7519
69
70
  fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=tuEQgffV-_zGS4zza1I3x9ZWOYGh9jl3Aal1g-78veE,5852
70
71
  fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
71
72
  fbgemm_gpu/tbe/cache/__init__.py,sha256=oM-g5nq0EXZgO79C6DhAl_Om9FTPC-WiaqclQCG3HTk,323
@@ -73,16 +74,16 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=mQkCl0xN8xUu5bjEWcOOFN
73
74
  fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
74
75
  fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
75
76
  fbgemm_gpu/tbe/ssd/inference.py,sha256=DTjwj3f6JaUMcecWoRNkZpRgXDJ-eE3grtixYwKb5DI,22829
76
- fbgemm_gpu/tbe/ssd/training.py,sha256=Dx-rJqjrD1A4U4MEVaP3OJl3CZz0VRSTWcukx5557Jw,131715
77
+ fbgemm_gpu/tbe/ssd/training.py,sha256=GnhVZOxkgYoDgYOh34xL1pg5SwncSoLMv48mSHt4lQc,133710
77
78
  fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
78
79
  fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=uwwEdUiaVlnWZ_rQax2z28VYROfivdMqIdWLy8IZ6cE,7646
79
80
  fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
80
- fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=7XIWVObJOxSVUG73xsd_lVSuCFUQkMEGSWW--BoyCH0,7358
81
+ fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=9HCR8Y0j_5oWGn1KRSNNYKGf_pmbGZyKT_KII8qf2Fc,11670
81
82
  fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
82
83
  fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
83
84
  fbgemm_gpu/tbe/utils/offsets.py,sha256=bs08kDiQ54oucZl6rmPLDs-bN6m1EMa1Wju06mCKZBY,1917
84
85
  fbgemm_gpu/tbe/utils/quantize.py,sha256=byjmzGpUjXD_UVAiBKyszmWlzYLkQxq5HBs6hBOuHZo,9185
85
- fbgemm_gpu/tbe/utils/requests.py,sha256=JR100WetSbj0X6FC431ysG8i7eb1T1Ej_GYt9DnNAjk,18053
86
+ fbgemm_gpu/tbe/utils/requests.py,sha256=uyWxOYxsmSyd48UhPHvDEdxbb-_zDV60FuoKiuTzMdM,17899
86
87
  fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
87
88
  fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
88
89
  fbgemm_gpu/triton/quantize.py,sha256=K5pqBQqs4YsD5m5TibZCbkd0E4Si0i_xcpIeF1B6jA0,26815
@@ -90,10 +91,10 @@ fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2M
90
91
  fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
91
92
  fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=AIC1G6_QBQtMVTyOyEV4ZKJyDzu36UI_9HDgWmZIRaA,29884
92
93
  fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,354
93
- fbgemm_gpu/utils/filestore.py,sha256=ijoJYDqHUQlv0OrEtLrgSjkNCreCvy5ZXHfd8atwewc,6186
94
+ fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
94
95
  fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
95
96
  fbgemm_gpu/utils/torch_library.py,sha256=dQcHv1qgpu5QYlJjxjd6oeHjtxnmmXzx3PL6vjCmxL4,4199
96
- fbgemm_gpu_hstu_nightly-2025.6.17.dist-info/METADATA,sha256=JW6tQAPy8jF9_SLsAZ8BY4c4YnTy3uVQaEc2ac8VyA4,2654
97
- fbgemm_gpu_hstu_nightly-2025.6.17.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
98
- fbgemm_gpu_hstu_nightly-2025.6.17.dist-info/top_level.txt,sha256=2tlbTWLkPjhqvLF_6BbqKzkcPluSE-oPRVjI8axK76I,11
99
- fbgemm_gpu_hstu_nightly-2025.6.17.dist-info/RECORD,,
97
+ fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/METADATA,sha256=rFbfG2H1ql2hm2bSjq3oSTKiMe3RXdKYafu9kp7D4qU,2654
98
+ fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
99
+ fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/top_level.txt,sha256=2tlbTWLkPjhqvLF_6BbqKzkcPluSE-oPRVjI8axK76I,11
100
+ fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/RECORD,,