fbgemm-gpu-hstu-nightly 2025.6.16__cp313-cp313-manylinux_2_28_x86_64.whl → 2025.6.18__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/config/feature_list.py +6 -0
- fbgemm_gpu/docs/version.py +1 -1
- fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +39 -1
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +65 -2
- fbgemm_gpu/tbe/bench/tbe_data_config.py +2 -182
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +223 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +6 -2
- fbgemm_gpu/tbe/ssd/training.py +48 -16
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +154 -40
- fbgemm_gpu/tbe/utils/requests.py +2 -7
- fbgemm_gpu/utils/filestore.py +56 -3
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/METADATA +1 -4
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/RECORD +17 -16
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/top_level.txt +0 -0
fbgemm_gpu/config/feature_list.py
CHANGED

@@ -60,6 +60,12 @@ class FeatureGateName(Enum):
     # Enable bounds_check_indices_v2
     BOUNDS_CHECK_INDICES_V2 = auto()
 
+    # Disable FP8 quantization vectorization
+    DISABLE_FP8_QUANT_VECTORIZATION = auto()
+
+    # Enable TBE input parameters extraction
+    TBE_REPORT_INPUT_PARAMS = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)
 
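Note: the sketch below is not part of the diff; it shows how the new gates can be queried through the same is_enabled() method visible in the hunk above. The import path is an assumption based on this package's fbgemm_gpu.config module.

# Hedged usage sketch for the new feature gates.
from fbgemm_gpu.config import FeatureGateName

if FeatureGateName.TBE_REPORT_INPUT_PARAMS.is_enabled():
    print("TBE input-parameter reporting is gated on")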
fbgemm_gpu/docs/version.py
CHANGED

fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so
CHANGED
Binary file

fbgemm_gpu/fbgemm.so
CHANGED
Binary file
fbgemm_gpu/split_table_batched_embeddings_ops_common.py
CHANGED

@@ -11,7 +11,7 @@
 
 import enum
 from dataclasses import dataclass
-from typing import List, NamedTuple, Tuple
+from typing import List, NamedTuple, Optional, Tuple
 
 import torch
 from torch import Tensor

@@ -60,6 +60,43 @@ class EmbeddingLocation(enum.IntEnum):
         raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
 
 
+class EvictionPolicy(NamedTuple):
+    eviction_trigger_mode: int = (
+        0  # disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+    )
+    eviction_strategy: int = (
+        0  # 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+    )
+    eviction_step_intervals: Optional[int] = (
+        None  # trigger_step_interval if trigger mode is iteration
+    )
+    eviction_mem_threshold_gb: Optional[int] = (
+        None  # eviction trigger condition if trigger mode is mem_util
+    )
+    counter_thresholds: Optional[List[int]] = (
+        None  # count_thresholds for each table if eviction strategy is feature score
+    )
+    ttls_in_mins: Optional[List[int]] = (
+        None  # ttls_in_mins for each table if eviction strategy is timestamp
+    )
+    counter_decay_rates: Optional[List[float]] = (
+        None  # count_decay_rates for each table if eviction strategy is feature score
+    )
+    l2_weight_thresholds: Optional[List[float]] = (
+        None  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+    )
+    interval_for_insufficient_eviction_s: int = (
+        # wait at least # seconds before trigger next round of eviction, if last finished eviction is insufficient
+        # insufficient means we didn't evict enough rows, so we want to wait longer time to
+        # avoid another insufficient eviction
+        600
+    )
+    interval_for_sufficient_eviction_s: int = (
+        # wait at least # seconds before trigger next round of eviction, if last finished eviction is sufficient
+        60
+    )
+
+
 class KVZCHParams(NamedTuple):
     # global bucket id start and global bucket id end offsets for each logical table,
     # where start offset is inclusive and end offset is exclusive

@@ -69,6 +106,7 @@ class KVZCHParams(NamedTuple):
     bucket_sizes: List[int] = []
     # enable optimizer offloading or not
    enable_optimizer_offloading: bool = False
+    eviction_policy: Optional[EvictionPolicy] = None
 
     def validate(self) -> None:
         assert len(self.bucket_offsets) == len(self.bucket_sizes), (
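Note: the sketch below is not part of the diff; it illustrates how the new EvictionPolicy and eviction_policy field above might be constructed, assuming only the field semantics spelled out in the inline comments. The values are illustrative, and any additional constraints enforced by validate() are not shown here.

# Hedged sketch: iteration-triggered (mode 1), timestamp-based (strategy 0)
# eviction, checked every 1000 steps, with a 60-minute TTL for one table.
from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
    EvictionPolicy,
    KVZCHParams,
)

policy = EvictionPolicy(
    eviction_trigger_mode=1,     # 1: iteration-based trigger
    eviction_strategy=0,         # 0: timestamp
    eviction_step_intervals=1000,
    ttls_in_mins=[60],           # one TTL per table
)

# Attach the policy to the KV-ZCH parameters; other fields keep their defaults.
params = KVZCHParams(eviction_policy=policy)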
fbgemm_gpu/split_table_batched_embeddings_ops_training.py
CHANGED

@@ -51,6 +51,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
+from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter
 from fbgemm_gpu.tbe_input_multiplexer import (
     TBEInfo,
     TBEInputInfo,

@@ -1441,6 +1442,11 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self._debug_print_input_stats_factory()
         )
 
+        # Get a reporter function pointer
+        self._report_input_params: Callable[..., None] = (
+            self.__report_input_params_factory()
+        )
+
         if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
             # Register writeback hook for Exact_SGD optimizer
             self.log(

@@ -1947,11 +1953,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             per_sample_weights,
             batch_size_per_feature_per_rank,
             force_cast_input_types=True,
+            prefetch_pipeline=False,
         )
 
         # Print input stats if enable (for debugging purpose only)
         self._debug_print_input_stats(indices, offsets, per_sample_weights)
 
+        # Extract and Write input stats if enable
+        self._report_input_params(
+            feature_rows=self.rows_per_table,
+            feature_dims=self.feature_dims,
+            iteration=self.iter.item() if hasattr(self, "iter") else 0,
+            indices=indices,
+            offsets=offsets,
+            op_id=self.uuid,
+            per_sample_weights=per_sample_weights,
+            batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+        )
+
         if not is_torchdynamo_compiling():
             # Mutations of nn.Module attr forces dynamo restart of Analysis which increases compilation time
 
@@ -2478,6 +2497,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             per_sample_weights=None,
             batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
             force_cast_input_types=False,
+            prefetch_pipeline=self.prefetch_pipeline,
         )
 
         with self._recording_to_timer(

@@ -3543,6 +3563,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         per_sample_weights: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
         force_cast_input_types: bool = True,
+        prefetch_pipeline: bool = False,
    ) -> Tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
        """
        Prepare TBE inputs as follows:

@@ -3613,9 +3634,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
            per_sample_weights = per_sample_weights.float()
 
        if self.bounds_check_mode_int != BoundsCheckMode.NONE.value:
+            # Override the bounds check version based on prefetch_pipeline
+            use_bounds_check_v2 = self.bounds_check_version == 2 or prefetch_pipeline
+            bounds_check_version = (
+                2 if use_bounds_check_v2 else self.bounds_check_version
+            )
+
+            vbe = vbe_metadata.B_offsets is not None
+
            # Compute B info and VBE metadata for bounds_check_indices only if
            # VBE and bounds check indices v2 are used
-            if vbe and
+            if vbe and use_bounds_check_v2:
                B_offsets = vbe_metadata.B_offsets
                B_offsets_rank_per_feature = vbe_metadata.B_offsets_rank_per_feature
                output_offsets_feature_rank = vbe_metadata.output_offsets_feature_rank

@@ -3653,7 +3682,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                b_t_map=b_t_map,
                info_B_num_bits=self.info_B_num_bits,
                info_B_mask=self.info_B_mask,
-                bounds_check_version=
+                bounds_check_version=bounds_check_version,
+                prefetch_pipeline=prefetch_pipeline,
            )
 
        return indices, offsets, per_sample_weights, vbe_metadata

@@ -3792,6 +3822,39 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
            return _debug_print_input_stats_factory_impl
        return _debug_print_input_stats_factory_null
 
+    @torch.jit.ignore
+    def __report_input_params_factory(
+        self,
+    ) -> Callable[..., None]:
+        """
+        This function returns a function pointer based on the environment variable `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL`.
+
+        If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is set to a value greater than 0, it returns a function pointer that:
+        - Reports input parameters (TBEDataConfig).
+        - Writes the output as a JSON file.
+
+        If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is not set or is set to 0, it returns a dummy function pointer that performs no action.
+        """
+
+        @torch.jit.ignore
+        def __report_input_params_factory_null(
+            feature_rows: Tensor,
+            feature_dims: Tensor,
+            iteration: int,
+            indices: Tensor,
+            offsets: Tensor,
+            op_id: Optional[str] = None,
+            per_sample_weights: Optional[Tensor] = None,
+            batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        ) -> None:
+            pass
+
+        if self._feature_is_enabled(FeatureGateName.TBE_REPORT_INPUT_PARAMS):
+
+            reporter = TBEBenchmarkParamsReporter.create()
+            return reporter.report_stats
+        return __report_input_params_factory_null
+
 
 class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
     """
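Note: the sketch below is not part of the diff; it only restates, as runnable configuration, how the new reporting path above is switched on. The environment variable names come from this diff; whether the TBE_REPORT_INPUT_PARAMS feature gate is enabled by default is not shown here.

# Hedged sketch: enable TBE input-parameter reporting before constructing the TBE module.
import os

os.environ["FBGEMM_REPORT_INPUT_PARAMS_INTERVAL"] = "100"    # report every 100 iterations
os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_START"] = "0"
os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_END"] = "-1"     # -1 = no upper bound
os.environ["FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX"] = "tbe_reports"

# With the gate on, __init__ wires self._report_input_params to
# TBEBenchmarkParamsReporter.report_stats, and forward() calls it with
# feature_rows, feature_dims, iteration, indices, offsets, op_id,
# per_sample_weights, and batch_size_per_feature_per_rank, as shown above.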
fbgemm_gpu/tbe/bench/tbe_data_config.py
CHANGED

@@ -9,19 +9,11 @@
 
 import dataclasses
 import json
-from typing import Any, Dict,
+from typing import Any, Dict, Optional
 
-import numpy as np
 import torch
 
-from fbgemm_gpu.tbe.utils.common import get_device
-from fbgemm_gpu.tbe.utils.requests import (
-    generate_batch_sizes_from_stats,
-    generate_pooling_factors_from_stats,
-    get_table_batched_offsets_from_dense,
-    maybe_to_dtype,
-    TBERequest,
-)
+from fbgemm_gpu.tbe.utils.common import get_device
 
 from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
 

@@ -104,175 +96,3 @@ class TBEDataConfig:
     def _new_weights(self, size: int) -> Optional[torch.Tensor]:
         # Per-sample weights will always be FP32
         return None if not self.weighted else torch.randn(size, device=get_device())
-
-    def _generate_batch_sizes(self) -> Tuple[List[int], Optional[List[List[int]]]]:
-        if self.variable_B():
-            assert (
-                self.batch_params.vbe_num_ranks is not None
-            ), "vbe_num_ranks must be set for varaible batch size generation"
-            return generate_batch_sizes_from_stats(
-                self.batch_params.B,
-                self.T,
-                # pyre-ignore [6]
-                self.batch_params.sigma_B,
-                self.batch_params.vbe_num_ranks,
-                # pyre-ignore [6]
-                self.batch_params.vbe_distribution,
-            )
-
-        else:
-            return ([self.batch_params.B] * self.T, None)
-
-    def _generate_pooling_info(self, iters: int, Bs: List[int]) -> torch.Tensor:
-        if self.variable_L():
-            # Generate L from stats
-            _, L_offsets = generate_pooling_factors_from_stats(
-                iters,
-                Bs,
-                self.pooling_params.L,
-                # pyre-ignore [6]
-                self.pooling_params.sigma_L,
-                # pyre-ignore [6]
-                self.pooling_params.length_distribution,
-            )
-
-        else:
-            Ls = [self.pooling_params.L] * (sum(Bs) * iters)
-            L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
-
-        return L_offsets
-
-    def _generate_indices(
-        self,
-        iters: int,
-        Bs: List[int],
-        L_offsets: torch.Tensor,
-    ) -> torch.Tensor:
-        total_B = sum(Bs)
-        L_offsets_list = L_offsets.tolist()
-        indices_list = []
-        for it in range(iters):
-            # L_offsets is defined over the entire set of batches for a single iteration
-            start_offset = L_offsets_list[it * total_B]
-            end_offset = L_offsets_list[(it + 1) * total_B]
-
-            indices_list.append(
-                torch.ops.fbgemm.tbe_generate_indices_from_distribution(
-                    self.indices_params.heavy_hitters,
-                    self.indices_params.zipf_q,
-                    self.indices_params.zipf_s,
-                    # max_index = dimensions of the embedding table
-                    self.E,
-                    # num_indices = number of indices to generate
-                    end_offset - start_offset,
-                )
-            )
-
-        return torch.cat(indices_list)
-
-    def _build_requests_jagged(
-        self,
-        iters: int,
-        Bs: List[int],
-        Bs_feature_rank: Optional[List[List[int]]],
-        L_offsets: torch.Tensor,
-        all_indices: torch.Tensor,
-    ) -> List[TBERequest]:
-        total_B = sum(Bs)
-        all_indices = all_indices.flatten()
-        requests = []
-        for it in range(iters):
-            start_offset = L_offsets[it * total_B]
-            it_L_offsets = torch.concat(
-                [
-                    torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
-                    L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
-                ]
-            )
-            requests.append(
-                TBERequest(
-                    maybe_to_dtype(
-                        all_indices[start_offset : L_offsets[(it + 1) * total_B]],
-                        self.indices_params.index_dtype,
-                    ),
-                    maybe_to_dtype(
-                        it_L_offsets.to(get_device()), self.indices_params.offset_dtype
-                    ),
-                    self._new_weights(int(it_L_offsets[-1].item())),
-                    Bs_feature_rank if self.variable_B() else None,
-                )
-            )
-        return requests
-
-    def _build_requests_dense(
-        self, iters: int, all_indices: torch.Tensor
-    ) -> List[TBERequest]:
-        # NOTE: We're using existing code from requests.py to build the
-        # requests, and since the existing code requires 2D view of all_indices,
-        # the existing all_indices must be reshaped
-        all_indices = all_indices.reshape(iters, -1)
-
-        requests = []
-        for it in range(iters):
-            indices, offsets = get_table_batched_offsets_from_dense(
-                all_indices[it].view(
-                    self.T, self.batch_params.B, self.pooling_params.L
-                ),
-                use_cpu=self.use_cpu,
-            )
-            requests.append(
-                TBERequest(
-                    maybe_to_dtype(indices, self.indices_params.index_dtype),
-                    maybe_to_dtype(offsets, self.indices_params.offset_dtype),
-                    self._new_weights(
-                        self.T * self.batch_params.B * self.pooling_params.L
-                    ),
-                )
-            )
-        return requests
-
-    def generate_requests(
-        self,
-        iters: int = 1,
-    ) -> List[TBERequest]:
-        # Generate batch sizes
-        Bs, Bs_feature_rank = self._generate_batch_sizes()
-
-        # Generate pooling info
-        L_offsets = self._generate_pooling_info(iters, Bs)
-
-        # Generate indices
-        all_indices = self._generate_indices(iters, Bs, L_offsets)
-
-        # Build TBE requests
-        if self.variable_B() or self.variable_L():
-            return self._build_requests_jagged(
-                iters, Bs, Bs_feature_rank, L_offsets, all_indices
-            )
-        else:
-            return self._build_requests_dense(iters, all_indices)
-
-    def generate_embedding_dims(self) -> Tuple[int, List[int]]:
-        if self.mixed_dim:
-            Ds = [
-                round_up(
-                    np.random.randint(low=int(0.5 * self.D), high=int(1.5 * self.D)), 4
-                )
-                for _ in range(self.T)
-            ]
-            return (int(np.average(Ds)), Ds)
-        else:
-            return (self.D, [self.D] * self.T)
-
-    def generate_feature_requires_grad(self, size: int) -> torch.Tensor:
-        assert size <= self.T, "size of feature_requires_grad must be less than T"
-        weighted_requires_grad_tables = np.random.choice(
-            self.T, replace=False, size=(size,)
-        ).tolist()
-        return (
-            torch.tensor(
-                [1 if t in weighted_requires_grad_tables else 0 for t in range(self.T)]
-            )
-            .to(get_device())
-            .int()
-        )
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py
ADDED

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import List, Optional, Tuple
+
+import torch
+
+from fbgemm_gpu.tbe.bench import TBEDataConfig
+from fbgemm_gpu.tbe.utils.common import get_device, round_up
+
+from fbgemm_gpu.tbe.utils.requests import (
+    generate_batch_sizes_from_stats,
+    generate_pooling_factors_from_stats,
+    get_table_batched_offsets_from_dense,
+    maybe_to_dtype,
+    TBERequest,
+)
+
+
+def _generate_batch_sizes(
+    tbe_data_config: TBEDataConfig,
+) -> Tuple[List[int], Optional[List[List[int]]]]:
+    if tbe_data_config.variable_B():
+        assert (
+            tbe_data_config.batch_params.vbe_num_ranks is not None
+        ), "vbe_num_ranks must be set for varaible batch size generation"
+        return generate_batch_sizes_from_stats(
+            tbe_data_config.batch_params.B,
+            tbe_data_config.T,
+            # pyre-ignore [6]
+            tbe_data_config.batch_params.sigma_B,
+            tbe_data_config.batch_params.vbe_num_ranks,
+            # pyre-ignore [6]
+            tbe_data_config.batch_params.vbe_distribution,
+        )
+
+    else:
+        return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
+
+
+def _generate_pooling_info(
+    tbe_data_config: TBEDataConfig, iters: int, Bs: List[int]
+) -> torch.Tensor:
+    if tbe_data_config.variable_L():
+        # Generate L from stats
+        _, L_offsets = generate_pooling_factors_from_stats(
+            iters,
+            Bs,
+            tbe_data_config.pooling_params.L,
+            # pyre-ignore [6]
+            tbe_data_config.pooling_params.sigma_L,
+            # pyre-ignore [6]
+            tbe_data_config.pooling_params.length_distribution,
+        )
+    else:
+        Ls = [tbe_data_config.pooling_params.L] * (sum(Bs) * iters)
+        L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
+
+    return L_offsets
+
+
+def _generate_indices(
+    tbe_data_config: TBEDataConfig,
+    iters: int,
+    Bs: List[int],
+    L_offsets: torch.Tensor,
+) -> torch.Tensor:
+    total_B = sum(Bs)
+    L_offsets_list = L_offsets.tolist()
+    indices_list = []
+    for it in range(iters):
+        # L_offsets is defined over the entire set of batches for a single iteration
+        start_offset = L_offsets_list[it * total_B]
+        end_offset = L_offsets_list[(it + 1) * total_B]
+
+        indices_list.append(
+            torch.ops.fbgemm.tbe_generate_indices_from_distribution(
+                tbe_data_config.indices_params.heavy_hitters,
+                tbe_data_config.indices_params.zipf_q,
+                tbe_data_config.indices_params.zipf_s,
+                # max_index = dimensions of the embedding table
+                tbe_data_config.E,
+                # num_indices = number of indices to generate
+                end_offset - start_offset,
+            )
+        )
+
+    return torch.cat(indices_list)
+
+
+def _build_requests_jagged(
+    tbe_data_config: TBEDataConfig,
+    iters: int,
+    Bs: List[int],
+    Bs_feature_rank: Optional[List[List[int]]],
+    L_offsets: torch.Tensor,
+    all_indices: torch.Tensor,
+) -> List[TBERequest]:
+    total_B = sum(Bs)
+    all_indices = all_indices.flatten()
+    requests = []
+    for it in range(iters):
+        start_offset = L_offsets[it * total_B]
+        it_L_offsets = torch.concat(
+            [
+                torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
+                L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
+            ]
+        )
+        requests.append(
+            TBERequest(
+                maybe_to_dtype(
+                    all_indices[start_offset : L_offsets[(it + 1) * total_B]],
+                    tbe_data_config.indices_params.index_dtype,
+                ),
+                maybe_to_dtype(
+                    it_L_offsets.to(get_device()),
+                    tbe_data_config.indices_params.offset_dtype,
+                ),
+                tbe_data_config._new_weights(int(it_L_offsets[-1].item())),
+                Bs_feature_rank if tbe_data_config.variable_B() else None,
+            )
+        )
+    return requests
+
+
+def _build_requests_dense(
+    tbe_data_config: TBEDataConfig, iters: int, all_indices: torch.Tensor
+) -> List[TBERequest]:
+    # NOTE: We're using existing code from requests.py to build the
+    # requests, and since the existing code requires 2D view of all_indices,
+    # the existing all_indices must be reshaped
+    all_indices = all_indices.reshape(iters, -1)
+
+    requests = []
+    for it in range(iters):
+        indices, offsets = get_table_batched_offsets_from_dense(
+            all_indices[it].view(
+                tbe_data_config.T,
+                tbe_data_config.batch_params.B,
+                tbe_data_config.pooling_params.L,
+            ),
+            use_cpu=tbe_data_config.use_cpu,
+        )
+        requests.append(
+            TBERequest(
+                maybe_to_dtype(indices, tbe_data_config.indices_params.index_dtype),
+                maybe_to_dtype(offsets, tbe_data_config.indices_params.offset_dtype),
+                tbe_data_config._new_weights(
+                    tbe_data_config.T
+                    * tbe_data_config.batch_params.B
+                    * tbe_data_config.pooling_params.L
+                ),
+            )
+        )
+    return requests
+
+
+def generate_requests(
+    tbe_data_config: TBEDataConfig,
+    iters: int = 1,
+) -> List[TBERequest]:
+    # Generate batch sizes
+    Bs, Bs_feature_rank = _generate_batch_sizes(tbe_data_config)
+
+    # Generate pooling info
+    L_offsets = _generate_pooling_info(tbe_data_config, iters, Bs)
+
+    # Generate indices
+    all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
+
+    # Build TBE requests
+    if tbe_data_config.variable_B() or tbe_data_config.variable_L():
+        return _build_requests_jagged(
+            tbe_data_config, iters, Bs, Bs_feature_rank, L_offsets, all_indices
+        )
+    else:
+        return _build_requests_dense(tbe_data_config, iters, all_indices)
+
+
+def generate_embedding_dims(tbe_data_config: TBEDataConfig) -> Tuple[int, List[int]]:
+    if tbe_data_config.mixed_dim:
+        Ds = [
+            round_up(
+                int(
+                    torch.randint(
+                        low=int(0.5 * tbe_data_config.D),
+                        high=int(1.5 * tbe_data_config.D),
+                        size=(1,),
+                    ).item()
+                ),
+                4,
+            )
+            for _ in range(tbe_data_config.T)
+        ]
+        return (sum(Ds) // len(Ds), Ds)
+    else:
+        return (tbe_data_config.D, [tbe_data_config.D] * tbe_data_config.T)
+
+
+def generate_feature_requires_grad(
+    tbe_data_config: TBEDataConfig, size: int
+) -> torch.Tensor:
+    assert (
+        size <= tbe_data_config.T
+    ), "size of feature_requires_grad must be less than T"
+    weighted_requires_grad_tables = torch.randperm(tbe_data_config.T)[:size].tolist()
+    return (
+        torch.tensor(
+            [
+                1 if t in weighted_requires_grad_tables else 0
+                for t in range(tbe_data_config.T)
+            ]
+        )
+        .to(get_device())
+        .int()
+    )
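Note: the sketch below is not part of the diff; it illustrates the call-site migration implied by this refactor. Request generation used to be a method on TBEDataConfig (config.generate_requests(iters)); in the new helper module it is a free function taking the config as its first argument. TBEDataConfig construction is not shown in this diff, so `config` stands in for an already-built instance.

# Hedged sketch of the new benchmark-helper call pattern.
from fbgemm_gpu.tbe.bench.tbe_data_config_bench_helper import (
    generate_embedding_dims,
    generate_requests,
)

def build_benchmark_requests(config, iters: int = 10):
    D, Ds = generate_embedding_dims(config)   # average dim and per-table dims
    return generate_requests(config, iters)   # List[TBERequest]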
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py
CHANGED

@@ -11,8 +11,12 @@ import click
 import torch
 import yaml
 
-from .tbe_data_config import
-
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
+    BatchParams,
+    IndicesParams,
+    PoolingParams,
+    TBEDataConfig,
+)
 
 
 class TBEDataConfigLoader:
fbgemm_gpu/tbe/ssd/training.py
CHANGED

@@ -248,6 +248,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            self.total_hash_size_bits: int = 0
        else:
            self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
+        self.register_buffer(
+            "table_hash_size_cumsum",
+            torch.tensor(
+                hash_size_cumsum, device=self.current_device, dtype=torch.int64
+            ),
+        )
        # The last element is to easily access # of rows of each table by
        self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
        self.total_hash_size: int = hash_size_cumsum[-1]

@@ -288,6 +294,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            "feature_dims",
            torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
        )
+        self.register_buffer(
+            "table_dims",
+            torch.tensor(dims, device="cpu", dtype=torch.int64),
+        )
 
        (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
            self.D_offsets,  # unused tensor

@@ -518,6 +528,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            logging.warning("dist is not initialized, treating as single gpu cases")
            tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
        self.tbe_unique_id = tbe_unique_id
+        self.l2_cache_size = l2_cache_size
        logging.info(f"tbe_unique_id: {tbe_unique_id}")
        if self.backend_type == BackendType.SSD:
            logging.info(

@@ -564,12 +575,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                self.res_params.table_offsets,
                self.res_params.table_sizes,
                (
-                    tensor_pad4(self.
+                    tensor_pad4(self.table_dims)
                    if self.enable_optimizer_offloading
                    else None
                ),
                (
-                    self.
+                    self.table_hash_size_cumsum.cpu()
                    if self.enable_optimizer_offloading
                    else None
                ),

@@ -609,28 +620,42 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                f"feature_dims={self.feature_dims},"
                f"hash_size_cumsum={self.hash_size_cumsum}"
            )
+            table_dims = (
+                tensor_pad4(self.table_dims)
+                if self.enable_optimizer_offloading
+                else None
+            )  # table_dims
+            eviction_config = None
+            if self.kv_zch_params and self.kv_zch_params.eviction_policy:
+                eviction_mem_threshold_gb = (
+                    self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    else self.l2_cache_size
+                )
+                eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
+                    self.kv_zch_params.eviction_policy.eviction_trigger_mode,  # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+                    self.kv_zch_params.eviction_policy.eviction_strategy,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+                    self.kv_zch_params.eviction_policy.eviction_step_intervals,  # trigger_step_interval if trigger mode is iteration
+                    eviction_mem_threshold_gb,  # mem_util_threshold_in_GB if trigger mode is mem_util
+                    self.kv_zch_params.eviction_policy.ttls_in_mins,  # ttls_in_mins for each table if eviction strategy is timestamp
+                    self.kv_zch_params.eviction_policy.counter_thresholds,  # counter_thresholds for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.counter_decay_rates,  # counter_decay_rates for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.l2_weight_thresholds,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                    table_dims.tolist() if table_dims is not None else None,
+                    self.kv_zch_params.eviction_policy.interval_for_insufficient_eviction_s,
+                    self.kv_zch_params.eviction_policy.interval_for_sufficient_eviction_s,
+                )
            self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
                self.cache_row_dim,
                ssd_uniform_init_lower,
                ssd_uniform_init_upper,
-
-                0,  # trigger_step_interval if trigger mode is iteration
-                0,  # mem_util_threshold_in_GB if trigger mode is mem_util
-                0,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
-                None,  # count_thresholds for each table if eviction strategy is feature score
-                None,  # ttls_in_mins for each table if eviction strategy is timestamp
-                None,  # count_decay_rates for each table if eviction strategy is feature score
-                None,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                eviction_config,
                ssd_rocksdb_shards,  # num_shards
                ssd_rocksdb_shards,  # num_threads
                weights_precision.bit_rate(),  # row_storage_bitwidth
+                table_dims,
                (
-
-                    if self.enable_optimizer_offloading
-                    else None
-                ),  # table_dims
-                (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                    if self.enable_optimizer_offloading
                    else None
                ),  # hash_size_cumsum

@@ -2434,6 +2459,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
            )
        elif self.backend_type == BackendType.DRAM:
+            # if there is any ongoing eviction, lets wait until eviction is finished before state_dict
+            # so that we can reach consistent model state before/after state_dict
+            evict_wait_start_time = time.time()
+            self.ssd_db.wait_until_eviction_done()
+            logging.info(
+                f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
+            )
            self.flush(force=should_flush)
        return snapshot_handle, checkpoint_handle
 
fbgemm_gpu/tbe/stats/bench_params_reporter.py
CHANGED

@@ -8,26 +8,28 @@
 # pyre-strict
 
 import io
+import json
 import logging
 import os
 from typing import List, Optional
 
 import fbgemm_gpu  # noqa F401
-import numpy as np  # usort:skip
 import torch  # usort:skip
 
-from fbgemm_gpu.
-    SplitTableBatchedEmbeddingBagsCodegen,
-)
-from fbgemm_gpu.tbe.bench import (
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
     BatchParams,
     IndicesParams,
     PoolingParams,
     TBEDataConfig,
 )
 
-
-
+open_source: bool = False
+try:
+    # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+    if getattr(fbgemm_gpu, "open_source", False):
+        open_source = True
+except Exception:
+    pass
 
 if open_source:
     from fbgemm_gpu.utils import FileStore

@@ -43,7 +45,8 @@ class TBEBenchmarkParamsReporter:
    def __init__(
        self,
        report_interval: int,
-
+        report_iter_start: int = 0,
+        report_iter_end: int = -1,
        bucket: Optional[str] = None,
        path_prefix: Optional[str] = None,
    ) -> None:

@@ -52,13 +55,30 @@ class TBEBenchmarkParamsReporter:
 
        Args:
            report_interval (int): The interval at which reports are generated.
-
+            report_iter_start (int): The start of the iteration range to capture. Defaults to 0.
+            report_iter_end (int): The end of the iteration range to capture. Defaults to -1 (last iteration).
            bucket (Optional[str], optional): The storage bucket for reports. Defaults to None.
            path_prefix (Optional[str], optional): The path prefix for report storage. Defaults to None.
        """
+        assert report_interval > 0, "report_interval must be greater than 0"
+        assert (
+            report_iter_start >= 0
+        ), "report_iter_start must be greater than or equal to 0"
+        assert (
+            report_iter_end >= -1
+        ), "report_iter_end must be greater than or equal to -1"
+        assert (
+            report_iter_end == -1 or report_iter_start <= report_iter_end
+        ), "report_iter_start must be less than or equal to report_iter_end"
+
        self.report_interval = report_interval
-        self.
-        self.
+        self.report_iter_start = report_iter_start
+        self.report_iter_end = report_iter_end
+
+        if path_prefix is not None and path_prefix.endswith("/"):
+            path_prefix = path_prefix[:-1]
+
+        self.path_prefix = path_prefix
 
        default_bucket = "/tmp" if open_source else "tlparse_reports"
        bucket = (

@@ -68,22 +88,65 @@ class TBEBenchmarkParamsReporter:
        )
        self.filestore = FileStore(bucket)
 
+        if self.path_prefix is not None and not self.filestore.exists(self.path_prefix):
+            self.filestore.create_directory(self.path_prefix)
+
        self.logger: logging.Logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
 
+    @classmethod
+    def create(cls) -> "TBEBenchmarkParamsReporter":
+        """
+        This method returns an instance of TBEBenchmarkParamsReporter based on environment variables.
+
+        If the `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` environment variable is set to a value greater than 0, it creates an instance that:
+        - Reports input parameters (TBEDataConfig).
+        - Writes the output as a JSON file.
+
+        Additionally, the following environment variables are considered:
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_START`: Specifies the start of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_END`: Specifies the end of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_BUCKET`: Specifies the bucket for reporting.
+        - `FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX`: Specifies the path prefix for reporting.
+
+        Returns:
+            TBEBenchmarkParamsReporter: An instance configured based on the environment variables.
+        """
+        report_interval = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_INTERVAL", "1")
+        )
+        report_iter_start = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_START", "0")
+        )
+        report_iter_end = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_END", "-1")
+        )
+        bucket = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_BUCKET", "")
+        path_prefix = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX", "")
+
+        return cls(
+            report_interval=report_interval,
+            report_iter_start=report_iter_start,
+            report_iter_end=report_iter_end,
+            bucket=bucket,
+            path_prefix=path_prefix,
+        )
+
    def extract_params(
        self,
-
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
        indices: torch.Tensor,
        offsets: torch.Tensor,
        per_sample_weights: Optional[torch.Tensor] = None,
        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
    ) -> TBEDataConfig:
        """
-        Extracts parameters from the embedding operation, input indices and offsets to create a TBEDataConfig.
+        Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
 
        Args:
-
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
            indices (torch.Tensor): The input indices tensor.
            offsets (torch.Tensor): The input offsets tensor.
            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.

@@ -92,24 +155,33 @@ class TBEBenchmarkParamsReporter:
        Returns:
            TBEDataConfig: The configuration data for TBE benchmarking.
        """
+
+        Es = feature_rows.tolist()
+        Ds = feature_dims.tolist()
+
+        assert len(Es) == len(
+            Ds
+        ), "feature_rows and feature_dims must have the same length"
+
        # Transfer indices back to CPU for EEG analysis
        indices_cpu = indices.cpu()
 
-        # Extract embedding table specs
-        embedding_specs = [
-            embedding_op.embedding_specs[t] for t in embedding_op.feature_table_map
-        ]
-        rowcounts = [embedding_spec[0] for embedding_spec in embedding_specs]
-        dims = [embedding_spec[1] for embedding_spec in embedding_specs]
-
        # Set T to be the number of features we are looking at
-        T = len(
+        T = len(Ds)
        # Set E to be the mean of the rowcounts to avoid biasing
-        E =
+        E = (
+            Es[0]
+            if len(set(Es)) == 1
+            else torch.ceil(torch.mean(torch.tensor(feature_rows)))
+        )
        # Set mixed_dim to be True if there are multiple dims
-        mixed_dim = len(set(
+        mixed_dim = len(set(Ds)) > 1
        # Set D to be the mean of the dims to avoid biasing
-        D =
+        D = (
+            Ds[0]
+            if not mixed_dim
+            else torch.ceil(torch.mean(torch.tensor(feature_dims)))
+        )
 
        # Compute indices distribution parameters
        heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(

@@ -123,8 +195,18 @@ class TBEBenchmarkParamsReporter:
        batch_params = BatchParams(
            B=((offsets.numel() - 1) // T),
            sigma_B=(
-
-
+                int(
+                    torch.ceil(
+                        torch.std(
+                            torch.tensor(
+                                [
+                                    b
+                                    for bs in batch_size_per_feature_per_rank
+                                    for b in bs
+                                ]
+                            )
+                        )
+                    )
                )
                if batch_size_per_feature_per_rank
                else None

@@ -138,11 +220,19 @@ class TBEBenchmarkParamsReporter:
        )
 
        # Compute pooling parameters
-        bag_sizes =
+        bag_sizes = offsets[1:] - offsets[:-1]
        mixed_bag_sizes = len(set(bag_sizes)) > 1
        pooling_params = PoolingParams(
-            L=
-
+            L=(
+                int(torch.ceil(torch.mean(bag_sizes.float())))
+                if mixed_bag_sizes
+                else int(bag_sizes[0])
+            ),
+            sigma_L=(
+                int(torch.ceil(torch.std(bag_sizes.float())))
+                if mixed_bag_sizes
+                else None
+            ),
            length_distribution=("normal" if mixed_bag_sizes else None),
        )
 

@@ -160,34 +250,58 @@ class TBEBenchmarkParamsReporter:
 
    def report_stats(
        self,
-
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
+        iteration: int,
        indices: torch.Tensor,
        offsets: torch.Tensor,
+        op_id: str = "",
        per_sample_weights: Optional[torch.Tensor] = None,
        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
    ) -> None:
        """
-        Reports the configuration of the embedding operation and input data then writes the TBE configuration to the filestore.
+        Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
 
        Args:
-
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
+            iteration (int): The current iteration number.
            indices (torch.Tensor): The input indices tensor.
            offsets (torch.Tensor): The input offsets tensor.
+            op_id (str, optional): The operation identifier. Defaults to an empty string.
            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
            batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
        """
-        if
-
+        if (
+            (iteration - self.report_iter_start) % self.report_interval == 0
+            and (iteration >= self.report_iter_start)
+            and (self.report_iter_end == -1 or iteration <= self.report_iter_end)
        ):
            # Extract TBE config
            config = self.extract_params(
-
+                feature_rows=feature_rows,
+                feature_dims=feature_dims,
+                indices=indices,
+                offsets=offsets,
+                per_sample_weights=per_sample_weights,
+                batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
            )
 
+            config.json()
+
+            # Ad-hoc fix for adding Es and Ds to JSON output
+            # TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
+            adhoc_config = config.dict()
+            adhoc_config["Es"] = feature_rows.tolist()
+            adhoc_config["Ds"] = feature_dims.tolist()
+            if batch_size_per_feature_per_rank:
+                adhoc_config["Bs"] = [
+                    sum(batch_size_per_feature_per_rank[f])
+                    for f in range(len(adhoc_config["Es"]))
+                ]
+
            # Write the TBE config to FileStore
            self.filestore.write(
-                f"tbe-{
-                io.BytesIO(
+                f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
+                io.BytesIO(json.dumps(adhoc_config, indent=2).encode()),
            )
-
-            self.has_reported = True
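Note: the sketch below is not part of the diff; it restates the iteration-window check introduced in report_stats above as a standalone function, with a few example values, so the gating semantics of report_interval, report_iter_start, and report_iter_end are easy to verify.

# Report when (iteration - start) is a multiple of the interval, the iteration
# is at or past the start, and either no end is set (-1) or iteration <= end.
def should_report(iteration: int, interval: int, start: int = 0, end: int = -1) -> bool:
    return (
        (iteration - start) % interval == 0
        and iteration >= start
        and (end == -1 or iteration <= end)
    )

assert should_report(0, interval=100)                    # first report at the start
assert should_report(300, interval=100)                  # every 100 iterations
assert not should_report(150, interval=100)              # off-interval
assert not should_report(50, interval=100, start=100)    # before the window
assert not should_report(250, interval=100, end=200)     # past the window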
fbgemm_gpu/tbe/utils/requests.py
CHANGED

@@ -14,9 +14,6 @@ import numpy as np
 import numpy.typing as npt
 import torch
 
-# pyre-fixme[21]: Could not find name `default_rng` in `numpy.random` (stubbed).
-from numpy.random import default_rng
-
 from .common import get_device
 from .offsets import get_table_batched_offsets_from_dense
 

@@ -309,11 +306,9 @@ def generate_indices_zipf(
        indices, torch.tensor([0, L], dtype=torch.long), True
    )
    if deterministic_output:
-
-    else:
-        rng = default_rng()
+        np.random.seed(12345)
    permutation = torch.as_tensor(
-
+        np.random.choice(E, size=indices.max().item() + 1, replace=False)
    )
    indices = permutation.gather(0, indices.flatten())
    indices = indices.to(get_device()).int()
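Note: the sketch below is not part of the diff; it only demonstrates the determinism property the new code relies on, namely that seeding NumPy's global RNG makes np.random.choice(..., replace=False) reproducible across calls.

import numpy as np
import torch

def deterministic_permutation(E: int, n: int) -> torch.Tensor:
    # Same seed as used above in generate_indices_zipf when deterministic_output is set.
    np.random.seed(12345)
    return torch.as_tensor(np.random.choice(E, size=n, replace=False))

assert torch.equal(
    deterministic_permutation(1000, 64), deterministic_permutation(1000, 64)
)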
fbgemm_gpu/utils/filestore.py
CHANGED

@@ -11,7 +11,6 @@
 import io
 import logging
 import os
-import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import BinaryIO, Union

@@ -76,7 +75,12 @@ class FileStore:
        elif isinstance(raw_input, Path):
            if not os.path.exists(raw_input):
                raise FileNotFoundError(f"File {raw_input} does not exist")
-
+            # Open the source file and destination file, and copy the contents
+            with open(raw_input, "rb") as src_file, open(
+                filepath, "wb"
+            ) as dst_file:
+                while chunk := src_file.read(4096):  # Read 4 KB at a time
+                    dst_file.write(chunk)
 
        elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
            with open(filepath, "wb") as file:

@@ -155,4 +159,53 @@ class FileStore:
            True if file exists, False otherwise.
        """
        filepath = f"{self.bucket}/{path}"
-        return os.path.
+        return os.path.exists(filepath)
+
+    def create_directory(self, path: str) -> "FileStore":
+        """
+        Creates a directory in the file store.
+
+        Args:
+            path (str): The path of the node or symlink to a directory (relative
+                to `self.bucket`) to be created.
+
+        Returns:
+            self. This allows for method-chaining.
+        """
+        filepath = f"{self.bucket}/{path}"
+        event = f"creating directory {filepath}"
+        logger.info(f"FileStore: {event}")
+
+        try:
+            if not os.path.exists(filepath):
+                os.makedirs(filepath, exist_ok=True)
+        except Exception as e:
+            logger.error(f"FileStore: exception occurred when {event}: {e}")
+            raise e
+
+        return self
+
+    def remove_directory(self, path: str) -> "FileStore":
+        """
+        Removes a directory from the file store.
+
+        Args:
+            path (str): The path of the node or symlink to a directory (relative
+                to `self.bucket`) to be removed.
+
+        Returns:
+            self. This allows for method-chaining.
+        """
+        filepath = f"{self.bucket}/{path}"
+        event = f"deleting {filepath}"
+        logger.info(f"FileStore: {event}")
+
+        try:
+            if os.path.isdir(filepath):
+                os.rmdir(filepath)
+
+        except Exception as e:
+            logger.error(f"Manifold: exception occurred when {event}: {e}")
+            raise e
+
+        return self
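Note: the sketch below is not part of the diff; it exercises the new directory helpers together with the write() and exists() methods shown in this file, assuming the open-source FileStore where `bucket` is a local filesystem prefix (as the "/tmp" default elsewhere in this diff implies).

import io
from fbgemm_gpu.utils import FileStore

store = FileStore("/tmp")
store.create_directory("tbe_reports")                       # no-op if it already exists
store.write("tbe_reports/example.json", io.BytesIO(b"{}"))  # bytes copied under the bucket
assert store.exists("tbe_reports/example.json")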
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_hstu_nightly
-Version: 2025.6.16
+Version: 2025.6.18
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org

@@ -40,9 +40,6 @@ PyTorch GPU operator libraries for training and inference. The library provides
 efficient table batched embedding bag, data layout transformation, and
 quantization supports.
 
-FBGEMM_GPU is currently tested with CUDA 12.4 and 11.8 in CI, and with PyTorch
-packages (2.1+) that are built against those CUDA versions.
-
 See the full [Documentation](https://pytorch.org/FBGEMM) for more information
 on building, installing, and developing with FBGEMM_GPU, as well as the most
 up-to-date support matrix for this library.
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/RECORD
RENAMED

@@ -2,7 +2,7 @@ fbgemm_gpu/__init__.py,sha256=BrIitwvFsRtKEk1ZBHFUi9j6ZUgoA5K7CvepoBez0u4,3419
 fbgemm_gpu/asmjit.so,sha256=1mgsQhqX1yiUdU9p2w3e7XNhDxhMprHy8qkFKYM01Ww,488288
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=u7LfseNeM5gGFQGLAMVO7h2QkFWEOL3ezV5RuhbZn4M,2928
 fbgemm_gpu/enums.py,sha256=GVuzF5cFTLzttkvlH1SdcGrxrppMhDSbQj_Vm_4zmEo,789
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=2giLGFkDpN5f6NtML_Din2J98LCdwJ0kgL_U3sbGoc0,5634864
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=kjWuWmQY8e2kMRwIPTzjGjyjV4syKPrphtHdsQTAjWM,5136
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=cUrEbRIvLFW_3Zmh07QkN4S1Cfvvge6TYO1VXBFCpz8,2752

@@ -15,15 +15,15 @@ fbgemm_gpu/split_embedding_inference_converter.py,sha256=ilVVowkTiY0WDpOYorj917T
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
 fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
-fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=qbc1n-PPWKc75H0lXxK5kuCCprh4xEMS8A0TiE5fbHs,9906
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=bUDWa6IR0vGLDThgB3nmD1yfYa8_HD34B0dtLnd7thw,81692
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=IrGhStc8TSwvxjgPtwIVDmfjsXbThmh64pVulNhMR9M,166355
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=ktC10-nakOBpcmJNCOGQsxuBCP8XTwXJ2WeEgIg91tc,5455
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=DjU7dPHgAT1avXGvgi8SFfw2Pq7yT8S_7IH8qCXoptA,3069
 fbgemm_gpu/uvm.py,sha256=-cZunsuvnAKUEQptIwdYVar_3hUE99FbQUsyfBVeXPE,925
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=
+fbgemm_gpu/config/feature_list.py,sha256=04l_k0t6nkLRxnvSeO4ZjkGj_If9KQGl8PTl-HmxOIQ,2441
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377

@@ -32,10 +32,10 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=NTcTm0q9h8W2B8PKPoic2fHsAaCbCYunSa_EYK0LtHQ,21382
-fbgemm_gpu/docs/version.py,sha256=
+fbgemm_gpu/docs/version.py,sha256=gWwQLocgNkScd-zRubMAconDahzJwrllhvezB_jXyQs,315
 fbgemm_gpu/experimental/hstu/__init__.py,sha256=KNisP6qDMwgjgxkGlqUZRNjJ_8o8R-cTmm3HxF7pSqI,1564
 fbgemm_gpu/experimental/hstu/cuda_hstu_attention.py,sha256=5425GRjJuzpXQC-TowgQOCFjZmOwv_EK0lKbURhHBTQ,9920
-fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=
+fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=9mp_lqP2V4gBxmINu0tafkVMdl5Qu1JiFlSP6Jpglrk,352287576
 fbgemm_gpu/quantize/__init__.py,sha256=pftciXHE7csekDFkl7Ui1AWglVMMnSrOO04mREnUdb0,921
 fbgemm_gpu/quantize/quantize_ops.py,sha256=25AIOv9n2UoxamMUaI6EK1Ur4gSHxbZIReHBtgOjjCs,2228
 fbgemm_gpu/sll/__init__.py,sha256=rgXh35-OFUE54E9gGBq3NGxouGvgMv2ccY2bWUTxONY,4191

@@ -64,8 +64,9 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=T8Wa1PeRyFZ0Ge-SErHQEYDY8LvHVoCV_qQlE_6kE
 fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=mdG3JZwgclp6DiVwQSKl5jrirLSId4OuM64knj9TkEk,4973
 fbgemm_gpu/tbe/bench/eval_compression.py,sha256=bINVERk42VJDSdenQHKWApmRMrW8rhkevOgE0hDR-S8,3499
 fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
-fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=
-fbgemm_gpu/tbe/bench/
+fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=0NxlQtvBb4BBeBiK8DaMVByyJjgzFFgrAsGQt-EFqgM,2913
+fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=uIFdxCBgrz3_l6C9fmE2bMmULhK1eX5ZfB78Pz7tjkw,7312
+fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=ajDmXjxNLxtHu8728CsSZQmuT6nra82jTb9uJJE3yzI,7519
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=tuEQgffV-_zGS4zza1I3x9ZWOYGh9jl3Aal1g-78veE,5852
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=oM-g5nq0EXZgO79C6DhAl_Om9FTPC-WiaqclQCG3HTk,323

@@ -73,16 +74,16 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=mQkCl0xN8xUu5bjEWcOOFN
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
 fbgemm_gpu/tbe/ssd/inference.py,sha256=DTjwj3f6JaUMcecWoRNkZpRgXDJ-eE3grtixYwKb5DI,22829
-fbgemm_gpu/tbe/ssd/training.py,sha256=
+fbgemm_gpu/tbe/ssd/training.py,sha256=GnhVZOxkgYoDgYOh34xL1pg5SwncSoLMv48mSHt4lQc,133710
 fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
 fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=uwwEdUiaVlnWZ_rQax2z28VYROfivdMqIdWLy8IZ6cE,7646
 fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
-fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=
+fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=9HCR8Y0j_5oWGn1KRSNNYKGf_pmbGZyKT_KII8qf2Fc,11670
 fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
 fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
 fbgemm_gpu/tbe/utils/offsets.py,sha256=bs08kDiQ54oucZl6rmPLDs-bN6m1EMa1Wju06mCKZBY,1917
 fbgemm_gpu/tbe/utils/quantize.py,sha256=byjmzGpUjXD_UVAiBKyszmWlzYLkQxq5HBs6hBOuHZo,9185
-fbgemm_gpu/tbe/utils/requests.py,sha256=
+fbgemm_gpu/tbe/utils/requests.py,sha256=uyWxOYxsmSyd48UhPHvDEdxbb-_zDV60FuoKiuTzMdM,17899
 fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
 fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
 fbgemm_gpu/triton/quantize.py,sha256=K5pqBQqs4YsD5m5TibZCbkd0E4Si0i_xcpIeF1B6jA0,26815

@@ -90,10 +91,10 @@ fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2M
 fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
 fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=AIC1G6_QBQtMVTyOyEV4ZKJyDzu36UI_9HDgWmZIRaA,29884
 fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,354
-fbgemm_gpu/utils/filestore.py,sha256=
+fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
 fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
 fbgemm_gpu/utils/torch_library.py,sha256=dQcHv1qgpu5QYlJjxjd6oeHjtxnmmXzx3PL6vjCmxL4,4199
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/METADATA,sha256=rFbfG2H1ql2hm2bSjq3oSTKiMe3RXdKYafu9kp7D4qU,2654
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/top_level.txt,sha256=2tlbTWLkPjhqvLF_6BbqKzkcPluSE-oPRVjI8axK76I,11
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/RECORD,,
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/WHEEL
RENAMED
File without changes

{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/top_level.txt
RENAMED
File without changes