fbgemm-gpu-nightly-cpu 2026.1.22__cp312-cp312-manylinux_2_28_x86_64.whl → 2026.1.29__cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/docs/target.default.json.py +1 -1
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/fbgemm_gpu_config.so +0 -0
- fbgemm_gpu/fbgemm_gpu_py.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +68 -11
- fbgemm_gpu/tbe/bench/__init__.py +5 -1
- fbgemm_gpu/tbe/bench/tbe_data_config.py +71 -16
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +11 -2
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +52 -5
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +3 -1
- fbgemm_gpu/tbe/bench/utils.py +126 -3
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +46 -36
- {fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +1 -1
- {fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/RECORD +19 -19
- {fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/top_level.txt +0 -0
fbgemm_gpu/fbgemm.so
CHANGED
|
Binary file
|
fbgemm_gpu/fbgemm_gpu_config.so
CHANGED
|
Binary file
|
fbgemm_gpu/fbgemm_gpu_py.so
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -970,7 +970,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
970
970
|
table_has_feature = [False] * T_
|
|
971
971
|
for t in self.feature_table_map:
|
|
972
972
|
table_has_feature[t] = True
|
|
973
|
-
assert all(table_has_feature),
|
|
973
|
+
assert all(table_has_feature), (
|
|
974
|
+
"Each table must have at least one feature!"
|
|
975
|
+
+ f"{[(i, x) for i, x in enumerate(table_has_feature)]}"
|
|
976
|
+
)
|
|
974
977
|
|
|
975
978
|
feature_dims = [dims[t] for t in self.feature_table_map]
|
|
976
979
|
D_offsets = [0] + list(accumulate(feature_dims))
|
|
@@ -1786,6 +1789,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1786
1789
|
cache: int,
|
|
1787
1790
|
total_static_sparse: int,
|
|
1788
1791
|
ephemeral: int,
|
|
1792
|
+
cache_weights: int = 0,
|
|
1793
|
+
cache_aux: int = 0,
|
|
1789
1794
|
) -> None:
|
|
1790
1795
|
"""Report HBM memory breakdown to stats reporter."""
|
|
1791
1796
|
stats_reporter.report_data_amount(
|
|
@@ -1809,6 +1814,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1809
1814
|
embedding_id=self.logging_table_name,
|
|
1810
1815
|
tbe_id=self.uuid,
|
|
1811
1816
|
)
|
|
1817
|
+
stats_reporter.report_data_amount(
|
|
1818
|
+
iteration_step=self.step,
|
|
1819
|
+
event_name="tbe.hbm.cache_weights",
|
|
1820
|
+
data_bytes=cache_weights,
|
|
1821
|
+
embedding_id=self.logging_table_name,
|
|
1822
|
+
tbe_id=self.uuid,
|
|
1823
|
+
)
|
|
1824
|
+
stats_reporter.report_data_amount(
|
|
1825
|
+
iteration_step=self.step,
|
|
1826
|
+
event_name="tbe.hbm.cache_aux",
|
|
1827
|
+
data_bytes=cache_aux,
|
|
1828
|
+
embedding_id=self.logging_table_name,
|
|
1829
|
+
tbe_id=self.uuid,
|
|
1830
|
+
)
|
|
1812
1831
|
stats_reporter.report_data_amount(
|
|
1813
1832
|
iteration_step=self.step,
|
|
1814
1833
|
event_name="tbe.hbm.total_static_sparse",
|
|
@@ -1832,6 +1851,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1832
1851
|
cache: int,
|
|
1833
1852
|
total_static_sparse: int,
|
|
1834
1853
|
ephemeral: int,
|
|
1854
|
+
cache_weights: int = 0,
|
|
1855
|
+
cache_aux: int = 0,
|
|
1835
1856
|
) -> None:
|
|
1836
1857
|
"""Report UVM memory breakdown to stats reporter."""
|
|
1837
1858
|
stats_reporter.report_data_amount(
|
|
@@ -1855,6 +1876,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1855
1876
|
embedding_id=self.logging_table_name,
|
|
1856
1877
|
tbe_id=self.uuid,
|
|
1857
1878
|
)
|
|
1879
|
+
stats_reporter.report_data_amount(
|
|
1880
|
+
iteration_step=self.step,
|
|
1881
|
+
event_name="tbe.uvm.cache_weights",
|
|
1882
|
+
data_bytes=cache_weights,
|
|
1883
|
+
embedding_id=self.logging_table_name,
|
|
1884
|
+
tbe_id=self.uuid,
|
|
1885
|
+
)
|
|
1886
|
+
stats_reporter.report_data_amount(
|
|
1887
|
+
iteration_step=self.step,
|
|
1888
|
+
event_name="tbe.uvm.cache_aux",
|
|
1889
|
+
data_bytes=cache_aux,
|
|
1890
|
+
embedding_id=self.logging_table_name,
|
|
1891
|
+
tbe_id=self.uuid,
|
|
1892
|
+
)
|
|
1858
1893
|
stats_reporter.report_data_amount(
|
|
1859
1894
|
iteration_step=self.step,
|
|
1860
1895
|
event_name="tbe.uvm.total_static_sparse",
|
|
@@ -1931,34 +1966,50 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1931
1966
|
"momentum2_host",
|
|
1932
1967
|
"momentum2_uvm",
|
|
1933
1968
|
]
|
|
1934
|
-
|
|
1969
|
+
# Cache weights tensor (the actual cached embeddings in HBM)
|
|
1970
|
+
cache_weight_tensors = [
|
|
1935
1971
|
"lxu_cache_weights",
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
"
|
|
1972
|
+
]
|
|
1973
|
+
# Cache auxiliary state tensors (metadata for cache management, excluding weights)
|
|
1974
|
+
# Sizes scale with hash_size or cache_slots (hash_size × clf)
|
|
1975
|
+
# Excludes constant-size tensors: cache_hash_size_cumsum, cache_miss_counter, etc.
|
|
1976
|
+
cache_aux_tensors = [
|
|
1977
|
+
"cache_index_table_map", # int32, 4B × hash_size
|
|
1978
|
+
"lxu_cache_state", # int64, 8B × cache_slots
|
|
1979
|
+
"lxu_state", # int64, 8B × cache_slots (LRU) or hash_size (LFU)
|
|
1980
|
+
"lxu_cache_locking_counter", # int32, 4B × cache_slots (only if prefetch_pipeline)
|
|
1942
1981
|
]
|
|
1943
1982
|
|
|
1944
1983
|
# Calculate total memory for each component
|
|
1945
1984
|
weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
|
|
1946
1985
|
optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
|
|
1947
|
-
|
|
1986
|
+
cache_weights_total = sum(
|
|
1987
|
+
self._get_tensor_memory(t) for t in cache_weight_tensors
|
|
1988
|
+
)
|
|
1989
|
+
cache_aux_total = sum(self._get_tensor_memory(t) for t in cache_aux_tensors)
|
|
1948
1990
|
|
|
1949
1991
|
# Categorize memory by location (HBM vs UVM)
|
|
1950
1992
|
if self.use_cpu:
|
|
1951
1993
|
weights_hbm, weights_uvm = 0, weights_total
|
|
1952
1994
|
opt_hbm, opt_uvm = 0, optimizer_total
|
|
1953
|
-
|
|
1995
|
+
cache_weights_hbm, cache_weights_uvm = 0, cache_weights_total
|
|
1996
|
+
cache_aux_hbm, cache_aux_uvm = 0, cache_aux_total
|
|
1954
1997
|
else:
|
|
1955
1998
|
weights_hbm, weights_uvm = self._categorize_memory_by_location(
|
|
1956
1999
|
weight_tensors
|
|
1957
2000
|
)
|
|
1958
2001
|
opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
|
|
1959
|
-
|
|
2002
|
+
cache_weights_hbm, cache_weights_uvm = self._categorize_memory_by_location(
|
|
2003
|
+
cache_weight_tensors
|
|
2004
|
+
)
|
|
2005
|
+
cache_aux_hbm, cache_aux_uvm = self._categorize_memory_by_location(
|
|
2006
|
+
cache_aux_tensors
|
|
2007
|
+
)
|
|
1960
2008
|
|
|
1961
2009
|
# Calculate ephemeral memory split between HBM and UVM
|
|
2010
|
+
# Total cache = cache weights + cache auxiliary state
|
|
2011
|
+
cache_hbm = cache_weights_hbm + cache_aux_hbm
|
|
2012
|
+
cache_uvm = cache_weights_uvm + cache_aux_uvm
|
|
1962
2013
|
static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
|
|
1963
2014
|
static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
|
|
1964
2015
|
ephemeral_hbm = total_hbm_usage - static_sparse_hbm
|
|
@@ -1972,6 +2023,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1972
2023
|
cache_hbm,
|
|
1973
2024
|
static_sparse_hbm,
|
|
1974
2025
|
ephemeral_hbm,
|
|
2026
|
+
cache_weights_hbm,
|
|
2027
|
+
cache_aux_hbm,
|
|
1975
2028
|
)
|
|
1976
2029
|
self._report_uvm_breakdown(
|
|
1977
2030
|
stats_reporter,
|
|
@@ -1980,6 +2033,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
1980
2033
|
cache_uvm,
|
|
1981
2034
|
static_sparse_uvm,
|
|
1982
2035
|
ephemeral_uvm,
|
|
2036
|
+
cache_weights_uvm,
|
|
2037
|
+
cache_aux_uvm,
|
|
1983
2038
|
)
|
|
1984
2039
|
|
|
1985
2040
|
@torch.jit.ignore
|
|
@@ -2232,6 +2287,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
|
|
|
2232
2287
|
op_id=self.uuid,
|
|
2233
2288
|
per_sample_weights=per_sample_weights,
|
|
2234
2289
|
batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
|
|
2290
|
+
embedding_specs=[(s[0], s[1]) for s in self.embedding_specs],
|
|
2291
|
+
feature_table_map=self.feature_table_map,
|
|
2235
2292
|
)
|
|
2236
2293
|
|
|
2237
2294
|
if not is_torchdynamo_compiling():
|
fbgemm_gpu/tbe/bench/__init__.py
CHANGED
|
@@ -41,7 +41,11 @@ from .tbe_data_config_param_models import ( # noqa F401
|
|
|
41
41
|
IndicesParams,
|
|
42
42
|
PoolingParams,
|
|
43
43
|
)
|
|
44
|
-
from .utils import
|
|
44
|
+
from .utils import ( # noqa F401
|
|
45
|
+
check_oom,
|
|
46
|
+
fill_random_scale_bias,
|
|
47
|
+
generate_merged_output_and_offsets,
|
|
48
|
+
)
|
|
45
49
|
|
|
46
50
|
try:
|
|
47
51
|
torch.ops.load_library(
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
import dataclasses
|
|
11
11
|
import json
|
|
12
12
|
import logging
|
|
13
|
-
from typing import Any, Optional
|
|
13
|
+
from typing import Any, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
import torch
|
|
16
16
|
|
|
@@ -32,30 +32,77 @@ except Exception:
|
|
|
32
32
|
|
|
33
33
|
@dataclasses.dataclass(frozen=True)
|
|
34
34
|
class TBEDataConfig:
|
|
35
|
-
# Number of tables
|
|
36
35
|
T: int
|
|
37
|
-
# Number of rows in the embedding table
|
|
38
36
|
E: int
|
|
39
|
-
# Target embedding dimension for a table (number of columns)
|
|
40
37
|
D: int
|
|
41
|
-
# Generate mixed dimensions if true
|
|
42
38
|
mixed_dim: bool
|
|
43
|
-
# Whether the lookup rows are weighted or not
|
|
44
39
|
weighted: bool
|
|
45
|
-
# Batch parameters
|
|
46
40
|
batch_params: BatchParams
|
|
47
|
-
# Indices parameters
|
|
48
41
|
indices_params: IndicesParams
|
|
49
|
-
# Pooling parameters
|
|
50
42
|
pooling_params: PoolingParams
|
|
51
|
-
# Force generated tensors to be on CPU
|
|
52
43
|
use_cpu: bool = False
|
|
53
|
-
# Number of embeddings in each embedding features (number of rows)
|
|
54
44
|
Es: Optional[list[int]] = None
|
|
55
|
-
# Target embedding dimension for each features (number of columns)
|
|
56
45
|
Ds: Optional[list[int]] = None
|
|
57
|
-
|
|
58
|
-
|
|
46
|
+
max_indices: Optional[int] = None
|
|
47
|
+
embedding_specs: Optional[List[Tuple[int, int]]] = None
|
|
48
|
+
feature_table_map: Optional[List[int]] = None
|
|
49
|
+
"""
|
|
50
|
+
Configuration for TBE (Table Batched Embedding) benchmark data collection and generation.
|
|
51
|
+
|
|
52
|
+
This dataclass holds parameters required to generate synthetic data for
|
|
53
|
+
TBE benchmarking, including table specifications, batch parameters, indices
|
|
54
|
+
distribution parameters, and pooling parameters.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
T (int): Number of embedding tables (features). Must be positive.
|
|
58
|
+
E (int): Number of rows in the embedding table (feature). If T > 1, this
|
|
59
|
+
represents the averaged number of rows across all features.
|
|
60
|
+
D (int): Target embedding dimension for a table (feature), i.e., number of
|
|
61
|
+
columns. If T > 1, this represents the averaged dimension across
|
|
62
|
+
all features.
|
|
63
|
+
mixed_dim (bool): If True, generate embeddings with mixed dimensions
|
|
64
|
+
across tables (features). This is automatically set to True if D is provided
|
|
65
|
+
as a list with non-uniform values.
|
|
66
|
+
weighted (bool): If True, the lookup rows are weighted (per-sample
|
|
67
|
+
weights). The weights will be generated as FP32 tensors.
|
|
68
|
+
batch_params (BatchParams): Parameters controlling batch generation.
|
|
69
|
+
Contains:
|
|
70
|
+
(1) `B` = target batch size (number of batch lookups per features)
|
|
71
|
+
(2) `sigma_B` = optional standard deviation for variable batch size
|
|
72
|
+
(3) `vbe_distribution` = distribution type ("normal" or "uniform")
|
|
73
|
+
(4) `vbe_num_ranks` = number of ranks for variable batch size
|
|
74
|
+
(5) `Bs` = per-feature batch sizes
|
|
75
|
+
indices_params (IndicesParams): Parameters controlling index generation
|
|
76
|
+
following a Zipf distribution. Contains:
|
|
77
|
+
(1) `heavy_hitters` = probability density map for hot indices
|
|
78
|
+
(2) `zipf_q` = q parameter in Zipf distribution (x+q)^{-s}
|
|
79
|
+
(3) `zipf_s` = s parameter (alpha) in Zipf distribution
|
|
80
|
+
(4) `index_dtype` = optional dtype for indices tensor
|
|
81
|
+
(5) `offset_dtype` = optional dtype for offsets tensor
|
|
82
|
+
pooling_params (PoolingParams): Parameters controlling pooling behavior.
|
|
83
|
+
Contains:
|
|
84
|
+
(1) `L` = target bag size (pooling factor, indices per lookup)
|
|
85
|
+
(2) `sigma_L` = optional standard deviation for variable bag size
|
|
86
|
+
(3) `length_distribution` = distribution type ("normal" or "uniform")
|
|
87
|
+
(4) `Ls` = per-feature bag sizes
|
|
88
|
+
use_cpu (bool = False): If True, force generated tensors to be placed
|
|
89
|
+
on CPU instead of the default compute device.
|
|
90
|
+
Es (Optional[List[int]] = None): Number of embeddings (rows) for each
|
|
91
|
+
individual embedding feature. If provided, must have length equal
|
|
92
|
+
to T. All elements must be positive.
|
|
93
|
+
Ds (Optional[List[int]] = None): Target embedding dimension (columns)
|
|
94
|
+
for each individual feature. If provided, must have length equal
|
|
95
|
+
to T. All elements must be positive.
|
|
96
|
+
max_indices (Optional[int] = None): Maximum number of indices for
|
|
97
|
+
bounds checking. If Es is provided as a list and max_indices is
|
|
98
|
+
None, it is automatically computed as sum(Es) - 1.
|
|
99
|
+
embedding_specs (Optional[List[Tuple[int, int]]] = None): A list of
|
|
100
|
+
embedding specs consisting of a list of tuples of (num_rows, embedding_dim).
|
|
101
|
+
See https://fburl.com/tbe_embedding_specs for details.
|
|
102
|
+
feature_table_map (Optional[List[int]] = None): An optional list that
|
|
103
|
+
specifies feature-table mapping. feature_table_map[i] indicates the
|
|
104
|
+
physical embedding table that feature i maps to.
|
|
105
|
+
"""
|
|
59
106
|
|
|
60
107
|
def __post_init__(self) -> None:
|
|
61
108
|
if isinstance(self.D, list):
|
|
@@ -117,17 +164,25 @@ class TBEDataConfig:
|
|
|
117
164
|
assert self.D > 0, "D must be positive"
|
|
118
165
|
if self.Ds is not None:
|
|
119
166
|
assert all(d > 0 for d in self.Ds), "All elements in Ds must be positive"
|
|
120
|
-
if isinstance(self.
|
|
167
|
+
if isinstance(self.Es, list) and isinstance(self.Ds, list):
|
|
121
168
|
assert (
|
|
122
|
-
len(self.
|
|
169
|
+
len(self.Es) == len(self.Ds) == self.T
|
|
123
170
|
), "Lengths of Es, Lengths of Ds, and T must be equal"
|
|
124
171
|
if self.max_indices is not None:
|
|
125
172
|
assert self.max_indices == (
|
|
126
173
|
sum(self.Es) - 1
|
|
127
174
|
), "max_indices must be equal to sum(Es) - 1"
|
|
128
175
|
self.batch_params.validate()
|
|
176
|
+
if self.batch_params.Bs is not None:
|
|
177
|
+
assert (
|
|
178
|
+
len(self.batch_params.Bs) == self.T
|
|
179
|
+
), f"Length of Bs must be equal to T. Expected: {self.T}, but got: {len(self.batch_params.Bs)}"
|
|
129
180
|
self.indices_params.validate()
|
|
130
181
|
self.pooling_params.validate()
|
|
182
|
+
if self.pooling_params.Ls is not None:
|
|
183
|
+
assert (
|
|
184
|
+
len(self.pooling_params.Ls) == self.T
|
|
185
|
+
), f"Length of Ls must be equal to T. Expected: {self.T}, but got: {len(self.pooling_params.Ls)}"
|
|
131
186
|
return self
|
|
132
187
|
|
|
133
188
|
def variable_B(self) -> bool:
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
# pyre-strict
|
|
9
9
|
|
|
10
|
+
import logging
|
|
10
11
|
from typing import Optional
|
|
11
12
|
|
|
12
13
|
import numpy as np
|
|
@@ -35,6 +36,9 @@ except Exception:
|
|
|
35
36
|
def _generate_batch_sizes(
|
|
36
37
|
tbe_data_config: TBEDataConfig,
|
|
37
38
|
) -> tuple[list[int], Optional[list[list[int]]]]:
|
|
39
|
+
logging.info(
|
|
40
|
+
f"DEBUG_TBE: [_generate_batch_sizes] VBE tbe_data_config.variable_B()={tbe_data_config.variable_B()}"
|
|
41
|
+
)
|
|
38
42
|
if tbe_data_config.variable_B():
|
|
39
43
|
assert (
|
|
40
44
|
tbe_data_config.batch_params.vbe_num_ranks is not None
|
|
@@ -48,7 +52,6 @@ def _generate_batch_sizes(
|
|
|
48
52
|
# pyre-ignore [6]
|
|
49
53
|
tbe_data_config.batch_params.vbe_distribution,
|
|
50
54
|
)
|
|
51
|
-
|
|
52
55
|
else:
|
|
53
56
|
return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
|
|
54
57
|
|
|
@@ -89,13 +92,15 @@ def _generate_indices(
|
|
|
89
92
|
start_offset = L_offsets_list[it * total_B]
|
|
90
93
|
end_offset = L_offsets_list[(it + 1) * total_B]
|
|
91
94
|
|
|
95
|
+
logging.info(f"DEBUG_TBE: _generate_indices E = {tbe_data_config.E=}")
|
|
96
|
+
|
|
92
97
|
indices_list.append(
|
|
93
98
|
torch.ops.fbgemm.tbe_generate_indices_from_distribution(
|
|
94
99
|
tbe_data_config.indices_params.heavy_hitters,
|
|
95
100
|
tbe_data_config.indices_params.zipf_q,
|
|
96
101
|
tbe_data_config.indices_params.zipf_s,
|
|
97
102
|
# max_index = dimensions of the embedding table
|
|
98
|
-
tbe_data_config.E,
|
|
103
|
+
int(tbe_data_config.E),
|
|
99
104
|
# num_indices = number of indices to generate
|
|
100
105
|
end_offset - start_offset,
|
|
101
106
|
)
|
|
@@ -184,6 +189,10 @@ def generate_requests(
|
|
|
184
189
|
else:
|
|
185
190
|
Bs, _ = _generate_batch_sizes(tbe_data_config)
|
|
186
191
|
|
|
192
|
+
logging.info(
|
|
193
|
+
f"DEBUG_TBE: VBE [generate_requests] batch_size_per_feature_per_rank={batch_size_per_feature_per_rank} Bs={Bs}"
|
|
194
|
+
)
|
|
195
|
+
|
|
187
196
|
assert Bs is not None, "Batch sizes (Bs) must be set"
|
|
188
197
|
|
|
189
198
|
# Generate pooling info
|
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
# pyre-strict
|
|
9
9
|
|
|
10
10
|
import dataclasses
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
11
13
|
from enum import Enum
|
|
12
14
|
|
|
13
15
|
import click
|
|
@@ -45,12 +47,16 @@ class TBEDataConfigHelperText(Enum):
|
|
|
45
47
|
TBE_INDICES_HITTERS = "Heavy hitters for indices (comma-delimited list of floats)"
|
|
46
48
|
TBE_INDICES_ZIPF = "Zipf distribution parameters for indices generation (q, s)"
|
|
47
49
|
TBE_INDICES_DTYPE = "The dtype of the table indices (choices: '32', '64')"
|
|
48
|
-
TBE_OFFSETS_DTYPE = "The dtype of the table
|
|
50
|
+
TBE_OFFSETS_DTYPE = "The dtype of the table offsets (choices: '32', '64')"
|
|
49
51
|
|
|
50
52
|
# Pooling Parameters
|
|
51
53
|
TBE_POOLING_SIZE = "Bag size / pooling factor (L)"
|
|
52
|
-
TBE_POOLING_VL_SIGMA = "Standard deviation of
|
|
53
|
-
TBE_POOLING_VL_DIST =
|
|
54
|
+
TBE_POOLING_VL_SIGMA = "Standard deviation of L for variable bag size"
|
|
55
|
+
TBE_POOLING_VL_DIST = (
|
|
56
|
+
"Variable bag size distribution (choices: 'uniform', 'normal')"
|
|
57
|
+
)
|
|
58
|
+
TBE_EMBEDDING_SPECS = "Embedding Specs which is List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]"
|
|
59
|
+
TBE_FEATURE_TABLE_MAP = "Mapping of feature-table"
|
|
54
60
|
|
|
55
61
|
|
|
56
62
|
class TBEDataConfigLoader:
|
|
@@ -193,6 +199,18 @@ class TBEDataConfigLoader:
|
|
|
193
199
|
required=False,
|
|
194
200
|
help=TBEDataConfigHelperText.TBE_POOLING_VL_DIST.value,
|
|
195
201
|
),
|
|
202
|
+
click.option(
|
|
203
|
+
"--tbe-embedding-specs",
|
|
204
|
+
type=str,
|
|
205
|
+
required=False,
|
|
206
|
+
help=TBEDataConfigHelperText.TBE_EMBEDDING_SPECS.value,
|
|
207
|
+
),
|
|
208
|
+
click.option(
|
|
209
|
+
"--tbe-feature-table-map",
|
|
210
|
+
type=str,
|
|
211
|
+
required=False,
|
|
212
|
+
help=TBEDataConfigHelperText.TBE_FEATURE_TABLE_MAP.value,
|
|
213
|
+
),
|
|
196
214
|
]
|
|
197
215
|
|
|
198
216
|
for option in reversed(options):
|
|
@@ -213,15 +231,21 @@ class TBEDataConfigLoader:
|
|
|
213
231
|
params = context.params
|
|
214
232
|
|
|
215
233
|
# Read table parameters
|
|
216
|
-
T = params["tbe_num_tables"]
|
|
217
|
-
E = params["tbe_num_embeddings"]
|
|
234
|
+
T = params["tbe_num_tables"] # number of features
|
|
235
|
+
E = params["tbe_num_embeddings"] # feature_rows
|
|
218
236
|
if params["tbe_num_embeddings_list"] is not None:
|
|
219
237
|
Es = [int(x) for x in params["tbe_num_embeddings_list"].split(",")]
|
|
238
|
+
T = len(Es)
|
|
239
|
+
E = sum(Es) // T # average E
|
|
220
240
|
else:
|
|
221
241
|
Es = None
|
|
222
242
|
D = params["tbe_embedding_dim"]
|
|
223
243
|
if params["tbe_embedding_dim_list"] is not None:
|
|
224
244
|
Ds = [int(x) for x in params["tbe_embedding_dim_list"].split(",")]
|
|
245
|
+
assert (
|
|
246
|
+
len(Ds) == T
|
|
247
|
+
), f"Expected tbe_embedding_dim_list to have {T} elements, but got {len(Ds)}"
|
|
248
|
+
D = sum(Ds) // T # average D
|
|
225
249
|
else:
|
|
226
250
|
Ds = None
|
|
227
251
|
|
|
@@ -239,10 +263,31 @@ class TBEDataConfigLoader:
|
|
|
239
263
|
vbe_num_ranks = params["tbe_batch_vbe_ranks"]
|
|
240
264
|
if params["tbe_batch_sizes_list"] is not None:
|
|
241
265
|
Bs = [int(x) for x in params["tbe_batch_sizes_list"].split(",")]
|
|
266
|
+
B = sum(Bs) // T # average B
|
|
242
267
|
else:
|
|
268
|
+
B = params["tbe_batch_size"]
|
|
243
269
|
Bs = None
|
|
244
270
|
batch_params = BatchParams(B, sigma_B, vbe_distribution, vbe_num_ranks, Bs)
|
|
245
271
|
|
|
272
|
+
# Parse embedding_specs: "(E,D),(E,D),..." or "(E,D,loc,dev),(E,D,loc,dev),..."
|
|
273
|
+
# Only the first two values (E, D) are extracted.
|
|
274
|
+
embedding_specs = None
|
|
275
|
+
feature_table_map = None
|
|
276
|
+
if params["tbe_embedding_specs"] is not None:
|
|
277
|
+
try:
|
|
278
|
+
tuples = re.findall(r"\(([^)]+)\)", params["tbe_embedding_specs"])
|
|
279
|
+
if tuples:
|
|
280
|
+
embedding_specs = [
|
|
281
|
+
(int(t.split(",")[0].strip()), int(t.split(",")[1].strip()))
|
|
282
|
+
for t in tuples
|
|
283
|
+
]
|
|
284
|
+
except (ValueError, IndexError):
|
|
285
|
+
logging.warning("Failed to parse embedding_specs. Setting to None.")
|
|
286
|
+
if params["tbe_feature_table_map"] is not None:
|
|
287
|
+
feature_table_map = [
|
|
288
|
+
int(x) for x in params["tbe_feature_table_map"].split(",")
|
|
289
|
+
]
|
|
290
|
+
|
|
246
291
|
# Read indices parameters
|
|
247
292
|
heavy_hitters = (
|
|
248
293
|
torch.tensor([float(x) for x in params["tbe_indices_hitters"].split(",")])
|
|
@@ -279,6 +324,8 @@ class TBEDataConfigLoader:
|
|
|
279
324
|
Es,
|
|
280
325
|
Ds,
|
|
281
326
|
max_indices,
|
|
327
|
+
embedding_specs,
|
|
328
|
+
feature_table_map,
|
|
282
329
|
).validate()
|
|
283
330
|
|
|
284
331
|
@classmethod
|
|
@@ -98,7 +98,7 @@ class BatchParams:
|
|
|
98
98
|
vbe_distribution: Optional[str] = "normal"
|
|
99
99
|
# Number of ranks for variable batch size generation
|
|
100
100
|
vbe_num_ranks: Optional[int] = None
|
|
101
|
-
# List of target batch sizes, i.e. number of batch lookups per
|
|
101
|
+
# List of target batch sizes, i.e. number of batch lookups per feature
|
|
102
102
|
Bs: Optional[list[int]] = None
|
|
103
103
|
|
|
104
104
|
@classmethod
|
|
@@ -142,6 +142,8 @@ class PoolingParams:
|
|
|
142
142
|
sigma_L: Optional[int] = None
|
|
143
143
|
# [Optional] Distribution of embedding sequence lengths (normal, uniform)
|
|
144
144
|
length_distribution: Optional[str] = "normal"
|
|
145
|
+
# [Optional] List of target bag sizes, i.e. pooling factors per batch
|
|
146
|
+
Ls: Optional[list[float]] = None
|
|
145
147
|
|
|
146
148
|
@classmethod
|
|
147
149
|
# pyre-ignore [3]
|
fbgemm_gpu/tbe/bench/utils.py
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
# pyre-strict
|
|
8
8
|
|
|
9
|
-
import
|
|
9
|
+
from typing import List, Tuple
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
12
|
import torch
|
|
@@ -14,8 +14,6 @@ import torch
|
|
|
14
14
|
# fmt:skip
|
|
15
15
|
from fbgemm_gpu.split_embedding_configs import SparseType
|
|
16
16
|
|
|
17
|
-
logging.basicConfig(level=logging.DEBUG)
|
|
18
|
-
|
|
19
17
|
|
|
20
18
|
def fill_random_scale_bias(
|
|
21
19
|
emb: torch.nn.Module,
|
|
@@ -47,3 +45,128 @@ def fill_random_scale_bias(
|
|
|
47
45
|
device=scale_shift.device,
|
|
48
46
|
)
|
|
49
47
|
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def check_oom(
|
|
51
|
+
data_size: int,
|
|
52
|
+
) -> Tuple[bool, str]:
|
|
53
|
+
free_memory, total_memory = torch.cuda.mem_get_info()
|
|
54
|
+
if data_size > free_memory:
|
|
55
|
+
warning = f"Expect to allocate {round(data_size / (1024 ** 3), 2)} GB, but available memory is {round(free_memory / (1024 ** 3), 2)} GB from {round(total_memory / (1024 ** 3), 2)} GB."
|
|
56
|
+
return (True, warning)
|
|
57
|
+
return (False, "")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def generate_batch_size_per_feature_per_rank(
|
|
61
|
+
Bs: List[int], num_ranks: int
|
|
62
|
+
) -> List[List[int]]:
|
|
63
|
+
"""
|
|
64
|
+
Generate batch size per feature per rank for VBE, assuming the batch size
|
|
65
|
+
is evenly distributed across ranks.
|
|
66
|
+
Args:
|
|
67
|
+
Bs (List[int]): batch size per feature
|
|
68
|
+
num_ranks (int): number of ranks
|
|
69
|
+
Returns:
|
|
70
|
+
List[List[int]]: batch size per feature per rank
|
|
71
|
+
"""
|
|
72
|
+
b_per_feature_per_rank = []
|
|
73
|
+
for B in Bs:
|
|
74
|
+
b_per_feature = []
|
|
75
|
+
for i in range(num_ranks):
|
|
76
|
+
if i != num_ranks - 1:
|
|
77
|
+
b_per_feature.append(int(B / num_ranks))
|
|
78
|
+
else:
|
|
79
|
+
b_per_feature.append(B - sum(b_per_feature))
|
|
80
|
+
b_per_feature_per_rank.append(b_per_feature)
|
|
81
|
+
return b_per_feature_per_rank
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def generate_merged_output_and_offsets(
|
|
85
|
+
Ds: List[int],
|
|
86
|
+
Bs: List[int],
|
|
87
|
+
output_dtype: torch.dtype,
|
|
88
|
+
device: torch.device,
|
|
89
|
+
num_ranks: int = 2,
|
|
90
|
+
num_tbe_ops: int = 2,
|
|
91
|
+
) -> Tuple[List[List[int]], torch.Tensor, torch.Tensor]:
|
|
92
|
+
"""
|
|
93
|
+
Generate merged vbe_output and vbe_output_offsets tensors for VBE.
|
|
94
|
+
The vbe_output is a tensor that will contain forward output from all VBE TBE ops.
|
|
95
|
+
The vbe_output_offsets is a tensor that will contain start offsets for the output to be written to.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
Ds (List[int]): embedding dimension per feature
|
|
99
|
+
Bs (List[int]): batch size per feature
|
|
100
|
+
num_ranks (int): number of ranks
|
|
101
|
+
num_tbe_ops (int): number of TBE ops
|
|
102
|
+
Returns:
|
|
103
|
+
Tuple[List[List[int]], torch.Tensor, torch.Tensor]: batch_size_per_feature_per_rank, merged vbe_output and vbe_output_offsets tensors
|
|
104
|
+
"""
|
|
105
|
+
# The first embedding ops is the embedding op created in the benchmark
|
|
106
|
+
emb_op = {}
|
|
107
|
+
emb_op[0] = {}
|
|
108
|
+
emb_op[0]["dim"] = Ds
|
|
109
|
+
emb_op[0]["Bs"] = Bs
|
|
110
|
+
emb_op[0]["output_size"] = sum([b * d for b, d in zip(Bs, Ds)])
|
|
111
|
+
emb_op[0]["batch_size_per_feature_per_rank"] = (
|
|
112
|
+
generate_batch_size_per_feature_per_rank(Bs, num_ranks)
|
|
113
|
+
)
|
|
114
|
+
num_features = len(Bs)
|
|
115
|
+
# create other embedding ops to allocate output and offsets tensors
|
|
116
|
+
# Using representative values for additional TBE ops in multi-op scenarios:
|
|
117
|
+
# - batch_size=32000: typical large batch size for production workloads
|
|
118
|
+
# - dim=512: common embedding dimension for large models
|
|
119
|
+
for i in range(1, num_tbe_ops):
|
|
120
|
+
emb_op[i] = {}
|
|
121
|
+
emb_op[i]["batch_size_per_feature_per_rank"] = (
|
|
122
|
+
generate_batch_size_per_feature_per_rank([32000], num_ranks)
|
|
123
|
+
)
|
|
124
|
+
emb_op[i]["Bs"] = [sum(B) for B in emb_op[i]["batch_size_per_feature_per_rank"]]
|
|
125
|
+
emb_op[i]["dim"] = [512]
|
|
126
|
+
emb_op[i]["output_size"] = sum(
|
|
127
|
+
[b * d for b, d in zip(emb_op[i]["Bs"], emb_op[i]["dim"])]
|
|
128
|
+
)
|
|
129
|
+
total_output = 0
|
|
130
|
+
ranks = [[] for _ in range(num_ranks)]
|
|
131
|
+
for e in emb_op.values():
|
|
132
|
+
b_per_rank_per_feature = list(zip(*e["batch_size_per_feature_per_rank"]))
|
|
133
|
+
assert len(b_per_rank_per_feature) == num_ranks
|
|
134
|
+
dims = e["dim"]
|
|
135
|
+
for r, b_r in enumerate(b_per_rank_per_feature):
|
|
136
|
+
for f, b in enumerate(b_r):
|
|
137
|
+
output_size_per_batch = b * dims[f]
|
|
138
|
+
ranks[r].append(output_size_per_batch)
|
|
139
|
+
total_output += output_size_per_batch
|
|
140
|
+
ranks[0].insert(0, 0)
|
|
141
|
+
offsets_ranks: List[List[int]] = [[] for _ in range(num_ranks)]
|
|
142
|
+
total_output_offsets = []
|
|
143
|
+
start = 0
|
|
144
|
+
for r in range(num_ranks):
|
|
145
|
+
offsets_ranks[r] = [
|
|
146
|
+
start + sum(ranks[r][: i + 1]) for i in range(len(ranks[r]))
|
|
147
|
+
]
|
|
148
|
+
start = offsets_ranks[r][-1]
|
|
149
|
+
total_output_offsets.extend(offsets_ranks[r])
|
|
150
|
+
check_total_output_size = sum([e["output_size"] for e in emb_op.values()])
|
|
151
|
+
assert (
|
|
152
|
+
total_output == check_total_output_size
|
|
153
|
+
), f"{total_output} != {check_total_output_size}{[e['output_size'] for e in emb_op.values()]}"
|
|
154
|
+
assert (
|
|
155
|
+
total_output == total_output_offsets[-1]
|
|
156
|
+
), f"{total_output} != {total_output_offsets[-1]}"
|
|
157
|
+
out = torch.empty(total_output, dtype=output_dtype, device=device)
|
|
158
|
+
offsets = []
|
|
159
|
+
offsets.append(offsets_ranks[0][:num_features])
|
|
160
|
+
for r in range(1, num_ranks):
|
|
161
|
+
start = [offsets_ranks[r - 1][-1]]
|
|
162
|
+
the_rest = offsets_ranks[r][: num_features - 1] if num_features > 1 else []
|
|
163
|
+
start.extend(the_rest)
|
|
164
|
+
offsets.append(start)
|
|
165
|
+
|
|
166
|
+
out_offsets = torch.tensor(
|
|
167
|
+
offsets,
|
|
168
|
+
dtype=torch.int64,
|
|
169
|
+
device=device,
|
|
170
|
+
)
|
|
171
|
+
batch_size_per_feature_per_rank = emb_op[0]["batch_size_per_feature_per_rank"]
|
|
172
|
+
return (batch_size_per_feature_per_rank, out, out_offsets)
|
|
@@ -11,7 +11,7 @@ import io
|
|
|
11
11
|
import json
|
|
12
12
|
import logging
|
|
13
13
|
import os
|
|
14
|
-
from typing import Optional
|
|
14
|
+
from typing import List, Optional, Tuple
|
|
15
15
|
|
|
16
16
|
import fbgemm_gpu # noqa F401
|
|
17
17
|
import torch # usort:skip
|
|
@@ -137,6 +137,20 @@ class TBEBenchmarkParamsReporter:
|
|
|
137
137
|
path_prefix=path_prefix,
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
+
def extract_Ls(
|
|
141
|
+
self,
|
|
142
|
+
bag_sizes: List[int],
|
|
143
|
+
Bs: List[int],
|
|
144
|
+
) -> List[float]:
|
|
145
|
+
Ls = []
|
|
146
|
+
start = 0
|
|
147
|
+
for b in Bs:
|
|
148
|
+
end = start + b
|
|
149
|
+
avg_L = sum(bag_sizes[start:end]) / b if b > 0 else 0
|
|
150
|
+
start = end
|
|
151
|
+
Ls.append(avg_L)
|
|
152
|
+
return Ls
|
|
153
|
+
|
|
140
154
|
def extract_params(
|
|
141
155
|
self,
|
|
142
156
|
feature_rows: torch.Tensor,
|
|
@@ -144,7 +158,11 @@ class TBEBenchmarkParamsReporter:
|
|
|
144
158
|
indices: torch.Tensor,
|
|
145
159
|
offsets: torch.Tensor,
|
|
146
160
|
per_sample_weights: Optional[torch.Tensor] = None,
|
|
147
|
-
batch_size_per_feature_per_rank: Optional[
|
|
161
|
+
batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
|
|
162
|
+
Es: Optional[List[int]] = None,
|
|
163
|
+
Ds: Optional[List[int]] = None,
|
|
164
|
+
embedding_specs: Optional[List[Tuple[int, int]]] = None,
|
|
165
|
+
feature_table_map: Optional[List[int]] = None,
|
|
148
166
|
) -> TBEDataConfig:
|
|
149
167
|
"""
|
|
150
168
|
Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
|
|
@@ -201,8 +219,14 @@ class TBEBenchmarkParamsReporter:
|
|
|
201
219
|
)
|
|
202
220
|
|
|
203
221
|
# Compute batch parameters
|
|
222
|
+
B = int((offsets.numel() - 1) // T)
|
|
223
|
+
Bs = (
|
|
224
|
+
[sum(b_per_rank) for b_per_rank in batch_size_per_feature_per_rank]
|
|
225
|
+
if batch_size_per_feature_per_rank
|
|
226
|
+
else [B] * T
|
|
227
|
+
)
|
|
204
228
|
batch_params = BatchParams(
|
|
205
|
-
B=
|
|
229
|
+
B=B,
|
|
206
230
|
sigma_B=(
|
|
207
231
|
int(
|
|
208
232
|
torch.ceil(
|
|
@@ -226,10 +250,14 @@ class TBEBenchmarkParamsReporter:
|
|
|
226
250
|
if batch_size_per_feature_per_rank
|
|
227
251
|
else None
|
|
228
252
|
),
|
|
253
|
+
Bs=Bs,
|
|
229
254
|
)
|
|
230
255
|
|
|
231
256
|
# Compute pooling parameters
|
|
232
257
|
bag_sizes = offsets[1:] - offsets[:-1]
|
|
258
|
+
if batch_size_per_feature_per_rank is None:
|
|
259
|
+
_B = int(bag_sizes.numel() // T)
|
|
260
|
+
assert _B == Bs[0], f"Expected constant batch size {Bs[0]} but got {_B}"
|
|
233
261
|
mixed_bag_sizes = len(set(bag_sizes)) > 1
|
|
234
262
|
pooling_params = PoolingParams(
|
|
235
263
|
L=(
|
|
@@ -243,6 +271,7 @@ class TBEBenchmarkParamsReporter:
|
|
|
243
271
|
else None
|
|
244
272
|
),
|
|
245
273
|
length_distribution=("normal" if mixed_bag_sizes else None),
|
|
274
|
+
Ls=self.extract_Ls(bag_sizes.tolist(), Bs),
|
|
246
275
|
)
|
|
247
276
|
|
|
248
277
|
return TBEDataConfig(
|
|
@@ -255,6 +284,10 @@ class TBEBenchmarkParamsReporter:
|
|
|
255
284
|
indices_params=indices_params,
|
|
256
285
|
pooling_params=pooling_params,
|
|
257
286
|
use_cpu=(not torch.cuda.is_available()),
|
|
287
|
+
Es=Es,
|
|
288
|
+
Ds=Ds,
|
|
289
|
+
embedding_specs=embedding_specs,
|
|
290
|
+
feature_table_map=feature_table_map,
|
|
258
291
|
)
|
|
259
292
|
|
|
260
293
|
def report_stats(
|
|
@@ -266,7 +299,9 @@ class TBEBenchmarkParamsReporter:
|
|
|
266
299
|
offsets: torch.Tensor,
|
|
267
300
|
op_id: str = "",
|
|
268
301
|
per_sample_weights: Optional[torch.Tensor] = None,
|
|
269
|
-
batch_size_per_feature_per_rank: Optional[
|
|
302
|
+
batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
|
|
303
|
+
embedding_specs: Optional[List[Tuple[int, int]]] = None,
|
|
304
|
+
feature_table_map: Optional[List[int]] = None,
|
|
270
305
|
) -> None:
|
|
271
306
|
"""
|
|
272
307
|
Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
|
|
@@ -280,6 +315,8 @@ class TBEBenchmarkParamsReporter:
|
|
|
280
315
|
op_id (str, optional): The operation identifier. Defaults to an empty string.
|
|
281
316
|
per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
|
|
282
317
|
batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
|
|
318
|
+
embedding_specs (Optional[List[Tuple[int, int]]]): Embedding specs. Defaults to None.
|
|
319
|
+
feature_table_map (Optional[List[int]], optional): Feature table map. Defaults to None.
|
|
283
320
|
"""
|
|
284
321
|
if (
|
|
285
322
|
(iteration - self.report_iter_start) % self.report_interval == 0
|
|
@@ -299,41 +336,14 @@ class TBEBenchmarkParamsReporter:
|
|
|
299
336
|
offsets=offsets,
|
|
300
337
|
per_sample_weights=per_sample_weights,
|
|
301
338
|
batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
|
|
339
|
+
Es=feature_rows.tolist(),
|
|
340
|
+
Ds=feature_dims.tolist(),
|
|
341
|
+
embedding_specs=embedding_specs,
|
|
342
|
+
feature_table_map=feature_table_map,
|
|
302
343
|
)
|
|
303
344
|
|
|
304
|
-
# Ad-hoc fix for adding Es and Ds to JSON output
|
|
305
|
-
# TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
|
|
306
|
-
adhoc_config = config.dict()
|
|
307
|
-
adhoc_config["Es"] = feature_rows.tolist()
|
|
308
|
-
adhoc_config["Ds"] = feature_dims.tolist()
|
|
309
|
-
if batch_size_per_feature_per_rank:
|
|
310
|
-
adhoc_config["Bs"] = [
|
|
311
|
-
sum(batch_size_per_feature_per_rank[f])
|
|
312
|
-
for f in range(len(adhoc_config["Es"]))
|
|
313
|
-
]
|
|
314
|
-
|
|
315
|
-
bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
|
|
316
|
-
adhoc_config["Ls"] = []
|
|
317
|
-
pointer_counter = 0
|
|
318
|
-
if batch_size_per_feature_per_rank:
|
|
319
|
-
for batchs_size in adhoc_config["Bs"]:
|
|
320
|
-
current_L = 0
|
|
321
|
-
for _i in range(batchs_size):
|
|
322
|
-
current_L += bag_sizes[pointer_counter]
|
|
323
|
-
pointer_counter += 1
|
|
324
|
-
adhoc_config["Ls"].append(current_L / batchs_size)
|
|
325
|
-
else:
|
|
326
|
-
batch_size = int(len(bag_sizes) // len(adhoc_config["Es"]))
|
|
327
|
-
|
|
328
|
-
for _j in range(len(adhoc_config["Es"])):
|
|
329
|
-
current_L = 0
|
|
330
|
-
for _i in range(batch_size):
|
|
331
|
-
current_L += bag_sizes[pointer_counter]
|
|
332
|
-
pointer_counter += 1
|
|
333
|
-
adhoc_config["Ls"].append(current_L / batch_size)
|
|
334
|
-
|
|
335
345
|
# Write the TBE config to FileStore
|
|
336
346
|
self.filestore.write(
|
|
337
347
|
f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
|
|
338
|
-
io.BytesIO(json.dumps(
|
|
348
|
+
io.BytesIO(json.dumps(config.dict(), indent=2).encode()),
|
|
339
349
|
)
|
{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/RECORD
RENAMED
|
@@ -2,15 +2,15 @@ fbgemm_gpu/__init__.py,sha256=JrSxUgY_diRl9kXapbyq3iteiB32D02CPan3stEFiAM,6434
|
|
|
2
2
|
fbgemm_gpu/asmjit.so,sha256=DNnFdMXB8IW_9ulBAn7I5EMmAJ5y-yT0-YRDBszOqXA,501728
|
|
3
3
|
fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=pZqqUfvPIsaIo1CWX-_W087WQg-YEZuS0GNGoKFO_9c,2915
|
|
4
4
|
fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
|
|
5
|
-
fbgemm_gpu/fbgemm.so,sha256=
|
|
6
|
-
fbgemm_gpu/fbgemm_gpu_config.so,sha256=
|
|
5
|
+
fbgemm_gpu/fbgemm.so,sha256=lkRdKMi7O083a2gbKCuNsytkb9gHIgcJvNwPRm6RP8U,5659000
|
|
6
|
+
fbgemm_gpu/fbgemm_gpu_config.so,sha256=ksQXaC-3-l3Agi0wxeE9o_2wAG3W4-4X84eplwPr5E8,47080
|
|
7
7
|
fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so,sha256=BGXKjkImBO3W6weT6SasOOCUB2w4x1v_DMAadGbqciQ,88032
|
|
8
|
-
fbgemm_gpu/fbgemm_gpu_py.so,sha256=
|
|
8
|
+
fbgemm_gpu/fbgemm_gpu_py.so,sha256=xD0-0Xnb8kDGbh8rPWnqEYNhKZeTm3BJ1hp24XWaxlo,4938000
|
|
9
9
|
fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so,sha256=o7Vne9VVYq2VKz_jOLw97erI0RbiAMPNLk2oODNQvEA,79840
|
|
10
|
-
fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256
|
|
11
|
-
fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=
|
|
10
|
+
fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=b_Dxjg-U5ilu1tyi-JmfQ8c13ABEpH1-QrVDYUYoXPI,284656
|
|
11
|
+
fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=opZDhNxRVqEBDwDDZmLp2TLmLYKxmLE4I11DhR4rd_A,387040
|
|
12
12
|
fbgemm_gpu/fbgemm_gpu_tbe_index_select.so,sha256=uGRcOuVR_O4bxaEfuq5ve-xXMMV2HKhHfTm_LnKJ_aw,333800
|
|
13
|
-
fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=
|
|
13
|
+
fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=HeMj3Q_KhYBKi2MQ-jBLGgE82TGb_75twp0hVeMqWgA,605000
|
|
14
14
|
fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so,sha256=QmRzU2tSEZTxarB3xNSDFKWdRKfKHrmowMiws_f3zmc,13760
|
|
15
15
|
fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so,sha256=2hARnWgefCIXwYNwFwNsGa2kei8fEY_gqia8NxqFclA,1278144
|
|
16
16
|
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so,sha256=N569f2bc0y_CyBJagS8RlsmrHVWpwQMY14kFdpF6T_Y,13760
|
|
@@ -34,7 +34,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
|
|
|
34
34
|
fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
|
|
35
35
|
fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=_uUplpcyQOQuxqv8-HV94VUM5lG8e3aGWltXhOgICQc,19294
|
|
36
36
|
fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
|
|
37
|
-
fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
|
|
37
|
+
fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=uCPngWxxC5OQhJv7o6aGs8xf3WlRSrdRHbpCBlPbIuE,191511
|
|
38
38
|
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
|
|
39
39
|
fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=2TTKsF5yaROTaI69YdCIt8hr_v2TDEo8EraZ0QXNBxc,717
|
|
40
40
|
fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
|
|
@@ -49,7 +49,7 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
|
|
|
49
49
|
fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
|
|
50
50
|
fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
|
|
51
51
|
fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
|
|
52
|
-
fbgemm_gpu/docs/target.default.json.py,sha256=
|
|
52
|
+
fbgemm_gpu/docs/target.default.json.py,sha256=_BcuMA1hCJ_Jtf08E7O8t-R8A5HiRXHH3Z9rpgCq66U,79
|
|
53
53
|
fbgemm_gpu/quantize/__init__.py,sha256=yPUCmLhNdahHFireHPQMmmiRp3g6W2dkIl5MB51M6SU,942
|
|
54
54
|
fbgemm_gpu/quantize/quantize_ops.py,sha256=C3SN79GcL7fczzoFkxUojm6cGkvvI4iWttkGN4LFQcM,2239
|
|
55
55
|
fbgemm_gpu/sll/__init__.py,sha256=nLFeTiRed6A5STRi_EgHCyNoik0zhXUk2db5kTmMUNU,4221
|
|
@@ -87,7 +87,7 @@ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py,sha256=N32H1lUb
|
|
|
87
87
|
fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py,sha256=xWSmk56JgoYfO8eiiK4BP9Brbhixs4tUAMeWp5TPZ30,956
|
|
88
88
|
fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py,sha256=bjrbKAypa-FnOIVKH-IUnWP1Jhlu0lk1SopZ0KLFVdo,6623
|
|
89
89
|
fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
|
|
90
|
-
fbgemm_gpu/tbe/bench/__init__.py,sha256=
|
|
90
|
+
fbgemm_gpu/tbe/bench/__init__.py,sha256=TyUVsIH4p-RtFaXAKppYoaWbf9UTjCTUpnIV7RD_O5E,1653
|
|
91
91
|
fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
|
|
92
92
|
fbgemm_gpu/tbe/bench/bench_runs.py,sha256=K4HRUcsX4BWqtrYwinZSXjnjNDFkvpoEdQmv-6rz7Tk,23518
|
|
93
93
|
fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=ofcGsiTUj3_Ml7JSsqg_LcMw3CV-6ypmlRWAUmT_cjc,6941
|
|
@@ -95,11 +95,11 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=B3QOZhtycMDwHMG3dFKnlFuWOqYRCF3RCozEQfrqv
|
|
|
95
95
|
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=zdL_ve1Ga6ziU5LjfnzJXOBOIqtCjLlhSrlGfa42H9w,4978
|
|
96
96
|
fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
|
|
97
97
|
fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
|
|
98
|
-
fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=
|
|
99
|
-
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=
|
|
100
|
-
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=
|
|
101
|
-
fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=
|
|
102
|
-
fbgemm_gpu/tbe/bench/utils.py,sha256=
|
|
98
|
+
fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=d724L4Is3Bo2D5reglgsBs7H6ezLFDrQUbTP5tsnPEQ,8509
|
|
99
|
+
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=c-IwLbx04Qbqxzfcn9N4U9Eo9QnmgbBN6HxJYAJwvMw,11311
|
|
100
|
+
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=fSdtEAnKu6r56mHMtMJIHo-S6m3vC4cPRyXJKKUevzc,11996
|
|
101
|
+
fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=I9dozlJAW_XzuopyJapJ4gmDkLU0YSUz2znugiLZRMg,6203
|
|
102
|
+
fbgemm_gpu/tbe/bench/utils.py,sha256=C0GTTomJO3r9LVfbpzlkudxoA_3QyeMdM-7zM-YOAHA,6716
|
|
103
103
|
fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
|
|
104
104
|
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
|
|
105
105
|
fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
|
|
@@ -110,7 +110,7 @@ fbgemm_gpu/tbe/ssd/training.py,sha256=2CFA4KmA9IfcpX14K4MlzBuSRPD9h5NM1M7TqepH6v
|
|
|
110
110
|
fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
|
|
111
111
|
fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
|
|
112
112
|
fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
|
|
113
|
-
fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=
|
|
113
|
+
fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=_lA4peKXI0GCWsZHJ7IUKlUHU98CA-gVoOc-uhRfcoY,13233
|
|
114
114
|
fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
|
|
115
115
|
fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
|
|
116
116
|
fbgemm_gpu/tbe/utils/offsets.py,sha256=DDWwGaQsVZbhaEZ_fRxxeY8ndLc7IORPZrx61eOqwJc,1904
|
|
@@ -129,7 +129,7 @@ fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg
|
|
|
129
129
|
fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
|
|
130
130
|
list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
|
|
131
131
|
list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
|
|
132
|
-
fbgemm_gpu_nightly_cpu-2026.1.
|
|
133
|
-
fbgemm_gpu_nightly_cpu-2026.1.
|
|
134
|
-
fbgemm_gpu_nightly_cpu-2026.1.
|
|
135
|
-
fbgemm_gpu_nightly_cpu-2026.1.
|
|
132
|
+
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/METADATA,sha256=sqUYIVBwodRVxysq3jEToUNFX12vtC4tZenZnKnynjo,2654
|
|
133
|
+
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
|
|
134
|
+
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
|
|
135
|
+
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD,,
|
{fbgemm_gpu_nightly_cpu-2026.1.22.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|