fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +118 -23
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
- fbgemm_gpu/config/feature_list.py +7 -1
- fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
- fbgemm_gpu/docs/sparse_ops.py +142 -1
- fbgemm_gpu/docs/target.default.json.py +6 -0
- fbgemm_gpu/enums.py +3 -4
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/fbgemm_gpu_config.so +0 -0
- fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
- fbgemm_gpu/fbgemm_gpu_py.so +0 -0
- fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
- fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
- fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
- fbgemm_gpu/quantize/__init__.py +2 -0
- fbgemm_gpu/quantize/quantize_ops.py +1 -0
- fbgemm_gpu/quantize_comm.py +29 -12
- fbgemm_gpu/quantize_utils.py +88 -8
- fbgemm_gpu/runtime_monitor.py +9 -5
- fbgemm_gpu/sll/__init__.py +3 -0
- fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
- fbgemm_gpu/sll/triton/__init__.py +0 -10
- fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
- fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
- fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
- fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
- fbgemm_gpu/sparse_ops.py +244 -76
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
- fbgemm_gpu/split_embedding_configs.py +287 -3
- fbgemm_gpu/split_embedding_inference_converter.py +7 -6
- fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
- fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
- fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
- fbgemm_gpu/tbe/bench/__init__.py +13 -2
- fbgemm_gpu/tbe/bench/bench_config.py +37 -9
- fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
- fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
- fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
- fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
- fbgemm_gpu/tbe/bench/utils.py +129 -5
- fbgemm_gpu/tbe/cache/__init__.py +1 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
- fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
- fbgemm_gpu/tbe/ssd/common.py +27 -0
- fbgemm_gpu/tbe/ssd/inference.py +15 -15
- fbgemm_gpu/tbe/ssd/training.py +2930 -195
- fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
- fbgemm_gpu/tbe/stats/__init__.py +10 -0
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
- fbgemm_gpu/tbe/utils/offsets.py +6 -6
- fbgemm_gpu/tbe/utils/quantize.py +8 -8
- fbgemm_gpu/tbe/utils/requests.py +53 -28
- fbgemm_gpu/tbe_input_multiplexer.py +16 -7
- fbgemm_gpu/triton/common.py +0 -1
- fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
- fbgemm_gpu/triton/quantize.py +14 -9
- fbgemm_gpu/utils/filestore.py +56 -5
- fbgemm_gpu/utils/torch_library.py +2 -2
- fbgemm_gpu/utils/writeback_util.py +124 -0
- fbgemm_gpu/uvm.py +3 -0
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
- fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
- list_versions/cli_run.py +161 -0
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py
CHANGED
@@ -31,15 +31,18 @@ except Exception:
 
 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
 
 
 def generate_vbe_metadata(
     offsets: Tensor,
-    batch_size_per_feature_per_rank: Optional[
+    batch_size_per_feature_per_rank: Optional[list[list[int]]],
     pooling_mode: PoolingMode,
     feature_dims_cpu: Tensor,
     device: torch.device,
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
 ) -> invokers.lookup_args.VBEMetadata:
     """
     Generate VBE metadata based on batch_size_per_feature_per_rank.
@@ -133,6 +136,8 @@ def generate_vbe_metadata(
             max_B_feature_rank=max_B_feature_rank,
             # pyre-ignore
             output_size=output_size,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
     else:
         vbe_metadata = invokers.lookup_args.VBEMetadata(
@@ -142,5 +147,43 @@ def generate_vbe_metadata(
             max_B=-1,
             max_B_feature_rank=-1,
             output_size=-1,
+            vbe_output=None,
+            vbe_output_offsets=None,
         )
     return vbe_metadata
+
+
+def check_allocated_vbe_output(
+    output_dtype: int,
+    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
+) -> None:
+    assert (
+        batch_size_per_feature_per_rank is not None
+    ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
+    assert (
+        vbe_output is not None
+    ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
+    assert (
+        vbe_output_offsets is not None
+    ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
+    num_features = len(batch_size_per_feature_per_rank)
+    num_ranks = len(batch_size_per_feature_per_rank[0])
+    assert vbe_output_offsets.shape == torch.Size(
+        [num_ranks, num_features]
+    ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
+    assert (
+        vbe_output.dim() == 1
+    ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
+    assert (
+        vbe_output_offsets.device == vbe_output.device
+    ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
+    _output_dtype = sparse_type_int_to_dtype(output_dtype)
+    assert (
+        vbe_output.dtype == _output_dtype
+    ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
+    assert (
+        vbe_output_offsets.is_contiguous()
+    ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
+    assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
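
The two new optional arguments thread a caller-allocated, flat VBE output buffer (vbe_output) plus per-(rank, feature) offsets (vbe_output_offsets) through the VBE metadata, and check_allocated_vbe_output is the guard that validates such a buffer. A minimal sketch of the shapes it expects, assuming 2 features, 2 ranks, an FP32 output dtype, and an illustrative embedding dimension (none of these concrete sizes come from the diff):

import torch

# Hypothetical VBE batch sizes: batch_size_per_feature_per_rank[feature][rank]
batch_size_per_feature_per_rank = [[2, 3], [2, 3]]  # 2 features x 2 ranks
D = 8  # illustrative embedding dimension

# vbe_output must be a 1-D, contiguous tensor whose dtype matches the TBE output dtype
total_rows = sum(sum(per_rank) for per_rank in batch_size_per_feature_per_rank)
vbe_output = torch.empty(total_rows * D, dtype=torch.float32)

# vbe_output_offsets must be contiguous, on the same device as vbe_output,
# and shaped [num_ranks, num_features] (one entry per (rank, feature) slice)
vbe_output_offsets = torch.zeros(2, 2, dtype=torch.int64)

# check_allocated_vbe_output(output_dtype, batch_size_per_feature_per_rank,
#                            vbe_output, vbe_output_offsets)
# would then pass its shape/dtype/device/contiguity asserts for an FP32 output_dtype code.
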
fbgemm_gpu/tbe/bench/__init__.py
CHANGED
@@ -12,15 +12,19 @@ import torch
 from .bench_config import ( # noqa F401
     TBEBenchmarkingConfig,
     TBEBenchmarkingConfigLoader,
+    TBEBenchmarkingHelperText,
 )
 from .bench_runs import ( # noqa F401
     bench_warmup,
     benchmark_cpu_requests,
+    benchmark_cpu_requests_mp,
     benchmark_pipelined_requests,
     benchmark_requests,
     benchmark_requests_refer,
+    benchmark_requests_with_spec,
     benchmark_vbe,
 )
+from .benchmark_click_interface import TbeBenchClickInterface  # noqa F401
 from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader  # noqa F401
 from .eval_compression import ( # noqa F401
     benchmark_eval_compression,
@@ -28,13 +32,20 @@ from .eval_compression import ( # noqa F401
 )
 from .reporter import BenchmarkReporter  # noqa F401
 from .tbe_data_config import TBEDataConfig  # noqa F401
-from .tbe_data_config_loader import
+from .tbe_data_config_loader import (  # noqa F401
+    TBEDataConfigHelperText,
+    TBEDataConfigLoader,
+)
 from .tbe_data_config_param_models import ( # noqa F401
     BatchParams,
     IndicesParams,
     PoolingParams,
 )
-from .utils import
+from .utils import (  # noqa F401
+    check_oom,
+    fill_random_scale_bias,
+    generate_merged_output_and_offsets,
+)
 
 try:
     torch.ops.load_library(
fbgemm_gpu/tbe/bench/bench_config.py
CHANGED
@@ -9,7 +9,8 @@
 
 import dataclasses
 import json
-from
+from enum import Enum
+from typing import Any, Optional
 
 import click
 
@@ -28,10 +29,12 @@ class TBEBenchmarkingConfig:
     export_trace: bool
     # The path for exporting the trace
     trace_url: Optional[str]
+    # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
+    upload_perf_data: bool
 
     @classmethod
     # pyre-ignore [3]
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]):
         return cls(**data)
 
     @classmethod
@@ -39,7 +42,7 @@ class TBEBenchmarkingConfig:
     def from_json(cls, data: str):
         return cls.from_dict(json.loads(data))
 
-    def dict(self) ->
+    def dict(self) -> dict[str, Any]:
         return dataclasses.asdict(self)
 
     def json(self, format: bool = False) -> str:
@@ -56,6 +59,23 @@ class TBEBenchmarkingConfig:
         return self
 
 
+@dataclasses.dataclass(frozen=True)
+class TBEBenchmarkingHelperText(Enum):
+    BENCH_ITERATIONS = "Number of benchmark iterations to run"
+    BENCH_NUM_REQUESTS = "Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used"
+    BENCH_WARMUP_ITERATIONS = (
+        "Number of warmup iterations to run before making measurements"
+    )
+    BENCH_FLUSH_GPU_CACHE_SIZE = (
+        "Amount of memory to use for flushing the GPU cache after each iteration (MB)"
+    )
+    BENCH_EXPORT_TRACE = (
+        "If set, trace will be exported to the path specified in trace url"
+    )
+    BENCH_TRACE_URL = "The path for exporting the trace"
+    BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
+
+
 class TBEBenchmarkingConfigLoader:
     @classmethod
     # pyre-ignore [2]
@@ -65,38 +85,44 @@ class TBEBenchmarkingConfigLoader:
                 "--bench-iterations",
                 type=int,
                 default=100,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value,
             ),
             click.option(
                 "--bench-num-requests",
                 type=int,
                 default=-1,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_NUM_REQUESTS.value,
             ),
             click.option(
                 "--bench-warmup-iterations",
                 type=int,
                 default=0,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value,
             ),
             click.option(
                 "--bench-flush-gpu-cache-size",
                 type=int,
                 default=0,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
             ),
             click.option(
                 "--bench-export-trace",
                 is_flag=True,
                 default=False,
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_EXPORT_TRACE.value,
             ),
             click.option(
                 "--bench-trace-url",
                 type=str,
                 required=False,
                 default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
-                help=
+                help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
+            ),
+            click.option(
+                "--upload-perf-data",
+                is_flag=True,
+                default=False,
+                help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
             ),
         ]
 
@@ -114,6 +140,7 @@ class TBEBenchmarkingConfigLoader:
         flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
         export_trace = params["bench_export_trace"]
         trace_url = params["bench_trace_url"]
+        upload_perf_data = params["upload_perf_data"]
 
         # Default the number of TBE requests to number of iterations specified
         num_requests = iterations if num_requests == -1 else num_requests
@@ -125,4 +152,5 @@ class TBEBenchmarkingConfigLoader:
             flush_gpu_cache_size,
             export_trace,
             trace_url,
+            upload_perf_data,
         ).validate()
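
With upload_perf_data now both a config field and an --upload-perf-data flag, every option's help string lives in TBEBenchmarkingHelperText, which fbgemm_gpu.tbe.bench re-exports (see the __init__.py hunk above). A small sketch that simply enumerates those help strings:

from fbgemm_gpu.tbe.bench import TBEBenchmarkingHelperText

# Each member's .value is the help text wired into the matching click option
# (e.g. BENCH_ITERATIONS -> --bench-iterations); printing them gives a quick
# reference for the benchmark CLI without invoking click at all.
for member in TBEBenchmarkingHelperText:
    print(f"{member.name}: {member.value}")
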
fbgemm_gpu/tbe/bench/bench_runs.py
CHANGED
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -8,12 +9,16 @@
 
 import logging
 import statistics
+import threading
 import time
-from
+from subprocess import Popen
+from typing import Callable, Optional
 
 import torch
 
-
+# fmt:skip
+from fbgemm_gpu.tbe.utils import b_indices, TBERequest
+from fbgemm_gpu.tbe.utils.common import get_device
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -40,8 +45,177 @@ def bench_warmup(
         out.backward(grad)
 
 
+def bench_warmup_with_spec(
+    request: TBERequest,
+    warmup_ms: int,
+    warmup_runs: int,
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+) -> None:
+    indices, offsets, weights, batch_size_per_feature_per_rank = request.unpack_4()
+    if warmup_ms:
+        start_time_ms = time.time() * 1000
+        while time.time() * 1000 - start_time_ms < warmup_ms:
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+    else:
+        for _ in range(warmup_runs):
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+            if bwd_only:
+                out.backward(grad)
+
+
+class BMBarrier:
+
+    def __init__(self) -> None:
+        self.bar: Optional[threading.Barrier] = None
+
+    def create_barrier(self, party_size: int) -> None:
+        if self.bar is not None:
+            self.bar.reset()
+            self.bar = None
+        self.bar = torch.multiprocessing.Barrier(party_size)
+
+    def wait(self) -> None:
+        if self.bar is not None:
+            self.bar.wait()
+
+
+# This barrier ensures all CPU TBE workers start the embedding workload
+# together so that we get the most accurate measurement. This needs to be
+# a global variable because it will be shared among worker processes.
+cpu_bm_barrier = BMBarrier()
+
+
+def cpu_tbe_worker(
+    requests_: list[TBERequest],
+    func_: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
+    use_barrier: bool = False,
+) -> float:
+    """
+    Worker function to process CPU TBE workload.
+
+    Args:
+        requests_ (List[TBERequest]): A list of TBERequest objects to be processed. Namely, the dataset.
+        func_ (Callable[[Tensor, Tensor, Optional[Tensor]], Tensor]):
+            The function to process each request, usually the `.forward()` method
+            n the embedding module instance.
+        use_barrier (bool, optional): Whether to use a barrier to synchronize the
+            start of embedding workload. Defaults to False.
+
+    Returns:
+        float: The average runtime per iteration in seconds.
+    """
+    import time
+
+    if use_barrier:
+        cpu_bm_barrier.wait()
+
+    start_time = time.perf_counter()
+    for req in requests_:
+        func_(*(req.unpack_3()))
+    end_time = time.perf_counter()
+
+    return (end_time - start_time) / len(requests_)
+
+
+def benchmark_cpu_requests_mp(
+    requests: list[TBERequest],
+    emb_module: torch.nn.Module,
+    num_warmups: int = 0,
+    num_copies: int = 1,
+    start_script: str = "",
+    end_script: str = "",
+) -> float:
+    """
+    CPU benchmark request handler with multi-processing support
+
+    Args:
+        requests (List[TBERequest]): A list of TBERequest objects to be processed.
+        emb_module (torch.nn.Module): The embedding module to be used for processing requests,
+            for example, an instance of `IntNBitTableBatchedEmbeddingBagsCodegen` module.
+        num_warmups (int, optional): Number of warm-up iterations to perform before benchmarking. Defaults to 0.
+        num_copies (int, optional): Number of parallel copies of the workloads. By `copies`,
+            we mean the number of parallel processes working on the same dataset described in `requests`.
+            Defaults to 1 (which means single threaded). Increasing this will enable the benchmark to use
+            more CPU cores and push higher memory bandwidth.
+        start_script (str, optional): Path to a script to be executed before starting the benchmark.
+            Defaults to empty (not running anything). This can be used to collect perf counters.
+            The script will be terminated upon benchmark finishing.
+        end_script (str, optional): Path to a script to be executed after completing the benchmark.
+            Defaults to empty (not running anything). This can be used to post-process perf counters.
+
+    Returns:
+        float: The average runtime per iteration in seconds.
+    """
+    import os
+
+    strategy = os.environ.get("PYTORCH_SHARE_STRATEGY")
+    current_strategy = torch.multiprocessing.get_sharing_strategy()
+    if strategy is not None and current_strategy != strategy:
+        torch.multiprocessing.set_sharing_strategy(strategy)
+
+    cpu_bm_barrier.create_barrier(num_copies)
+    worker_pool = torch.multiprocessing.Pool(num_copies)
+
+    if num_warmups > 0:
+        asyncres = []
+        for _ in range(num_copies):
+            asyncres.append(
+                worker_pool.apply_async(
+                    cpu_tbe_worker,
+                    args=(
+                        [requests[0]],
+                        emb_module.forward,
+                        False,
+                        num_warmups,
+                    ),
+                )
+            )
+        for res in asyncres:
+            res.wait()
+
+    if start_script:
+        p_start = Popen([start_script, str(num_copies)])
+
+    asyncres = []
+    for _ in range(num_copies):
+        asyncres.append(
+            worker_pool.apply_async(
+                cpu_tbe_worker,
+                args=(
+                    requests,
+                    emb_module.forward,
+                    True,
+                ),
+            )
+        )
+    runtime_per_iter = 0.0
+    for res in asyncres:
+        res.wait()
+        runtime_per_iter += res.get()
+    worker_pool.close()
+    worker_pool.join()
+    worker_pool.terminate()
+
+    if start_script:
+        p_start.terminate()
+
+    if end_script:
+        p_end = Popen([end_script, str(num_copies)])
+        p_end.wait()
+
+    return runtime_per_iter / num_copies
+
+
 def benchmark_cpu_requests(
-    requests:
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
 ) -> float:
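
The docstrings above describe the new multi-process CPU path: each copy replays the whole request list, workers rendezvous on cpu_bm_barrier, and the result is the mean per-iteration time across copies. A minimal sketch of how it might be driven, where requests (a list[TBERequest]) and emb_module (a CPU TBE module such as an IntNBitTableBatchedEmbeddingBagsCodegen instance) are placeholders, not part of this diff:

from fbgemm_gpu.tbe.bench import benchmark_cpu_requests_mp

# 4 worker processes replay the same dataset; each first runs requests[0]
# a few times as warmup, then all copies start together behind the barrier.
avg_s = benchmark_cpu_requests_mp(
    requests,
    emb_module,
    num_warmups=5,
    num_copies=4,
    start_script="",  # optionally a script that starts perf-counter collection
    end_script="",    # optionally a script that post-processes the counters
)
print(f"avg time per iteration: {avg_s * 1.0e6:.1f} us")
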
@@ -59,7 +233,7 @@ def benchmark_cpu_requests(
 
 
 def benchmark_requests( # noqa: C901
-    requests:
+    requests: list[TBERequest],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,
@@ -126,7 +300,7 @@ def benchmark_requests( # noqa: C901
                 _ = torch.rand(
                     flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                     dtype=torch.float,
-                    device=
+                    device=get_device(),
                 )
             start_events[it].record()
 
@@ -168,8 +342,123 @@ def benchmark_requests( # noqa: C901
     return median_time if check_median else avg_time
 
 
+def benchmark_requests_with_spec( # noqa: C901
+    requests: list[TBERequest],
+    func: Callable[
+        [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
+        torch.Tensor,
+    ],
+    flush_gpu_cache_size_mb: int = 0,
+    check_median: bool = False,
+    num_warmups: int = 0,
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+    # Used to label benchmark iterations differently in nsys profile result
+    # so that we can compare performance of two different models for example.
+    # If empty string is provided, it won't have any effect.
+    nvtx_range: str = "",
+    # Can be used to clear model's stats after warmup for example.
+    callback_after_warmup: Optional[Callable[[], None]] = None,
+    periodic_logs: bool = False,
+    warmup_ms: Optional[int] = None,
+    iters: int = -1,
+) -> float:
+    times = []
+    # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
+    # for the first kernel if warmup_ms > 0
+    # warmup_ms is prioritized over num_warmups
+
+    if warmup_ms is None:
+        num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
+
+    # warm-up the GPU before profiling
+    bench_warmup_with_spec(
+        requests[0],
+        # pyre-ignore[6]
+        warmup_ms,
+        num_warmups,
+        lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: func(
+            indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
+        ),
+        bwd_only=bwd_only,
+        grad=grad,
+    )
+
+    if callback_after_warmup is not None:
+        callback_after_warmup()
+
+    num_reqs = len(requests)
+    iters = num_reqs if iters == -1 else iters
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+        end_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
+    else:
+        start_events = []
+        end_events = []
+
+    for it in range(iters):
+        req = requests[it % num_reqs]
+
+        indices, offsets, weights, batch_size_per_feature_per_rank = req.unpack_4()
+        # logging.info(
+        #     f"[Benchmark Request] batch_size_per_feature_per_rank {batch_size_per_feature_per_rank} {indices.device}"
+        # )
+
+        if bwd_only:
+            # Run forward before profiling if does backward only
+            out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
+        start_time = time.time()
+        if torch.cuda.is_available():
+            if flush_gpu_cache_size_mb:
+                _ = torch.rand(
+                    flush_gpu_cache_size_mb * 1024 * 1024 // 4,
+                    dtype=torch.float,
+                    device=get_device(),
+                )
+            start_events[it].record()
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_push(f"{nvtx_range}-{it}")
+
+        if bwd_only:
+            out.backward(grad)
+        else:
+            func(indices, offsets, weights, batch_size_per_feature_per_rank)
+
+        if nvtx_range:
+            torch.cuda.nvtx.range_pop()
+
+        if torch.cuda.is_available():
+            end_events[it].record()
+        else:
+            it_time = time.time() - start_time
+            times.append(it_time)
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        times = [
+            start.elapsed_time(end) * 1.0e-3
+            for start, end in zip(start_events, end_events)
+        ]
+
+    if periodic_logs:
+        for it in range(100, iters + 1, 100):
+            times_ = times[0:it]
+            avg_time = sum(times_) / len(times_) * 1.0e6
+            last_100_avg = sum(times_[-100:]) / 100 * 1.0e6
+            logging.info(
+                f"Iteration [{it}/{len(requests)}]: Last 100: {last_100_avg:.2f} us, Running avg: {avg_time:.2f} us"
+            )
+
+    avg_time = sum(times) / iters
+    median_time = statistics.median(times)
+    return median_time if check_median else avg_time
+
+
 def benchmark_requests_refer(
-    requests:
+    requests: list[TBERequest],
     T: int,
     B: int,
     L: int,
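
benchmark_requests_with_spec mirrors benchmark_requests but passes each request's batch_size_per_feature_per_rank spec through to the functor, prioritizes warmup_ms over num_warmups, and can label iterations with an nvtx range. A hedged sketch of a forward-only measurement, where emb_op and requests are placeholders and the keyword names on emb_op's forward are assumptions rather than taken from this diff:

from fbgemm_gpu.tbe.bench import benchmark_requests_with_spec

# Assumes emb_op is a TBE module whose forward accepts per_sample_weights and
# batch_size_per_feature_per_rank keywords, and requests is a list[TBERequest]
# carrying the VBE spec in its fourth slot (unpack_4).
t_fwd = benchmark_requests_with_spec(
    requests,
    lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: emb_op(
        indices,
        offsets,
        per_sample_weights=per_sample_weights,
        batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
    ),
    num_warmups=10,
    nvtx_range="fwd",  # labels each timed iteration in an nsys profile
    iters=200,         # replay the request list round-robin for 200 iterations
)
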
@@ -208,7 +497,7 @@ def benchmark_requests_refer(
         _ = torch.rand(
             flush_gpu_cache_size_mb * 1024 * 1024 // 4,
             dtype=torch.float,
-            device=
+            device=get_device(),
         )
         torch.cuda.synchronize()
         start_event.record()
@@ -261,12 +550,12 @@
 
 
 def benchmark_pipelined_requests(
-    requests:
+    requests: list[TBERequest],
     func1: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
     func2: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
    flush_gpu_cache_size_mb: int = 0,
     check_median: bool = False,
-) ->
+) -> tuple[float, float]:
     torch.cuda.synchronize()
     start_events = [
         (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True))
@@ -282,7 +571,7 @@ def benchmark_pipelined_requests(
             _ = torch.rand(
                 flush_gpu_cache_size_mb * 1024 * 1024 // 4,
                 dtype=torch.float,
-                device=
+                device=get_device(),
             )
             torch.cuda.synchronize()
             start_event[0].record()
@@ -318,10 +607,10 @@
 
 
 def benchmark_vbe(
-    requests:
+    requests: list[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
     func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
     num_warmups: int = 0,
-) ->
+) -> tuple[float, float]:
     """
     A benchmark function to return the average execution time in seconds of
     forward and backward of VBE kernels.
|