fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -31,15 +31,18 @@ except Exception:
31
31
 
32
32
  # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
33
33
  import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
34
+ from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
34
35
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
35
36
 
36
37
 
37
38
  def generate_vbe_metadata(
38
39
  offsets: Tensor,
39
- batch_size_per_feature_per_rank: Optional[List[List[int]]],
40
+ batch_size_per_feature_per_rank: Optional[list[list[int]]],
40
41
  pooling_mode: PoolingMode,
41
42
  feature_dims_cpu: Tensor,
42
43
  device: torch.device,
44
+ vbe_output: Optional[Tensor] = None,
45
+ vbe_output_offsets: Optional[Tensor] = None,
43
46
  ) -> invokers.lookup_args.VBEMetadata:
44
47
  """
45
48
  Generate VBE metadata based on batch_size_per_feature_per_rank.
@@ -133,6 +136,8 @@ def generate_vbe_metadata(
133
136
  max_B_feature_rank=max_B_feature_rank,
134
137
  # pyre-ignore
135
138
  output_size=output_size,
139
+ vbe_output=vbe_output,
140
+ vbe_output_offsets=vbe_output_offsets,
136
141
  )
137
142
  else:
138
143
  vbe_metadata = invokers.lookup_args.VBEMetadata(
@@ -142,5 +147,43 @@ def generate_vbe_metadata(
142
147
  max_B=-1,
143
148
  max_B_feature_rank=-1,
144
149
  output_size=-1,
150
+ vbe_output=None,
151
+ vbe_output_offsets=None,
145
152
  )
146
153
  return vbe_metadata
154
+
155
+
156
+ def check_allocated_vbe_output(
157
+ output_dtype: int,
158
+ batch_size_per_feature_per_rank: Optional[List[List[int]]],
159
+ vbe_output: Optional[Tensor] = None,
160
+ vbe_output_offsets: Optional[Tensor] = None,
161
+ ) -> None:
162
+ assert (
163
+ batch_size_per_feature_per_rank is not None
164
+ ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
165
+ assert (
166
+ vbe_output is not None
167
+ ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
168
+ assert (
169
+ vbe_output_offsets is not None
170
+ ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
171
+ num_features = len(batch_size_per_feature_per_rank)
172
+ num_ranks = len(batch_size_per_feature_per_rank[0])
173
+ assert vbe_output_offsets.shape == torch.Size(
174
+ [num_ranks, num_features]
175
+ ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
176
+ assert (
177
+ vbe_output.dim() == 1
178
+ ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
179
+ assert (
180
+ vbe_output_offsets.device == vbe_output.device
181
+ ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
182
+ _output_dtype = sparse_type_int_to_dtype(output_dtype)
183
+ assert (
184
+ vbe_output.dtype == _output_dtype
185
+ ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
186
+ assert (
187
+ vbe_output_offsets.is_contiguous()
188
+ ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
189
+ assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
@@ -16,7 +16,6 @@ from fbgemm_gpu.tbe.ssd import ( # noqa: F401
16
16
  SSDTableBatchedEmbeddingBags, # noqa: F401
17
17
  )
18
18
 
19
-
20
19
  warnings.warn( # noqa: B028
21
20
  f"""\033[93m
22
21
  The Python module {__name__} is now DEPRECATED and will be removed in the
@@ -12,15 +12,19 @@ import torch
12
12
  from .bench_config import ( # noqa F401
13
13
  TBEBenchmarkingConfig,
14
14
  TBEBenchmarkingConfigLoader,
15
+ TBEBenchmarkingHelperText,
15
16
  )
16
17
  from .bench_runs import ( # noqa F401
17
18
  bench_warmup,
18
19
  benchmark_cpu_requests,
20
+ benchmark_cpu_requests_mp,
19
21
  benchmark_pipelined_requests,
20
22
  benchmark_requests,
21
23
  benchmark_requests_refer,
24
+ benchmark_requests_with_spec,
22
25
  benchmark_vbe,
23
26
  )
27
+ from .benchmark_click_interface import TbeBenchClickInterface # noqa F401
24
28
  from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader # noqa F401
25
29
  from .eval_compression import ( # noqa F401
26
30
  benchmark_eval_compression,
@@ -28,13 +32,20 @@ from .eval_compression import ( # noqa F401
28
32
  )
29
33
  from .reporter import BenchmarkReporter # noqa F401
30
34
  from .tbe_data_config import TBEDataConfig # noqa F401
31
- from .tbe_data_config_loader import TBEDataConfigLoader # noqa F401
35
+ from .tbe_data_config_loader import ( # noqa F401
36
+ TBEDataConfigHelperText,
37
+ TBEDataConfigLoader,
38
+ )
32
39
  from .tbe_data_config_param_models import ( # noqa F401
33
40
  BatchParams,
34
41
  IndicesParams,
35
42
  PoolingParams,
36
43
  )
37
- from .utils import fill_random_scale_bias # noqa F401
44
+ from .utils import ( # noqa F401
45
+ check_oom,
46
+ fill_random_scale_bias,
47
+ generate_merged_output_and_offsets,
48
+ )
38
49
 
39
50
  try:
40
51
  torch.ops.load_library(
@@ -9,7 +9,8 @@
9
9
 
10
10
  import dataclasses
11
11
  import json
12
- from typing import Any, Dict, Optional
12
+ from enum import Enum
13
+ from typing import Any, Optional
13
14
 
14
15
  import click
15
16
 
@@ -28,10 +29,12 @@ class TBEBenchmarkingConfig:
28
29
  export_trace: bool
29
30
  # The path for exporting the trace
30
31
  trace_url: Optional[str]
32
+ # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
33
+ upload_perf_data: bool
31
34
 
32
35
  @classmethod
33
36
  # pyre-ignore [3]
34
- def from_dict(cls, data: Dict[str, Any]):
37
+ def from_dict(cls, data: dict[str, Any]):
35
38
  return cls(**data)
36
39
 
37
40
  @classmethod
@@ -39,7 +42,7 @@ class TBEBenchmarkingConfig:
39
42
  def from_json(cls, data: str):
40
43
  return cls.from_dict(json.loads(data))
41
44
 
42
- def dict(self) -> Dict[str, Any]:
45
+ def dict(self) -> dict[str, Any]:
43
46
  return dataclasses.asdict(self)
44
47
 
45
48
  def json(self, format: bool = False) -> str:
@@ -56,6 +59,23 @@ class TBEBenchmarkingConfig:
56
59
  return self
57
60
 
58
61
 
62
+ @dataclasses.dataclass(frozen=True)
63
+ class TBEBenchmarkingHelperText(Enum):
64
+ BENCH_ITERATIONS = "Number of benchmark iterations to run"
65
+ BENCH_NUM_REQUESTS = "Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used"
66
+ BENCH_WARMUP_ITERATIONS = (
67
+ "Number of warmup iterations to run before making measurements"
68
+ )
69
+ BENCH_FLUSH_GPU_CACHE_SIZE = (
70
+ "Amount of memory to use for flushing the GPU cache after each iteration (MB)"
71
+ )
72
+ BENCH_EXPORT_TRACE = (
73
+ "If set, trace will be exported to the path specified in trace url"
74
+ )
75
+ BENCH_TRACE_URL = "The path for exporting the trace"
76
+ BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
77
+
78
+
59
79
  class TBEBenchmarkingConfigLoader:
60
80
  @classmethod
61
81
  # pyre-ignore [2]
@@ -65,38 +85,44 @@ class TBEBenchmarkingConfigLoader:
65
85
  "--bench-iterations",
66
86
  type=int,
67
87
  default=100,
68
- help="Number of benchmark iterations to run",
88
+ help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value,
69
89
  ),
70
90
  click.option(
71
91
  "--bench-num-requests",
72
92
  type=int,
73
93
  default=-1,
74
- help="Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used",
94
+ help=TBEBenchmarkingHelperText.BENCH_NUM_REQUESTS.value,
75
95
  ),
76
96
  click.option(
77
97
  "--bench-warmup-iterations",
78
98
  type=int,
79
99
  default=0,
80
- help="Number of warmup iterations to run before making measurements",
100
+ help=TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value,
81
101
  ),
82
102
  click.option(
83
103
  "--bench-flush-gpu-cache-size",
84
104
  type=int,
85
105
  default=0,
86
- help="Amount of memory to use for flushing the GPU cache after each iteration (MB)",
106
+ help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
87
107
  ),
88
108
  click.option(
89
109
  "--bench-export-trace",
90
110
  is_flag=True,
91
111
  default=False,
92
- help="If set, a trace will be exported",
112
+ help=TBEBenchmarkingHelperText.BENCH_EXPORT_TRACE.value,
93
113
  ),
94
114
  click.option(
95
115
  "--bench-trace-url",
96
116
  type=str,
97
117
  required=False,
98
118
  default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
99
- help="The path for exporting the trace",
119
+ help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
120
+ ),
121
+ click.option(
122
+ "--upload-perf-data",
123
+ is_flag=True,
124
+ default=False,
125
+ help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
100
126
  ),
101
127
  ]
102
128
 
@@ -114,6 +140,7 @@ class TBEBenchmarkingConfigLoader:
114
140
  flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
115
141
  export_trace = params["bench_export_trace"]
116
142
  trace_url = params["bench_trace_url"]
143
+ upload_perf_data = params["upload_perf_data"]
117
144
 
118
145
  # Default the number of TBE requests to number of iterations specified
119
146
  num_requests = iterations if num_requests == -1 else num_requests
@@ -125,4 +152,5 @@ class TBEBenchmarkingConfigLoader:
125
152
  flush_gpu_cache_size,
126
153
  export_trace,
127
154
  trace_url,
155
+ upload_perf_data,
128
156
  ).validate()
@@ -1,3 +1,4 @@
1
+ #!/usr/bin/env python3
1
2
  # Copyright (c) Meta Platforms, Inc. and affiliates.
2
3
  # All rights reserved.
3
4
  #
@@ -8,12 +9,16 @@
8
9
 
9
10
  import logging
10
11
  import statistics
12
+ import threading
11
13
  import time
12
- from typing import Callable, List, Optional, Tuple
14
+ from subprocess import Popen
15
+ from typing import Callable, Optional
13
16
 
14
17
  import torch
15
18
 
16
- from fbgemm_gpu.tbe.utils import b_indices, TBERequest # noqa: F401
19
+ # fmt:skip
20
+ from fbgemm_gpu.tbe.utils import b_indices, TBERequest
21
+ from fbgemm_gpu.tbe.utils.common import get_device
17
22
 
18
23
  logging.basicConfig(level=logging.DEBUG)
19
24
 
@@ -40,8 +45,177 @@ def bench_warmup(
40
45
  out.backward(grad)
41
46
 
42
47
 
48
+ def bench_warmup_with_spec(
49
+ request: TBERequest,
50
+ warmup_ms: int,
51
+ warmup_runs: int,
52
+ func: Callable[
53
+ [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
54
+ torch.Tensor,
55
+ ],
56
+ bwd_only: bool = False,
57
+ grad: Optional[torch.Tensor] = None,
58
+ ) -> None:
59
+ indices, offsets, weights, batch_size_per_feature_per_rank = request.unpack_4()
60
+ if warmup_ms:
61
+ start_time_ms = time.time() * 1000
62
+ while time.time() * 1000 - start_time_ms < warmup_ms:
63
+ out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
64
+ if bwd_only:
65
+ out.backward(grad)
66
+ else:
67
+ for _ in range(warmup_runs):
68
+ out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
69
+ if bwd_only:
70
+ out.backward(grad)
71
+
72
+
73
+ class BMBarrier:
74
+
75
+ def __init__(self) -> None:
76
+ self.bar: Optional[threading.Barrier] = None
77
+
78
+ def create_barrier(self, party_size: int) -> None:
79
+ if self.bar is not None:
80
+ self.bar.reset()
81
+ self.bar = None
82
+ self.bar = torch.multiprocessing.Barrier(party_size)
83
+
84
+ def wait(self) -> None:
85
+ if self.bar is not None:
86
+ self.bar.wait()
87
+
88
+
89
+ # This barrier ensures all CPU TBE workers start the embedding workload
90
+ # together so that we get the most accurate measurement. This needs to be
91
+ # a global variable because it will be shared among worker processes.
92
+ cpu_bm_barrier = BMBarrier()
93
+
94
+
95
+ def cpu_tbe_worker(
96
+ requests_: list[TBERequest],
97
+ func_: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
98
+ use_barrier: bool = False,
99
+ ) -> float:
100
+ """
101
+ Worker function to process CPU TBE workload.
102
+
103
+ Args:
104
+ requests_ (List[TBERequest]): A list of TBERequest objects to be processed. Namely, the dataset.
105
+ func_ (Callable[[Tensor, Tensor, Optional[Tensor]], Tensor]):
106
+ The function to process each request, usually the `.forward()` method
107
+ on the embedding module instance.
108
+ use_barrier (bool, optional): Whether to use a barrier to synchronize the
109
+ start of embedding workload. Defaults to False.
110
+
111
+ Returns:
112
+ float: The average runtime per iteration in seconds.
113
+ """
114
+ import time
115
+
116
+ if use_barrier:
117
+ cpu_bm_barrier.wait()
118
+
119
+ start_time = time.perf_counter()
120
+ for req in requests_:
121
+ func_(*(req.unpack_3()))
122
+ end_time = time.perf_counter()
123
+
124
+ return (end_time - start_time) / len(requests_)
125
+
126
+
127
+ def benchmark_cpu_requests_mp(
128
+ requests: list[TBERequest],
129
+ emb_module: torch.nn.Module,
130
+ num_warmups: int = 0,
131
+ num_copies: int = 1,
132
+ start_script: str = "",
133
+ end_script: str = "",
134
+ ) -> float:
135
+ """
136
+ CPU benchmark request handler with multi-processing support
137
+
138
+ Args:
139
+ requests (List[TBERequest]): A list of TBERequest objects to be processed.
140
+ emb_module (torch.nn.Module): The embedding module to be used for processing requests,
141
+ for example, an instance of `IntNBitTableBatchedEmbeddingBagsCodegen` module.
142
+ num_warmups (int, optional): Number of warm-up iterations to perform before benchmarking. Defaults to 0.
143
+ num_copies (int, optional): Number of parallel copies of the workloads. By `copies`,
144
+ we mean the number of parallel processes working on the same dataset described in `requests`.
145
+ Defaults to 1 (which means single threaded). Increasing this will enable the benchmark to use
146
+ more CPU cores and push higher memory bandwidth.
147
+ start_script (str, optional): Path to a script to be executed before starting the benchmark.
148
+ Defaults to empty (not running anything). This can be used to collect perf counters.
149
+ The script will be terminated upon benchmark finishing.
150
+ end_script (str, optional): Path to a script to be executed after completing the benchmark.
151
+ Defaults to empty (not running anything). This can be used to post-process perf counters.
152
+
153
+ Returns:
154
+ float: The average runtime per iteration in seconds.
155
+
156
+ """
157
+ import os
158
+
159
+ strategy = os.environ.get("PYTORCH_SHARE_STRATEGY")
160
+ current_strategy = torch.multiprocessing.get_sharing_strategy()
161
+ if strategy is not None and current_strategy != strategy:
162
+ torch.multiprocessing.set_sharing_strategy(strategy)
163
+
164
+ cpu_bm_barrier.create_barrier(num_copies)
165
+ worker_pool = torch.multiprocessing.Pool(num_copies)
166
+
167
+ if num_warmups > 0:
168
+ asyncres = []
169
+ for _ in range(num_copies):
170
+ asyncres.append(
171
+ worker_pool.apply_async(
172
+ cpu_tbe_worker,
173
+ args=(
174
+ [requests[0]],
175
+ emb_module.forward,
176
+ False,
177
+ num_warmups,
178
+ ),
179
+ )
180
+ )
181
+ for res in asyncres:
182
+ res.wait()
183
+
184
+ if start_script:
185
+ p_start = Popen([start_script, str(num_copies)])
186
+
187
+ asyncres = []
188
+ for _ in range(num_copies):
189
+ asyncres.append(
190
+ worker_pool.apply_async(
191
+ cpu_tbe_worker,
192
+ args=(
193
+ requests,
194
+ emb_module.forward,
195
+ True,
196
+ ),
197
+ )
198
+ )
199
+ runtime_per_iter = 0.0
200
+ for res in asyncres:
201
+ res.wait()
202
+ runtime_per_iter += res.get()
203
+ worker_pool.close()
204
+ worker_pool.join()
205
+ worker_pool.terminate()
206
+
207
+ if start_script:
208
+ p_start.terminate()
209
+
210
+ if end_script:
211
+ p_end = Popen([end_script, str(num_copies)])
212
+ p_end.wait()
213
+
214
+ return runtime_per_iter / num_copies
215
+
216
+
43
217
  def benchmark_cpu_requests(
44
- requests: List[TBERequest],
218
+ requests: list[TBERequest],
45
219
  func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
46
220
  num_warmups: int = 0,
47
221
  ) -> float:
@@ -59,7 +233,7 @@ def benchmark_cpu_requests(
59
233
 
60
234
 
61
235
  def benchmark_requests( # noqa: C901
62
- requests: List[TBERequest],
236
+ requests: list[TBERequest],
63
237
  func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
64
238
  flush_gpu_cache_size_mb: int = 0,
65
239
  check_median: bool = False,
@@ -126,7 +300,7 @@ def benchmark_requests( # noqa: C901
126
300
  _ = torch.rand(
127
301
  flush_gpu_cache_size_mb * 1024 * 1024 // 4,
128
302
  dtype=torch.float,
129
- device="cuda",
303
+ device=get_device(),
130
304
  )
131
305
  start_events[it].record()
132
306
 
@@ -168,8 +342,123 @@ def benchmark_requests( # noqa: C901
168
342
  return median_time if check_median else avg_time
169
343
 
170
344
 
345
+ def benchmark_requests_with_spec( # noqa: C901
346
+ requests: list[TBERequest],
347
+ func: Callable[
348
+ [torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]],
349
+ torch.Tensor,
350
+ ],
351
+ flush_gpu_cache_size_mb: int = 0,
352
+ check_median: bool = False,
353
+ num_warmups: int = 0,
354
+ bwd_only: bool = False,
355
+ grad: Optional[torch.Tensor] = None,
356
+ # Used to label benchmark iterations differently in nsys profile result
357
+ # so that we can compare performance of two different models for example.
358
+ # If empty string is provided, it won't have any effect.
359
+ nvtx_range: str = "",
360
+ # Can be used to clear model's stats after warmup for example.
361
+ callback_after_warmup: Optional[Callable[[], None]] = None,
362
+ periodic_logs: bool = False,
363
+ warmup_ms: Optional[int] = None,
364
+ iters: int = -1,
365
+ ) -> float:
366
+ times = []
367
+ # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
368
+ # for the first kernel if warmup_ms > 0
369
+ # warmup_ms is prioritized over num_warmups
370
+
371
+ if warmup_ms is None:
372
+ num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
373
+
374
+ # warm-up the GPU before profiling
375
+ bench_warmup_with_spec(
376
+ requests[0],
377
+ # pyre-ignore[6]
378
+ warmup_ms,
379
+ num_warmups,
380
+ lambda indices, offsets, per_sample_weights, batch_size_per_feature_per_rank: func(
381
+ indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
382
+ ),
383
+ bwd_only=bwd_only,
384
+ grad=grad,
385
+ )
386
+
387
+ if callback_after_warmup is not None:
388
+ callback_after_warmup()
389
+
390
+ num_reqs = len(requests)
391
+ iters = num_reqs if iters == -1 else iters
392
+
393
+ if torch.cuda.is_available():
394
+ torch.cuda.synchronize()
395
+ start_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
396
+ end_events = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
397
+ else:
398
+ start_events = []
399
+ end_events = []
400
+
401
+ for it in range(iters):
402
+ req = requests[it % num_reqs]
403
+
404
+ indices, offsets, weights, batch_size_per_feature_per_rank = req.unpack_4()
405
+ # logging.info(
406
+ # f"[Benchmark Request] batch_size_per_feature_per_rank {batch_size_per_feature_per_rank} {indices.device}"
407
+ # )
408
+
409
+ if bwd_only:
410
+ # Run forward before profiling when benchmarking backward only
411
+ out = func(indices, offsets, weights, batch_size_per_feature_per_rank)
412
+ start_time = time.time()
413
+ if torch.cuda.is_available():
414
+ if flush_gpu_cache_size_mb:
415
+ _ = torch.rand(
416
+ flush_gpu_cache_size_mb * 1024 * 1024 // 4,
417
+ dtype=torch.float,
418
+ device=get_device(),
419
+ )
420
+ start_events[it].record()
421
+
422
+ if nvtx_range:
423
+ torch.cuda.nvtx.range_push(f"{nvtx_range}-{it}")
424
+
425
+ if bwd_only:
426
+ out.backward(grad)
427
+ else:
428
+ func(indices, offsets, weights, batch_size_per_feature_per_rank)
429
+
430
+ if nvtx_range:
431
+ torch.cuda.nvtx.range_pop()
432
+
433
+ if torch.cuda.is_available():
434
+ end_events[it].record()
435
+ else:
436
+ it_time = time.time() - start_time
437
+ times.append(it_time)
438
+
439
+ if torch.cuda.is_available():
440
+ torch.cuda.synchronize()
441
+ times = [
442
+ start.elapsed_time(end) * 1.0e-3
443
+ for start, end in zip(start_events, end_events)
444
+ ]
445
+
446
+ if periodic_logs:
447
+ for it in range(100, iters + 1, 100):
448
+ times_ = times[0:it]
449
+ avg_time = sum(times_) / len(times_) * 1.0e6
450
+ last_100_avg = sum(times_[-100:]) / 100 * 1.0e6
451
+ logging.info(
452
+ f"Iteration [{it}/{len(requests)}]: Last 100: {last_100_avg:.2f} us, Running avg: {avg_time:.2f} us"
453
+ )
454
+
455
+ avg_time = sum(times) / iters
456
+ median_time = statistics.median(times)
457
+ return median_time if check_median else avg_time
458
+
459
+
171
460
  def benchmark_requests_refer(
172
- requests: List[TBERequest],
461
+ requests: list[TBERequest],
173
462
  T: int,
174
463
  B: int,
175
464
  L: int,
@@ -208,7 +497,7 @@ def benchmark_requests_refer(
208
497
  _ = torch.rand(
209
498
  flush_gpu_cache_size_mb * 1024 * 1024 // 4,
210
499
  dtype=torch.float,
211
- device="cuda",
500
+ device=get_device(),
212
501
  )
213
502
  torch.cuda.synchronize()
214
503
  start_event.record()
@@ -261,12 +550,12 @@ def benchmark_requests_refer(
261
550
 
262
551
 
263
552
  def benchmark_pipelined_requests(
264
- requests: List[TBERequest],
553
+ requests: list[TBERequest],
265
554
  func1: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
266
555
  func2: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
267
556
  flush_gpu_cache_size_mb: int = 0,
268
557
  check_median: bool = False,
269
- ) -> Tuple[float, float]:
558
+ ) -> tuple[float, float]:
270
559
  torch.cuda.synchronize()
271
560
  start_events = [
272
561
  (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True))
@@ -282,7 +571,7 @@ def benchmark_pipelined_requests(
282
571
  _ = torch.rand(
283
572
  flush_gpu_cache_size_mb * 1024 * 1024 // 4,
284
573
  dtype=torch.float,
285
- device="cuda",
574
+ device=get_device(),
286
575
  )
287
576
  torch.cuda.synchronize()
288
577
  start_event[0].record()
@@ -318,10 +607,10 @@ def benchmark_pipelined_requests(
318
607
 
319
608
 
320
609
  def benchmark_vbe(
321
- requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
610
+ requests: list[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
322
611
  func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
323
612
  num_warmups: int = 0,
324
- ) -> Tuple[float, float]:
613
+ ) -> tuple[float, float]:
325
614
  """
326
615
  A benchmark function to return the average execution time in seconds of
327
616
  forward and backward of VBE kernels.