fbgemm-gpu-genai-nightly 2025.12.19 (cp310-cp310-manylinux_2_28_x86_64.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of fbgemm-gpu-genai-nightly has been flagged as a potentially problematic release.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
@@ -0,0 +1,146 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+
+ from typing import Optional
+
+ import torch
+ from torch import Tensor
+
+ try:
+     try:
+         from torch.compiler import is_compiling
+
+         def is_torchdynamo_compiling() -> bool:  # type: ignore[misc]
+             # at least one test fails if we import is_compiling as a different name
+             return is_compiling()
+
+     except Exception:
+         # torch.compiler.is_compiling is not available in torch 1.10
+         from torch._dynamo import is_compiling as is_torchdynamo_compiling
+ except Exception:
+
+     def is_torchdynamo_compiling() -> bool:  # type: ignore[misc]
+         return False
+
+
+ # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
+ import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+ from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
+
+
+ def generate_vbe_metadata(
+     offsets: Tensor,
+     batch_size_per_feature_per_rank: Optional[list[list[int]]],
+     pooling_mode: PoolingMode,
+     feature_dims_cpu: Tensor,
+     device: torch.device,
+ ) -> invokers.lookup_args.VBEMetadata:
+     """
+     Generate VBE metadata based on batch_size_per_feature_per_rank.
+     Metadata includes:
+     1) B_offsets - A tensor that contains batch size offsets for each
+        feature
+     2) output_offsets_feature_rank - A tensor that contains output
+        offsets for each feature
+     3) B_offsets_rank_per_feature - A tensor that contains batch size
+        offsets for each feature and rank
+     4) max_B - The maximum batch size for all features
+     5) max_B_feature_rank - The maximum batch size for all ranks and
+        features
+     6) output_size - The output size (number of elements)
+     """
+     if batch_size_per_feature_per_rank is not None:
+         assert (
+             pooling_mode != PoolingMode.NONE
+         ), "Variable batch size TBE support is not enabled for PoolingMode.NONE"
+         # TODO: Add input check
+         zero_tensor = torch.zeros(1, device="cpu", dtype=torch.int32)
+
+         # Create B offsets
+         total_batch_size_per_feature = torch.tensor(
+             batch_size_per_feature_per_rank, dtype=torch.int32, device="cpu"
+         ).sum(dim=1)
+
+         max_B = total_batch_size_per_feature.max().item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(max_B)
+             torch._check(max_B < offsets.numel())
+
+         Bs = torch.concat([zero_tensor, total_batch_size_per_feature])
+         B_offsets = Bs.cumsum(dim=0).to(torch.int)
+
+         # Create output offsets
+         B_feature_rank = torch.tensor(
+             batch_size_per_feature_per_rank,
+             device="cpu",
+             dtype=torch.int64,
+         )
+         max_B_feature_rank = B_feature_rank.max().item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(max_B_feature_rank)
+             torch._check(max_B_feature_rank <= offsets.size(0))
+         output_sizes_feature_rank = B_feature_rank.transpose(
+             0, 1
+         ) * feature_dims_cpu.view(1, -1)
+         output_offsets_feature_rank = torch.concat(
+             [
+                 zero_tensor.to(torch.int64),
+                 output_sizes_feature_rank.flatten().cumsum(dim=0),
+             ]
+         )
+         output_size = output_offsets_feature_rank[-1].item()
+         if not torch.jit.is_scripting() and is_torchdynamo_compiling():
+             torch._check_is_size(output_size)
+
+         # TODO: Support INT8 output
+         # B_offsets_rank_per_feature is for rank and (b, t) mapping
+         B_offsets_rank_per_feature = (
+             torch.tensor(
+                 [
+                     [0] + batch_size_per_feature
+                     for batch_size_per_feature in batch_size_per_feature_per_rank
+                 ],
+                 device="cpu",
+                 dtype=torch.int32,
+             )
+             .cumsum(dim=1)
+             .to(torch.int)
+         )
+
+         B_offsets = B_offsets.to(device, non_blocking=True)
+         output_offsets_feature_rank = output_offsets_feature_rank.to(
+             device, non_blocking=True
+         )
+         B_offsets_rank_per_feature = B_offsets_rank_per_feature.to(
+             device, non_blocking=True
+         )
+
+         # TODO: Use int32 for B_offsets and int64 for output_offsets_feature_rank
+         vbe_metadata = invokers.lookup_args.VBEMetadata(
+             B_offsets=B_offsets,
+             output_offsets_feature_rank=output_offsets_feature_rank,
+             B_offsets_rank_per_feature=B_offsets_rank_per_feature,
+             # pyre-ignore
+             max_B=max_B,
+             # pyre-ignore
+             max_B_feature_rank=max_B_feature_rank,
+             # pyre-ignore
+             output_size=output_size,
+         )
+     else:
+         vbe_metadata = invokers.lookup_args.VBEMetadata(
+             B_offsets=None,
+             output_offsets_feature_rank=None,
+             B_offsets_rank_per_feature=None,
+             max_B=-1,
+             max_B_feature_rank=-1,
+             output_size=-1,
+         )
+     return vbe_metadata
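For orientation, a minimal usage sketch of generate_vbe_metadata (not part of the wheel; the input values are hypothetical, and the import path assumes this hunk is fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py, the only +146 file in the list above):

    import torch
    from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
    from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
        generate_vbe_metadata,
    )

    # Two features, two ranks: feature 0 has per-rank batch sizes [2, 3],
    # feature 1 has [1, 4], so each feature sees 5 rows in total.
    batch_size_per_feature_per_rank = [[2, 3], [1, 4]]
    offsets = torch.zeros(11, dtype=torch.int64)  # sum of batch sizes + 1
    feature_dims_cpu = torch.tensor([4, 8], dtype=torch.int64)  # embedding dims

    meta = generate_vbe_metadata(
        offsets=offsets,
        batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
        pooling_mode=PoolingMode.SUM,
        feature_dims_cpu=feature_dims_cpu,
        device=torch.device("cpu"),
    )
    print(meta.B_offsets)    # tensor([ 0,  5, 10], dtype=torch.int32)
    print(meta.max_B)        # 5
    print(meta.output_size)  # 60: cumsum of per-(rank, feature) output sizes

The output offsets follow from the function body: B_feature_rank is transposed to (rank, feature), multiplied by the feature dims ([[8, 8], [12, 32]]), flattened, and cumulatively summed, giving a total output size of 60 elements.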
@@ -0,0 +1,26 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ # pyre-ignore-all-errors[56]
+
+ import warnings
+
+ from fbgemm_gpu.tbe.ssd import (  # noqa: F401
+     ASSOC,  # noqa: F401
+     SSDIntNBitTableBatchedEmbeddingBags,  # noqa: F401
+     SSDTableBatchedEmbeddingBags,  # noqa: F401
+ )
+
+
+ warnings.warn(  # noqa: B028
+     f"""\033[93m
+     The Python module {__name__} is now DEPRECATED and will be removed in the
+     future. Users should import fbgemm_gpu.tbe.ssd into their scripts instead.
+     \033[0m""",
+     DeprecationWarning,
+ )
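The shim only re-exports and warns; migrating a caller is a one-line change (a sketch using the names re-exported above):

    # Deprecated path (still works, but emits the DeprecationWarning above):
    #   from fbgemm_gpu.ssd_split_table_batched_embeddings_ops import SSDTableBatchedEmbeddingBags
    # Replacement path, as instructed by the warning text:
    from fbgemm_gpu.tbe.ssd import ASSOC, SSDTableBatchedEmbeddingBags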
@@ -0,0 +1,6 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
@@ -0,0 +1,55 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+
+ import torch
+
+ from .bench_config import (  # noqa F401
+     TBEBenchmarkingConfig,
+     TBEBenchmarkingConfigLoader,
+     TBEBenchmarkingHelperText,
+ )
+ from .bench_runs import (  # noqa F401
+     bench_warmup,
+     benchmark_cpu_requests,
+     benchmark_cpu_requests_mp,
+     benchmark_pipelined_requests,
+     benchmark_requests,
+     benchmark_requests_refer,
+     benchmark_requests_with_spec,
+     benchmark_vbe,
+ )
+ from .benchmark_click_interface import TbeBenchClickInterface  # noqa F401
+ from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader  # noqa F401
+ from .eval_compression import (  # noqa F401
+     benchmark_eval_compression,
+     EvalCompressionBenchmarkOutput,
+ )
+ from .reporter import BenchmarkReporter  # noqa F401
+ from .tbe_data_config import TBEDataConfig  # noqa F401
+ from .tbe_data_config_loader import (  # noqa F401
+     TBEDataConfigHelperText,
+     TBEDataConfigLoader,
+ )
+ from .tbe_data_config_param_models import (  # noqa F401
+     BatchParams,
+     IndicesParams,
+     PoolingParams,
+ )
+ from .utils import fill_random_scale_bias  # noqa F401
+
+ try:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu/src/tbe/eeg:indices_estimator"
+     )
+ except Exception:
+     pass
+
+ #: The max number of heavy hitters, as defined in
+ #: fbgemm_gpu/src/tbe/eeg/indices_estimator.h
+ EEG_MAX_HEAVY_HITTERS: int = 20
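This __init__ acts as a facade: callers import the bench helpers from fbgemm_gpu.tbe.bench rather than from the individual submodules. A short sketch (assumes the wheel is installed; the field values are arbitrary, with names taken from bench_config.py below):

    from fbgemm_gpu.tbe.bench import (
        EEG_MAX_HEAVY_HITTERS,  # constant defined above
        TBEBenchmarkingConfig,  # re-exported from .bench_config
    )

    config = TBEBenchmarkingConfig(
        iterations=10,
        num_requests=10,
        warmup_iterations=2,
        flush_gpu_cache_size_mb=0,
        export_trace=False,
        trace_url=None,
        upload_perf_data=False,
    ).validate()
    print(config.json(format=True))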
@@ -0,0 +1,156 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import dataclasses
+ import json
+ from enum import Enum
+ from typing import Any, Optional
+
+ import click
+
+
+ @dataclasses.dataclass(frozen=True)
+ class TBEBenchmarkingConfig:
+     # Number of iterations
+     iterations: int
+     # Number of input TBE batches to generate for testing
+     num_requests: int
+     # Number of warmup iterations to run before making measurements
+     warmup_iterations: int
+     # Amount of memory to use for flushing the GPU cache after each iteration
+     flush_gpu_cache_size_mb: int
+     # If set, trace will be exported to the path specified in trace_url
+     export_trace: bool
+     # The path for exporting the trace
+     trace_url: Optional[str]
+     # If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba
+     upload_perf_data: bool
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_dict(cls, data: dict[str, Any]):
+         return cls(**data)
+
+     @classmethod
+     # pyre-ignore [3]
+     def from_json(cls, data: str):
+         return cls.from_dict(json.loads(data))
+
+     def dict(self) -> dict[str, Any]:
+         return dataclasses.asdict(self)
+
+     def json(self, format: bool = False) -> str:
+         return json.dumps(self.dict(), indent=(2 if format else -1), sort_keys=True)
+
+     # pyre-ignore [3]
+     def validate(self):
+         assert self.iterations > 0, "iterations must be positive"
+         assert self.num_requests > 0, "num_requests must be positive"
+         assert self.warmup_iterations >= 0, "warmup_iterations must be non-negative"
+         assert (
+             self.flush_gpu_cache_size_mb >= 0
+         ), "flush_gpu_cache_size_mb must be non-negative"
+         return self
+
+
+ class TBEBenchmarkingHelperText(Enum):
+     BENCH_ITERATIONS = "Number of benchmark iterations to run"
+     BENCH_NUM_REQUESTS = "Number of input batches to generate. If the value is smaller than the number of benchmark iterations, input batches will be re-used"
+     BENCH_WARMUP_ITERATIONS = (
+         "Number of warmup iterations to run before making measurements"
+     )
+     BENCH_FLUSH_GPU_CACHE_SIZE = (
+         "Amount of memory to use for flushing the GPU cache after each iteration (MB)"
+     )
+     BENCH_EXPORT_TRACE = (
+         "If set, trace will be exported to the path specified in trace url"
+     )
+     BENCH_TRACE_URL = "The path for exporting the trace"
+     BENCH_UPLOAD_PERF_DATA = "If set and export_trace is true, the benchmark will upload performance data from the trace to Scuba"
+
+
+ class TBEBenchmarkingConfigLoader:
+     @classmethod
+     # pyre-ignore [2]
+     def options(cls, func) -> click.Command:
+         options = [
+             click.option(
+                 "--bench-iterations",
+                 type=int,
+                 default=100,
+                 help=TBEBenchmarkingHelperText.BENCH_ITERATIONS.value,
+             ),
+             click.option(
+                 "--bench-num-requests",
+                 type=int,
+                 default=-1,
+                 help=TBEBenchmarkingHelperText.BENCH_NUM_REQUESTS.value,
+             ),
+             click.option(
+                 "--bench-warmup-iterations",
+                 type=int,
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_WARMUP_ITERATIONS.value,
+             ),
+             click.option(
+                 "--bench-flush-gpu-cache-size",
+                 type=int,
+                 default=0,
+                 help=TBEBenchmarkingHelperText.BENCH_FLUSH_GPU_CACHE_SIZE.value,
+             ),
+             click.option(
+                 "--bench-export-trace",
+                 is_flag=True,
+                 default=False,
+                 help=TBEBenchmarkingHelperText.BENCH_EXPORT_TRACE.value,
+             ),
+             click.option(
+                 "--bench-trace-url",
+                 type=str,
+                 required=False,
+                 default="{emb_op_type}_tbe_{phase}_trace_{ospid}.json",
+                 help=TBEBenchmarkingHelperText.BENCH_TRACE_URL.value,
+             ),
+             click.option(
+                 "--upload-perf-data",
+                 is_flag=True,
+                 default=False,
+                 help=TBEBenchmarkingHelperText.BENCH_UPLOAD_PERF_DATA.value,
+             ),
+         ]
+
+         for option in reversed(options):
+             func = option(func)
+         return func
+
+     @classmethod
+     def load(cls, context: click.Context) -> TBEBenchmarkingConfig:
+         params = context.params
+
+         iterations = params["bench_iterations"]
+         num_requests = params["bench_num_requests"]
+         warmup_iterations = params["bench_warmup_iterations"]
+         flush_gpu_cache_size = params["bench_flush_gpu_cache_size"]
+         export_trace = params["bench_export_trace"]
+         trace_url = params["bench_trace_url"]
+         upload_perf_data = params["upload_perf_data"]
+
+         # Default the number of TBE requests to the number of iterations specified
+         num_requests = iterations if num_requests == -1 else num_requests
+
+         return TBEBenchmarkingConfig(
+             iterations,
+             num_requests,
+             warmup_iterations,
+             flush_gpu_cache_size,
+             export_trace,
+             trace_url,
+             upload_perf_data,
+         ).validate()
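To show how the loader is meant to be wired up, here is a hypothetical click script (not part of the wheel) that applies options() as a decorator and then rebuilds the validated config from the parsed context:

    import click

    from fbgemm_gpu.tbe.bench import TBEBenchmarkingConfigLoader


    @click.command()
    @TBEBenchmarkingConfigLoader.options
    @click.pass_context
    def bench(ctx: click.Context, **kwargs) -> None:
        # load() pulls the parsed --bench-* values off ctx.params and validates them
        config = TBEBenchmarkingConfigLoader.load(ctx)
        click.echo(config.json(format=True))


    if __name__ == "__main__":
        bench()  # e.g. python bench.py --bench-iterations 10 --bench-export-trace

Note that options() applies the option decorators in reverse so that they appear in declaration order in --help, and load() defaults num_requests to the iteration count when --bench-num-requests is left at -1.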