triton-model-analyzer 1.48.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
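The dist-info entries close out the manifest: RECORD (204 lines) enumerates every installed file, and entry_points.txt (2 lines) is where the package's console script would be declared. As a quick local cross-check of the manifest above, the archive contents can be listed directly; the snippet below is a hypothetical illustration and assumes the wheel has been downloaded under its canonical filename.

```python
# Hypothetical cross-check of the manifest above: list the files inside the wheel.
# Assumes the wheel was downloaded locally under its canonical filename.
import zipfile

WHEEL = "triton_model_analyzer-1.48.0-py3-none-any.whl"

with zipfile.ZipFile(WHEEL) as whl:
    for name in sorted(whl.namelist()):
        print(name)
```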
model_analyzer/record/metrics_manager.py

@@ -0,0 +1,887 @@

```python
#!/usr/bin/env python3

# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import numba
import requests
from prometheus_client.parser import text_string_to_metric_families

from model_analyzer.config.generate.base_model_config_generator import (
    BaseModelConfigGenerator,
)
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.monitor.cpu_monitor import CPUMonitor
from model_analyzer.monitor.dcgm.dcgm_monitor import DCGMMonitor
from model_analyzer.monitor.remote_monitor import RemoteMonitor
from model_analyzer.output.file_writer import FileWriter
from model_analyzer.perf_analyzer.perf_analyzer import PerfAnalyzer
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.model.model_config_variant import ModelConfigVariant

from .record import Record, RecordType
from .record_aggregator import RecordAggregator

logger = logging.getLogger(LOGGER_NAME)


class MetricsManager:
    """
    This class handles the profiling
    categorization of metrics
    """

    metrics = [
        "perf_throughput",
        "perf_latency_avg",
        "perf_latency_p90",
        "perf_latency_p95",
        "perf_latency_p99",
        "perf_latency",
        "perf_client_response_wait",
        "perf_client_send_recv",
        "perf_server_queue",
        "perf_server_compute_input",
        "perf_server_compute_infer",
        "perf_server_compute_output",
        "gpu_used_memory",
        "gpu_free_memory",
        "gpu_utilization",
        "gpu_power_usage",
        "cpu_available_ram",
        "cpu_used_ram",
        "time_to_first_token_avg",
        "time_to_first_token_min",
        "time_to_first_token_max",
        "time_to_first_token_p99",
        "time_to_first_token_p95",
        "time_to_first_token_p90",
        "time_to_first_token_p75",
        "time_to_first_token_p50",
        "time_to_first_token_p25",
        "inter_token_latency_avg",
        "inter_token_latency_min",
        "inter_token_latency_max",
        "inter_token_latency_p99",
        "inter_token_latency_p95",
        "inter_token_latency_p90",
        "inter_token_latency_p75",
        "inter_token_latency_p50",
        "inter_token_latency_p25",
        "output_token_throughput",
    ]

    def __init__(self, config, client, server, gpus, result_manager, state_manager):
        """
        Parameters
        ----------
        config :ConfigCommandProfile
            The model analyzer's config
        client : TritonClient
            handle to the instance of Tritonclient to communicate with
            the server
        server : TritonServer
            Handle to the instance of Triton being used
        gpus: List of GPUDevices
            The gpus being used to profile
        result_manager : ResultManager
            instance that manages the result tables and
            adding results
        state_manager: AnalyzerStateManager
            manages the analyzer state
        """

        # Generate the output model repository path folder.
        self._output_model_repo_path = config.output_model_repository_path

        if len(config.profile_models) != len(
            set([model._model_name for model in config.profile_models])
        ):
            raise TritonModelAnalyzerException(
                f"Duplicate model names detected: "
                f"{[model._model_name for model in config.profile_models]}"
            )
        self._first_config_variant = {}
        self._config = config
        self._client = client
        self._server = server
        self._result_manager = result_manager
        self._state_manager = state_manager
        self._loaded_models = None

        self._cpu_warning_printed = False
        self._encountered_perf_analyzer_error = False

        (
            self._gpu_metrics,
            self._perf_metrics,
            self._llm_metrics,
            self._cpu_metrics,
        ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
        self._gpus = gpus
        self._init_state()

    def start_new_model(self):
        """Indicate that profiling of a new model is starting"""
        self._first_config_variant = {}

    def encountered_perf_analyzer_error(self) -> bool:
        return self._encountered_perf_analyzer_error

    def _init_state(self):
        """
        Sets MetricsManager object managed
        state variables in AnalyzerState
        """

        gpu_info = self._state_manager.get_state_variable("MetricsManager.gpu_info")

        if self._state_manager.starting_fresh_run() or gpu_info is None:
            gpu_info = {}

        for i in range(len(self._gpus)):
            if self._gpus[i].device_uuid() not in gpu_info:
                device_info = {}
                device = numba.cuda.list_devices()[i]
                device_info["name"] = str(device.name, encoding="utf-8")
                with device:
                    # convert bytes to GB
                    device_info["total_memory"] = (
                        numba.cuda.current_context().get_memory_info().total
                    )
                gpu_info[self._gpus[i].device_uuid()] = device_info

        self._state_manager.set_state_variable("MetricsManager.gpus", gpu_info)

    @staticmethod
    def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
        """
        Splits the metrics into groups based
        on how they are collected

        Returns
        -------
        (list,list,list,list)
            tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics
        """

        gpu_metrics, perf_metrics, llm_metrics, cpu_metrics = [], [], [], []
        # Separates metrics and objectives into related lists
        for metric in MetricsManager.get_metric_types(metric_tags):
            if metric in PerfAnalyzer.get_gpu_metrics():
                gpu_metrics.append(metric)
            elif metric in PerfAnalyzer.get_perf_metrics():
                perf_metrics.append(metric)
            elif metric in PerfAnalyzer.get_llm_metrics():
                llm_metrics.append(metric)
            elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
                cpu_metrics.append(metric)

        return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics

    def profile_server(self):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer
        Raises
        ------
        TritonModelAnalyzerException
        """

        capture_gpu_metrics = numba.cuda.is_available()
        self._start_monitors(capture_gpu_metrics=capture_gpu_metrics)
        time.sleep(self._config.duration_seconds)
        if capture_gpu_metrics or self._config.always_report_gpu_metrics:
            server_gpu_metrics = self._get_gpu_inference_metrics()
            self._result_manager.add_server_data(data=server_gpu_metrics)
        self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)

    def execute_run_config(
        self, run_config: RunConfig
    ) -> Optional[RunConfigMeasurement]:
        """
        Executes the RunConfig. Returns obtained measurement. Also sends
        measurement to the result manager
        """

        self._create_model_variants(run_config)

        # If this run config was already run, do not run again, just get the measurement
        measurement = self._get_measurement_if_config_duplicate(run_config)
        if measurement:
            logger.info("Existing measurement found for run config. Skipping profile")
            return measurement

        current_model_variants = run_config.model_variants_name()
        if current_model_variants != self._loaded_models:
            self._server.stop()
            self._server.start(env=run_config.triton_environment())

            if not self._load_model_variants(run_config):
                self._server.stop()
                self._loaded_models = None
                return None

            self._loaded_models = current_model_variants

        measurement = self.profile_models(run_config)

        return measurement

    def profile_models(self, run_config: RunConfig) -> Optional[RunConfigMeasurement]:
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing.

        Parameters
        ----------
        run_config : RunConfig
            RunConfig object corresponding to the models being profiled.

        Returns
        -------
        (dict of lists, list)
            The gpu specific and non gpu metrics
        """

        perf_output_writer = (
            None
            if not self._config.perf_output
            else FileWriter(self._config.perf_output_path)
        )
        capture_gpu_metrics = (
            self._config.always_report_gpu_metrics or not run_config.cpu_only()
        )

        self._print_run_config_info(run_config)

        self._start_monitors(capture_gpu_metrics=capture_gpu_metrics)

        perf_analyzer_metrics, model_gpu_metrics = self._run_perf_analyzer(
            run_config, perf_output_writer
        )

        if not perf_analyzer_metrics:
            self._stop_monitors(capture_gpu_metrics=capture_gpu_metrics)
            self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)
            return None

        # Get metrics for model inference and combine metrics that do not have GPU UUID
        if capture_gpu_metrics and not model_gpu_metrics:
            model_gpu_metrics = self._get_gpu_inference_metrics()
        model_cpu_metrics = self._get_cpu_inference_metrics()

        self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)

        run_config_measurement = None
        if model_gpu_metrics is not None and perf_analyzer_metrics is not None:
            run_config_measurement = RunConfigMeasurement(
                run_config.model_variants_name(), model_gpu_metrics
            )

            # Combine all per-model measurements into the RunConfigMeasurement
            #
            for model_run_config in run_config.model_run_configs():
                perf_config = model_run_config.perf_config()
                model_name = perf_config["model-name"]

                model_non_gpu_metrics = list(
                    perf_analyzer_metrics[model_name].values()
                ) + list(model_cpu_metrics.values())

                model_specific_pa_params = (
                    perf_config.extract_model_specific_parameters()
                )

                run_config_measurement.add_model_config_measurement(
                    perf_config["model-name"],
                    model_specific_pa_params,
                    model_non_gpu_metrics,
                )

            self._result_manager.add_run_config_measurement(
                run_config, run_config_measurement
            )

        return run_config_measurement

    def finalize(self):
        self._server.stop()

    def _create_model_variants(self, run_config: RunConfig) -> None:
        """
        Creates and fills all model variant directories
        """
        for mrc in run_config.model_run_configs():
            self._create_model_variant(
                original_name=mrc.model_name(),
                variant_config=mrc.model_config_variant(),
            )

            for composing_config_variant in mrc.composing_config_variants():
                variant_name = composing_config_variant.variant_name
                original_name = (
                    BaseModelConfigGenerator.extract_model_name_from_variant_name(
                        variant_name
                    )
                )

                self._create_model_variant(original_name, composing_config_variant)

                # Create a version with the original (no _config_#/default appended) name
                original_composing_config = (
                    BaseModelConfigGenerator.create_original_config_from_variant(
                        composing_config_variant.model_config
                    )
                )
                self._create_model_variant(
                    original_name,
                    ModelConfigVariant(original_composing_config, original_name),
                    ignore_first_config_variant=True,
                )

    def _create_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
        ignore_first_config_variant: bool = False,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with config
        """

        if self._config.triton_launch_mode != "remote":
            self._create_non_remote_mode_model_variant(
                original_name, variant_config, ignore_first_config_variant
            )
        else:
            self._create_remote_mode_model_variant(original_name, variant_config)

    def _create_non_remote_mode_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
        ignore_first_config_variant: bool = False,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with config
        """
        variant_name = variant_config.variant_name
        model_repository = self._config.model_repository

        original_model_dir = os.path.join(model_repository, original_name)
        new_model_dir = os.path.join(self._output_model_repo_path, variant_name)
        try:
            # Create the directory for the new model
            os.makedirs(new_model_dir, exist_ok=True)
            self._first_config_variant.setdefault(original_name, None)

            if ignore_first_config_variant:
                variant_config.model_config.write_config_to_file(
                    new_model_dir, original_model_dir, None
                )
            else:
                variant_config.model_config.write_config_to_file(
                    new_model_dir,
                    original_model_dir,
                    self._first_config_variant[original_name],
                )

            if self._first_config_variant[original_name] is None:
                self._first_config_variant[original_name] = os.path.join(
                    self._output_model_repo_path, variant_name
                )
        except FileExistsError:
            # Ignore if the file already exists
            pass

    def _create_remote_mode_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with only the config.pbtxt
        """
        variant_name = variant_config.variant_name
        new_model_dir = os.path.join(self._output_model_repo_path, variant_name)
        try:
            os.makedirs(new_model_dir, exist_ok=False)
            self._first_config_variant.setdefault(original_name, None)
            variant_config.model_config.write_config_to_file(
                model_path=new_model_dir,
                src_model_path=new_model_dir,
                first_variant_model_path=None,
            )
        except FileExistsError:
            # Ignore if the dir already exists
            pass

    def _load_model_variants(self, run_config: RunConfig) -> bool:
        """
        Loads all model variants in the client
        """
        for mrc in run_config.model_run_configs():
            # Load all composing model variants first, and then the parent model
            for composing_config_variant in mrc.composing_config_variants():
                if not self._load_model_variant(
                    variant_config=composing_config_variant
                ):
                    return False
            if not self._load_model_variant(variant_config=mrc.model_config_variant()):
                return False
        return True

    def _load_model_variant(self, variant_config: ModelConfigVariant) -> bool:
        """
        Conditionally loads a model variant in the client
        """
        remote = self._config.triton_launch_mode == "remote"
        c_api = self._config.triton_launch_mode == "c_api"
        disabled = self._config.reload_model_disable
        do_load = (remote and not disabled) or (not remote and not c_api)

        retval = True
        if do_load:
            retval = self._do_load_model_variant(variant_config)
        return retval

    def _do_load_model_variant(self, variant_config: ModelConfigVariant) -> bool:
        """
        Loads a model variant in the client
        """
        self._client.wait_for_server_ready(
            num_retries=self._config.client_max_retries,
            log_file=self._server.log_file(),
        )

        model_name = variant_config.model_config.get_field("name")
        variant_name = variant_config.variant_name
        config_str = variant_config.model_config.get_config_str()
        if (
            self._client.load_model(
                model_name=model_name,
                variant_name=variant_name,
                config_str=config_str,
            )
            == -1
        ):
            return False

        if (
            self._client.wait_for_model_ready(
                model_name=variant_config.model_config.get_field("name"),
                num_retries=self._config.client_max_retries,
            )
            == -1
        ):
            return False
        return True

    def _get_measurement_if_config_duplicate(self, run_config):
        """
        Checks whether this run config has measurements
        in the state manager's results object
        """

        models_name = run_config.models_name()
        model_variants_name = run_config.model_variants_name()
        key = run_config.representation()

        results = self._state_manager.get_state_variable("ResultManager.results")

        if not results.contains_model_variant(models_name, model_variants_name):
            return False

        measurements = results.get_model_variants_measurements_dict(
            models_name, model_variants_name
        )

        return measurements.get(key, None)

    def _start_monitors(self, capture_gpu_metrics=True):
        """
        Start any metrics monitors
        """

        self._gpu_monitor = None
        if capture_gpu_metrics:
            try:
                self._gpu_monitor = RemoteMonitor(
                    self._config.triton_metrics_url,
                    self._config.monitoring_interval,
                    self._gpu_metrics,
                )

                self._gpu_monitor.start_recording_metrics()
            except TritonModelAnalyzerException:
                self._destroy_monitors()
                raise
            finally:
                if (
                    not self._gpu_monitor.is_monitoring_connected()
                    and self._config.triton_launch_mode != "c_api"
                ):
                    raise TritonModelAnalyzerException(
                        f"Failed to connect to Tritonserver's GPU metrics monitor. "
                        f"Please check that the `triton_metrics_url` value is set correctly: {self._config.triton_metrics_url}."
                    )

        self._cpu_monitor = CPUMonitor(
            self._server, self._config.monitoring_interval, self._cpu_metrics
        )
        self._cpu_monitor.start_recording_metrics()

    def _stop_monitors(self, capture_gpu_metrics=True):
        """
        Stop any metrics monitors, when we don't need
        to collect the result
        """

        # Stop DCGM Monitor only if there are GPUs available
        if capture_gpu_metrics:
            self._gpu_monitor.stop_recording_metrics()
        self._cpu_monitor.stop_recording_metrics()

    def _destroy_monitors(self, capture_gpu_metrics=True):
        """
        Destroy the monitors created by start
        """

        if capture_gpu_metrics:
            if self._gpu_monitor:
                self._gpu_monitor.destroy()
        if self._cpu_monitor:
            self._cpu_monitor.destroy()
        self._gpu_monitor = None
        self._cpu_monitor = None

    def _run_perf_analyzer(
        self, run_config: RunConfig, perf_output_writer: Optional[FileWriter]
    ) -> Tuple[Optional[Dict], Optional[Dict[int, List[Record]]]]:
        """
        Runs perf_analyzer and returns the aggregated metrics

        Parameters
        ----------
        run_config : RunConfig
            The RunConfig to execute on perf analyzer

        perf_output_writer : FileWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Raises
        ------
        TritonModelAnalyzerException
        """

        perf_analyzer_env = run_config.triton_environment()

        # IF running with C_API, need to set CUDA_VISIBLE_DEVICES here
        if self._config.triton_launch_mode == "c_api":
            perf_analyzer_env["CUDA_VISIBLE_DEVICES"] = ",".join(
                [gpu.device_uuid() for gpu in self._gpus]
            )

        perf_analyzer = PerfAnalyzer(
            path=self._config.perf_analyzer_path,
            config=run_config,
            max_retries=self._config.perf_analyzer_max_auto_adjusts,
            timeout=self._config.perf_analyzer_timeout,
            max_cpu_util=self._config.perf_analyzer_cpu_util,
            model_type=self._config.model_type,
        )

        metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics
        status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

        self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)

        if status == 1:
            self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
            return (None, None)

        perf_records = perf_analyzer.get_perf_records()

        if self._config.model_type == "LLM":
            perf_records[run_config.models_name()].extend(
                perf_analyzer.get_llm_records()[run_config.models_name()]
            )

        gpu_records = perf_analyzer.get_gpu_records()

        aggregated_perf_records = self._aggregate_perf_records(perf_records)
        aggregated_gpu_records = self._aggregate_gpu_records(gpu_records)

        return aggregated_perf_records, aggregated_gpu_records

    def _write_perf_analyzer_output(
        self, perf_output_writer: Optional[FileWriter], perf_analyzer: PerfAnalyzer
    ) -> None:
        if perf_output_writer:
            perf_output_writer.write(
                "============== Perf Analyzer Launched ==============\n"
                f"Command: {perf_analyzer.get_cmd()}\n\n",
                append=True,
            )
            if perf_analyzer.output():
                perf_output_writer.write(perf_analyzer.output() + "\n", append=True)

    def _handle_unsuccessful_perf_analyzer_run(
        self, perf_analyzer: PerfAnalyzer
    ) -> None:
        output_file = f"{self._config.export_path}/{PA_ERROR_LOG_FILENAME}"

        if not self._encountered_perf_analyzer_error:
            self._encountered_perf_analyzer_error = True
            if os.path.exists(output_file):
                os.remove(output_file)

        perf_error_log = FileWriter(output_file)
        perf_error_log.write(
            "Command: \n" + perf_analyzer.get_cmd() + "\n\n", append=True
        )

        if perf_analyzer.output():
            perf_error_log.write(
                "Error: \n" + perf_analyzer.output() + "\n", append=True
            )
        else:
            perf_error_log.write(
                "Error: "
                + "perf_analyzer did not produce any output. It was likely terminated with a SIGABRT."
                + "\n\n",
                append=True,
            )

    def _aggregate_perf_records(self, perf_records):
        per_model_perf_records = {}
        for model, records in perf_records.items():
            perf_record_aggregator = RecordAggregator()
            perf_record_aggregator.insert_all(records)

            per_model_perf_records[model] = perf_record_aggregator.aggregate()
        return per_model_perf_records

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific
        Returns
        -------
        dict
            keys are gpu ids and values are metric values
            in the order specified in self._gpu_metrics
        """

        # Stop and destroy DCGM monitor
        gpu_records = self._gpu_monitor.stop_recording_metrics()
        gpu_metrics = self._aggregate_gpu_records(gpu_records)
        return gpu_metrics

    def _aggregate_gpu_records(self, gpu_records):
        # Insert all records into aggregator and get aggregated DCGM records
        gpu_record_aggregator = RecordAggregator()
        gpu_record_aggregator.insert_all(gpu_records)

        records_groupby_gpu = {}
        records_groupby_gpu = gpu_record_aggregator.groupby(
            self._gpu_metrics, lambda record: record.device_uuid()
        )

        gpu_metrics = defaultdict(list)
        for _, metric in records_groupby_gpu.items():
            for gpu_uuid, metric_value in metric.items():
                gpu_metrics[gpu_uuid].append(metric_value)
        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated
        like the CPU metrics
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

    def _check_triton_and_model_analyzer_gpus(self):
        """
        Check whether Triton Server and Model Analyzer are using the same GPUs
        Raises
        ------
        TritonModelAnalyzerException
            If they are using different GPUs this exception will be raised.
        """

        if (
            self._config.triton_launch_mode != "remote"
            and self._config.triton_launch_mode != "c_api"
        ):
            self._client.wait_for_server_ready(
                num_retries=self._config.client_max_retries,
                log_file=self._server.log_file(),
            )

            model_analyzer_gpus = [gpu.device_uuid() for gpu in self._gpus]
            triton_gpus = self._get_triton_metrics_gpus()
            if set(model_analyzer_gpus) != set(triton_gpus):
                raise TritonModelAnalyzerException(
                    "'Triton Server is not using the same GPUs as Model Analyzer: '"
                    f"Model Analyzer GPUs {model_analyzer_gpus}, Triton GPUs {triton_gpus}"
                )

    def _get_triton_metrics_gpus(self):
        """
        Uses prometheus to request a list of GPU UUIDs corresponding to the GPUs
        visible to Triton Inference Server
        Parameters
        ----------
        config : namespace
            The arguments passed into the CLI
        """

        triton_prom_str = str(
            requests.get(self._config.triton_metrics_url, timeout=10).content,
            encoding="ascii",
        )
        metrics = text_string_to_metric_families(triton_prom_str)

        triton_gpus = []
        for metric in metrics:
            if metric.name == "nv_gpu_utilization":
                for sample in metric.samples:
                    triton_gpus.append(sample.labels["gpu_uuid"])

        return triton_gpus

    def _print_run_config_info(self, run_config):
        for model_run_config in run_config.model_run_configs():
            perf_config = model_run_config.perf_config()
            if perf_config["request-rate-range"]:
                if perf_config["batch-size"] != 1:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
                    )
                else:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: request-rate-range={perf_config['request-rate-range']}"
                    )
            else:
                if perf_config["batch-size"] != 1:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, concurrency={perf_config['concurrency-range']}"
                    )
                else:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: concurrency={perf_config['concurrency-range']}"
                    )

        # Vertical spacing when running multiple models at a time
        if len(run_config.model_run_configs()) > 1:
            logger.info("")

        cpu_only = run_config.cpu_only()

        # Inform user CPU metric(s) are not being collected under CPU mode
        collect_cpu_metrics_expect = cpu_only
        collect_cpu_metrics_actual = len(self._cpu_metrics) > 0
        if collect_cpu_metrics_expect and not collect_cpu_metrics_actual:
            if not self._cpu_warning_printed:
                self._cpu_warning_printed = True
                logger.warning(
                    "One or more models are running on the CPU, but CPU metric(s) are not being collected"
                )
        # Warn user about CPU monitor performance issue
        if collect_cpu_metrics_actual:
            if not self._cpu_warning_printed:
                self._cpu_warning_printed = True
                logger.warning(
                    "CPU metrics are being collected. This can affect the latency or throughput numbers reported by perf analyzer."
                )

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.
        Returns
        -------
        List
            of record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]

    @staticmethod
    def is_gpu_metric(tag):
        """
        Returns
        ------
        True if the given tag is a supported gpu metric
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in DCGMMonitor.model_analyzer_to_dcgm_field

    @staticmethod
    def is_perf_analyzer_metric(tag):
        """
        Returns
        ------
        True if the given tag is a supported perf_analyzer metric
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in PerfAnalyzer.get_perf_metrics()

    @staticmethod
    def is_llm_metric(tag):
        """
        Returns
        ------
        True if the given tag is a supported perf_analyzer metric
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in PerfAnalyzer.get_llm_metrics()

    @staticmethod
    def is_cpu_metric(tag):
        """
        Returns
        ------
        True if the given tag is a supported cpu metric
        False otherwise
        """

        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in CPUMonitor.cpu_metrics
```