triton-model-analyzer 1.48.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
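The remainder of this diff shows the contents of the new file model_analyzer/perf_analyzer/perf_analyzer.py (+882 lines):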
@@ -0,0 +1,882 @@
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import glob
+import logging
+import os
+import re
+import signal
+import tempfile
+from csv import DictReader
+from subprocess import STDOUT, Popen
+from typing import Dict, List, Optional
+
+import psutil
+
+from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE
+from model_analyzer.constants import (
+    GENAI_PERF_COLLATERAL,
+    GENAI_PERF_CSV,
+    INTERVAL_SLEEP_TIME,
+    LOGGER_NAME,
+    MEASUREMENT_REQUEST_COUNT_STEP,
+    MEASUREMENT_WINDOW_STEP,
+    PERF_ANALYZER_MEASUREMENT_WINDOW,
+    PERF_ANALYZER_MINIMUM_REQUEST_COUNT,
+)
+from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
+from model_analyzer.record.record import Record
+from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
+from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
+from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
+from model_analyzer.record.types.gpu_utilization import GPUUtilization
+from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg
+from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax
+from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin
+from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25
+from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50
+from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75
+from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90
+from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95
+from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99
+from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput
+from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait
+from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv
+from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg
+from model_analyzer.record.types.perf_latency_p90 import PerfLatencyP90
+from model_analyzer.record.types.perf_latency_p95 import PerfLatencyP95
+from model_analyzer.record.types.perf_latency_p99 import PerfLatencyP99
+from model_analyzer.record.types.perf_server_compute_infer import PerfServerComputeInfer
+from model_analyzer.record.types.perf_server_compute_input import PerfServerComputeInput
+from model_analyzer.record.types.perf_server_compute_output import (
+    PerfServerComputeOutput,
+)
+from model_analyzer.record.types.perf_server_queue import PerfServerQueue
+from model_analyzer.record.types.perf_throughput import PerfThroughput
+from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg
+from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax
+from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin
+from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25
+from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50
+from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75
+from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90
+from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95
+from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99
+
+logger = logging.getLogger(LOGGER_NAME)
+
+
+class PerfAnalyzer:
+    """
+    This class provides an interface for running workloads
+    with perf_analyzer.
+    """
+
+    GPU_METRIC_UUID = 0
+    GPU_METRIC_VALUE = 1
+
+    PA_SUCCESS, PA_FAIL, PA_RETRY = 0, 1, 2
+
+    METRIC_TAG, CSV_STRING, RECORD_CLASS, REDUCTION_FACTOR = 0, 1, 2, 3
+    perf_metric_table = [
+        ["perf_latency_avg", "Avg latency", PerfLatencyAvg, "1000"],
+        ["perf_latency_p90", "p90 latency", PerfLatencyP90, "1000"],
+        ["perf_latency_p95", "p95 latency", PerfLatencyP95, "1000"],
+        ["perf_latency_p99", "p99 latency", PerfLatencyP99, "1000"],
+        ["perf_throughput", "Inferences/Second", PerfThroughput, "1"],
+        ["perf_client_send_recv", "request/response", PerfClientSendRecv, "1000"],
+        ["perf_client_send_recv", "send/recv", PerfClientSendRecv, "1000"],
+        ["perf_client_response_wait", "response wait", PerfClientResponseWait, "1000"],
+        ["perf_server_queue", "Server Queue", PerfServerQueue, "1000"],
+        [
+            "perf_server_compute_infer",
+            "Server Compute Infer",
+            PerfServerComputeInfer,
+            "1000",
+        ],
+        [
+            "perf_server_compute_input",
+            "Server Compute Input",
+            PerfServerComputeInput,
+            "1000",
+        ],
+        [
+            "perf_server_compute_output",
+            "Server Compute Output",
+            PerfServerComputeOutput,
+            "1000",
+        ],
+    ]
+
+    gpu_metric_table = [
+        ["gpu_utilization", "Avg GPU Utilization", GPUUtilization, "0.01"],
+        ["gpu_power_usage", "Avg GPU Power Usage", GPUPowerUsage, "1"],
+        ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
+        ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"],
+    ]
+
+    llm_metric_table = [
+        [
+            "time_to_first_token_avg",
+            "Time To First Token (ns) avg",
+            TimeToFirstTokenAvg,
+            "1000",
+        ],
+        [
+            "time_to_first_token_min",
+            "Time To First Token (ns) min",
+            TimeToFirstTokenMin,
+            "1000",
+        ],
+        [
+            "time_to_first_token_max",
+            "Time To First Token (ns) max",
+            TimeToFirstTokenMax,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p99",
+            "Time To First Token (ns) p99",
+            TimeToFirstTokenP99,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p95",
+            "Time To First Token (ns) p95",
+            TimeToFirstTokenP95,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p90",
+            "Time To First Token (ns) p90",
+            TimeToFirstTokenP90,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p75",
+            "Time To First Token (ns) p75",
+            TimeToFirstTokenP75,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p50",
+            "Time To First Token (ns) p50",
+            TimeToFirstTokenP50,
+            "1000",
+        ],
+        [
+            "time_to_first_token_p25",
+            "Time To First Token (ns) p25",
+            TimeToFirstTokenP25,
+            "1000",
+        ],
+        [
+            "inter_token_latency_avg",
+            "Inter Token Latency (ns) avg",
+            InterTokenLatencyAvg,
+            "1000",
+        ],
+        [
+            "inter_token_latency_min",
+            "Inter Token Latency (ns) min",
+            InterTokenLatencyMin,
+            "1000",
+        ],
+        [
+            "inter_token_latency_max",
+            "Inter Token Latency (ns) max",
+            InterTokenLatencyMax,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p99",
+            "Inter Token Latency (ns) p99",
+            InterTokenLatencyP99,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p95",
+            "Inter Token Latency (ns) p95",
+            InterTokenLatencyP95,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p90",
+            "Inter Token Latency (ns) p90",
+            InterTokenLatencyP90,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p75",
+            "Inter Token Latency (ns) p75",
+            InterTokenLatencyP75,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p50",
+            "Inter Token Latency (ns) p50",
+            InterTokenLatencyP50,
+            "1000",
+        ],
+        [
+            "inter_token_latency_p25",
+            "Inter Token Latency (ns) p25",
+            InterTokenLatencyP25,
+            "1000",
+        ],
+        [
+            "output_token_throughput",
+            "Output Token Throughput (per sec) avg",
+            OutputTokenThroughput,
+            "1",
+        ],
+    ]
+
+    @staticmethod
+    def get_perf_metrics():
+        perf_metrics = [
+            perf_metric[PerfAnalyzer.RECORD_CLASS]
+            for perf_metric in PerfAnalyzer.perf_metric_table
+        ]
+        return perf_metrics
+
+    @staticmethod
+    def get_gpu_metrics():
+        gpu_metrics = [
+            gpu_metric[PerfAnalyzer.RECORD_CLASS]
+            for gpu_metric in PerfAnalyzer.gpu_metric_table
+        ]
+        return gpu_metrics
+
+    @staticmethod
+    def get_llm_metrics():
+        llm_metrics = [
+            llm_metric[PerfAnalyzer.RECORD_CLASS]
+            for llm_metric in PerfAnalyzer.llm_metric_table
+        ]
+        return llm_metrics
+
+    def __init__(
+        self,
+        path,
+        config,
+        max_retries,
+        timeout,
+        max_cpu_util,
+        model_type=DEFAULT_MODEL_TYPE,
+    ):
+        """
+        Parameters
+        ----------
+        path : full path to the perf_analyzer
+                executable
+        config : RunConfig
+            The RunConfig with information on what to execute
+        max_retries: int
+            Maximum number of times perf_analyzer adjusts parameters
+            in an attempt to profile a model.
+        timeout : int
+            Maximum number of seconds that perf_analyzer
+            will wait until the execution is complete.
+        max_cpu_util : float
+            Maximum CPU utilization allowed for perf_analyzer
+        """
+
+        self.bin_path = path
+        self._config = config
+        self._max_retries = max_retries
+        self._timeout = timeout
+        self._output = ""
+        self._perf_records = {}
+        self._llm_records = {}
+        self._gpu_records = []
+        self._max_cpu_util = max_cpu_util
+        self._model_type = model_type
+
+    def run(self, metrics, env=None):
+        """
+        Runs the perf analyzer with the
+        initialized configuration
+        Parameters
+        ----------
+        metrics : List of Record types
+            The list of record types to parse from
+            Perf Analyzer
+
+        env: dict
+            Environment variables to set for perf_analyzer run
+
+        Returns
+        -------
+        Dict
+            Dict of Model to List of Records obtained from this
+            run of perf_analyzer
+
+        Raises
+        ------
+        TritonModelAnalyzerException
+            If subprocess throws CalledProcessError
+        """
+
+        if metrics:
+            # Synchronously start and finish run
+            for _ in range(self._max_retries):
+                status = self._execute_pa(env)
+
+                if status == self.PA_FAIL:
+                    return status
+                elif status == self.PA_SUCCESS:
+                    self._parse_outputs(metrics)
+                    break
+                elif status == self.PA_RETRY:
+                    continue
+                else:
+                    raise TritonModelAnalyzerException(f"Unexpected PA return {status}")
+
+            else:
+                logger.info(
+                    f"Ran perf_analyzer {self._max_retries} times, "
+                    "but no valid requests recorded"
+                )
+                return self.PA_FAIL
+
+        return self.PA_SUCCESS
+
+    def get_perf_records(self):
+        """
+        Returns
+        -------
+        The perf records from the last perf_analyzer run
+        """
+
+        if self._perf_records:
+            return self._perf_records
+        raise TritonModelAnalyzerException(
+            "Attempted to get perf_analyzer results without calling run first."
+        )
+
+    def get_llm_records(self):
+        """
+        Returns
+        -------
+        The LLM records from the last perf_analyzer run
+        """
+
+        if self._llm_records:
+            return self._llm_records
+        raise TritonModelAnalyzerException(
+            "Attempted to get perf_analyzer results without calling run first."
+        )
+
+    def get_gpu_records(self):
+        """
+        Returns
+        -------
+        The gpu records from the last perf_analyzer run
+        """
+
+        return self._gpu_records
+
+    def output(self):
+        """
+        Returns
+        -------
+        The stdout output of the
+        last perf_analyzer run
+        """
+
+        if not self._output:
+            logger.info("perf_analyzer did not produce any output.")
+        return self._output
+
+    def get_cmd(self):
+        """
+        Returns a string of the command to run
+        """
+        return " ".join(self._get_cmd())
+
+    def _execute_pa(self, env):
+        cmd = self._get_cmd()
+        logger.debug(f"Running {cmd}")
+        perf_analyzer_env = self._create_env(env)
+
+        process = self._create_process(cmd, perf_analyzer_env)
+        status = self._resolve_process(process)
+
+        return status
+
+    def _get_cmd(self):
+        if self._is_multi_model():
+            cmd = ["mpiexec", "--allow-run-as-root", "--tag-output"]
+            for index in range(len(self._config.model_run_configs())):
+                if index:
+                    cmd += [":"]
+                cmd += ["-n", "1"]
+                cmd += self._get_single_model_cmd(index)
+        else:
+            cmd = self._get_single_model_cmd(0)
+        return cmd
+
+    def _get_single_model_cmd(self, index):
+        if self._model_type == "LLM":
+            cmd = ["genai-perf", "-m", self._config.models_name()]
+            cmd += self._get_genai_perf_cli_command(index).replace("=", " ").split()
+            cmd += ["--"]
+            cmd += (
+                self._get_pa_cli_command(index, exclude_model_name=True)
+                .replace("=", " ")
+                .split()
+            )
+        else:
+            cmd = [self.bin_path]
+            if self._is_multi_model():
+                cmd += ["--enable-mpi"]
+            cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
+        return cmd
+
+    def _get_pa_cli_command(self, index, exclude_model_name=False):
+        return (
+            self._config.model_run_configs()[index]
+            .perf_config()
+            .to_cli_string(exclude_model_name)
+        )
+
+    def _get_genai_perf_cli_command(self, index):
+        return self._config.genai_perf_config().to_cli_string()
+
+    def _create_env(self, env):
+        perf_analyzer_env = os.environ.copy()
+
+        if env:
+            # Filter env variables that use env lookups
+            for variable, value in env.items():
+                if value.find("$") == -1:
+                    perf_analyzer_env[variable] = value
+                else:
+                    # Collect the ones that need lookups to give to the shell
+                    perf_analyzer_env[variable] = os.path.expandvars(value)
+
+        return perf_analyzer_env
+
+    def _create_process(self, cmd, perf_analyzer_env):
+        self._cmd_log = tempfile.NamedTemporaryFile()
+        try:
+            process = Popen(
+                cmd,
+                start_new_session=True,
+                stdout=self._cmd_log,
+                stderr=STDOUT,
+                encoding="utf-8",
+                env=perf_analyzer_env,
+            )
+        except FileNotFoundError as e:
+            raise TritonModelAnalyzerException(f"perf_analyzer binary not found : {e}")
+        return process
+
+    def _verify_output_files_exist(self):
+        """
+        Verify that perf_analyzer created the expected output files.
+        Waits briefly to handle filesystem buffering delays.
+        Returns True if all expected files exist, False otherwise.
+        """
+        import time
+
+        max_wait_time = 2.0  # seconds
+        wait_interval = 0.1  # seconds
+        max_attempts = int(max_wait_time / wait_interval)
+
+        for perf_config in [
+            mrc.perf_config() for mrc in self._config.model_run_configs()
+        ]:
+            latency_file = perf_config["latency-report-file"]
+
+            file_found = False
+            for attempt in range(max_attempts):
+                if os.path.isfile(latency_file):
+                    file_found = True
+                    break
+                if attempt < max_attempts - 1:  # Don't sleep on last attempt
+                    time.sleep(wait_interval)
+
+            if not file_found:
+                logger.error(f"Expected output file not found: {latency_file}")
+                return False
+
+        return True
+
+    def _resolve_process(self, process):
+        if self._poll_perf_analyzer(process) == 1:
+            return self.PA_FAIL
+
+        if process.returncode > 0:
+            if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                return self.PA_FAIL
+            else:
+                return self.PA_RETRY
+        elif process.returncode < 0:
+            logger.error(
+                "perf_analyzer was terminated by signal: "
+                f"{signal.Signals(abs(process.returncode)).name}"
+            )
+            return self.PA_FAIL
+
+        if not self._verify_output_files_exist():
+            logger.error(
+                "perf_analyzer returned success but did not create expected output files"
+            )
+            logger.error("perf_analyzer output:")
+            if self._output:
+                logger.error(self._output)
+            else:
+                logger.error("(no output captured)")
+            # Check if this is due to measurement window being too small
+            if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                return self.PA_FAIL
+            else:
+                return self.PA_RETRY
+
+        return self.PA_SUCCESS
+
+    def _poll_perf_analyzer(self, process):
+        """
+        Periodically poll the perf analyzer to get output
+        or see if it is taking too much time or CPU resources
+        """
+
+        current_timeout = self._timeout
+        process_util = psutil.Process(process.pid)
+
+        while current_timeout > 0:
+            if process.poll() is not None:
+                self._output = self._get_process_output()
+                break
+
+            # perf_analyzer using too much CPU?
+            cpu_util = process_util.cpu_percent(INTERVAL_SLEEP_TIME)
+            if cpu_util > self._max_cpu_util:
+                logger.info(
+                    f"perf_analyzer used significant amount of CPU resources ({cpu_util}%), killing perf_analyzer"
+                )
+                self._output = self._get_process_output()
+                process.kill()
+
+                return self.PA_FAIL
+
+            current_timeout -= INTERVAL_SLEEP_TIME
+        else:
+            logger.info("perf_analyzer took very long to exit, killing perf_analyzer")
+            process.kill()
+
+            return self.PA_FAIL
+
+        return self.PA_SUCCESS
+
+    def _get_process_output(self):
+        self._cmd_log.seek(0)
+        tmp_output = self._cmd_log.read()
+        self._cmd_log.close()
+
+        # PA has occasionally output non-UTF-8 bytes which would cause MA
+        # to assert. In that case, just ignore the result instead of asserting
+        result = ""
+        try:
+            result = tmp_output.decode("utf-8")
+        except Exception:
+            # Ignore the result on decode failed
+            pass
+
+        return result
+
+    def _auto_adjust_parameters(self, process):
+        """
+        Attempt to update PA parameters based on the output
+        """
+        logger.debug(
+            f"_auto_adjust_parameters called. returncode={process.returncode}, output_length={len(self._output)}, has_failed_msg={'Failed to obtain stable measurement' in self._output}, has_larger_window_msg={'Please use a larger time window' in self._output}"
+        )
+        if (
+            self._output.find("Failed to obtain stable measurement") != -1
+            or self._output.find("Please use a larger time window") != -1
+        ):
+            logger.debug("Found error message, will adjust parameters")
+            per_rank_logs = self._split_output_per_rank()
+
+            for index, log in enumerate(per_rank_logs):
+                perf_config = self._config.model_run_configs()[index].perf_config()
+                self._auto_adjust_parameters_for_perf_config(perf_config, log)
+
+            return self.PA_SUCCESS
+        else:
+            clamped_output = self._output[:1000]
+            logger.info(
+                f"Running perf_analyzer failed with"
+                f" exit status {process.returncode}:\n{clamped_output}"
+            )
+            return self.PA_FAIL
+
+    def _auto_adjust_parameters_for_perf_config(self, perf_config, log):
+        if (
+            log.find("Failed to obtain stable measurement") != -1
+            or log.find("Please use a larger time window") != -1
+        ):
+            logger.debug(
+                f"Found measurement error in log, will adjust parameters. measurement-mode={perf_config['measurement-mode']}, current measurement-interval={perf_config['measurement-interval']}"
+            )
+            if perf_config["measurement-mode"] == "time_windows":
+                if perf_config["measurement-interval"] is None:
+                    perf_config["measurement-interval"] = (
+                        PERF_ANALYZER_MEASUREMENT_WINDOW + MEASUREMENT_WINDOW_STEP
+                    )
+                else:
+                    perf_config["measurement-interval"] = (
+                        int(perf_config["measurement-interval"])
+                        + MEASUREMENT_WINDOW_STEP
+                    )
+
+                logger.info(
+                    "perf_analyzer's measurement window is too small, "
+                    f"increased to {perf_config['measurement-interval']} ms."
+                )
+            elif (
+                perf_config["measurement-mode"] is None
+                or perf_config["measurement-mode"] == "count_windows"
+            ):
+                if perf_config["measurement-request-count"] is None:
+                    perf_config["measurement-request-count"] = (
+                        PERF_ANALYZER_MINIMUM_REQUEST_COUNT
+                        + MEASUREMENT_REQUEST_COUNT_STEP
+                    )
+                else:
+                    perf_config["measurement-request-count"] = (
+                        int(perf_config["measurement-request-count"])
+                        + MEASUREMENT_REQUEST_COUNT_STEP
+                    )
+
+                logger.info(
+                    "perf_analyzer's request count is too small, "
+                    f"increased to {perf_config['measurement-request-count']}."
+                )
+
+    def _split_output_per_rank(self):
+        if self._is_multi_model():
+            outputs = ["" for mrc in self._config.model_run_configs()]
+            for line in self._output.splitlines():
+                # Example would find the '2': [1,2]<stdout>: fake output ***
+                rank = re.search(r"^\[\d+,(\d+)\]", line)
+
+                if rank:
+                    index = int(rank.group(1))
+                    outputs[index] += line + "\n"
+            return outputs
+        else:
+            return [self._output]
+
+    def _is_multi_model(self):
+        """
+        Returns true if the RunConfig provided to this class contains multiple perf_configs. Else False
+        """
+        return len(self._config.model_run_configs()) > 1
+
+    def _parse_outputs(self, metrics):
+        self._parse_generic_outputs(metrics)
+
+        if self._model_type == "LLM":
+            self._parse_llm_outputs(metrics)
+
+    def _parse_generic_outputs(self, metrics):
+        """
+        Extract records from the Perf Analyzer run for each model
+        """
+
+        for perf_config in [
+            mrc.perf_config() for mrc in self._config.model_run_configs()
+        ]:
+            latency_file = perf_config["latency-report-file"]
+            logger.debug(f"Reading PA results from {latency_file}")
+
+            with open(latency_file, mode="r") as f:
+                csv_reader = csv.DictReader(f, delimiter=",")
+
+                for row in csv_reader:
+                    self._perf_records[
+                        perf_config["model-name"]
+                    ] = self._extract_perf_records_from_row(metrics, row)
+                    self._gpu_records = self._extract_gpu_records_from_row(metrics, row)
+
+        for perf_config in [
+            mrc.perf_config() for mrc in self._config.model_run_configs()
+        ]:
+            # Remove the latency file and all associated composing model latency files
+            for f in glob.glob(f"*{perf_config['latency-report-file']}"):
+                os.remove(f)
+
+    def _parse_llm_outputs(self, metrics):
+        """
+        Extract records from the Perf Analyzer run for each model
+        """
+
+        perf_config = self._config.model_run_configs()[0].perf_config()
+
+        logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}")
+        with open(GENAI_PERF_CSV, mode="r") as f:
+            csv_reader = list(csv.DictReader(f, delimiter=","))
+
+        # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
+        self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
+            metrics, csv_reader
+        )
+
+        os.remove(GENAI_PERF_CSV)
+        for filename in GENAI_PERF_COLLATERAL:
+            os.remove(filename)
+
+    def _extract_perf_records_from_row(
+        self, requested_metrics: List[Record], row_metrics: Dict[str, str]
+    ) -> List[Record]:
+        perf_records: List[Record] = []
+        for perf_metric in PerfAnalyzer.perf_metric_table:
+            if self._is_metric_requested_and_in_row(
+                perf_metric, requested_metrics, row_metrics
+            ):
+                value = float(row_metrics[str(perf_metric[PerfAnalyzer.CSV_STRING])])
+                reduction_factor = float(
+                    str(perf_metric[PerfAnalyzer.REDUCTION_FACTOR])
+                )
+                perf_value = value / reduction_factor
+
+                perf_records.append(
+                    perf_metric[PerfAnalyzer.RECORD_CLASS](perf_value)  # type: ignore
+                )
+
+        return perf_records
+
+    def _extract_gpu_records_from_row(
+        self, requested_metrics: List[Record], row_metrics: Dict[str, str]
+    ) -> List[Record]:
+        # GPU metrics have the following format: UUID0:value0;UUID1:value1;...
+        gpu_records: List[Record] = []
+        for gpu_metric in PerfAnalyzer.gpu_metric_table:
+            if self._is_metric_requested_and_in_row(
+                gpu_metric, requested_metrics, row_metrics
+            ):
+                gpu_metric_string = row_metrics[
+                    str(gpu_metric[PerfAnalyzer.CSV_STRING])
+                ]
+
+                # Covers the case where PA didn't provide data
+                if not gpu_metric_string:
+                    continue
+
+                # Needed because PA might terminate substring with a ;
+                if gpu_metric_string and gpu_metric_string[-1] == ";":
+                    gpu_metric_string = gpu_metric_string[:-1]
+
+                gpu_metric_string_tuples = gpu_metric_string.split(";")
+
+                for gpu_metric_string_tuple in gpu_metric_string_tuples:
+                    gpu_metric_tuple = gpu_metric_string_tuple.split(":")
+
+                    uuid = gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_UUID]
+                    tmp_value = float(gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE])
+                    reduction_factor = float(
+                        str(gpu_metric[PerfAnalyzer.REDUCTION_FACTOR])
+                    )
+                    value = tmp_value / reduction_factor
+
+                    record = gpu_metric[PerfAnalyzer.RECORD_CLASS](
+                        value=value, device_uuid=uuid
+                    )  # type: ignore
+
+                    gpu_records.append(record)
+
+        self._cleanup_gpu_records(gpu_records)
+        return gpu_records
+
+    def _extract_llm_records(
+        self, requested_metrics: List[Record], csv_reader: DictReader
+    ) -> List[Record]:
+        llm_records: List[Record] = []
+
+        for requested_metric in requested_metrics:
+            new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
+            if new_llm_record:
+                llm_records.append(new_llm_record)
+
+        return llm_records
+
+    def _get_llm_record_from_csv(
+        self, requested_metric: Record, csv_reader: DictReader
+    ) -> Optional[Record]:
+        for row in csv_reader:
+            for key, value in row.items():
+                metric_string = f"{row['Metric']} {key}"
+                llm_metric = self._find_corresponding_llm_metric_row(metric_string)
+
+                if (
+                    llm_metric
+                    and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag
+                ):
+                    adjusted_value = float(value) / float(
+                        llm_metric[PerfAnalyzer.REDUCTION_FACTOR]
+                    )
+
+                    llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value)  # type: ignore
+                    return llm_record
+
+        return None
+
+    def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]:
+        for row in PerfAnalyzer.llm_metric_table:
+            if metric_string == row[PerfAnalyzer.CSV_STRING]:
+                return row
+
+        return None
+
+    def _cleanup_gpu_records(self, gpu_records):
+        # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory
+        # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory
+        indexes_to_remove = []
+        for i, record in enumerate(gpu_records):
+            if type(record) == GPUFreeMemory:
+                # Find matching UUID UsedMemory
+                found = False
+                for other_record in gpu_records:
+                    if (
+                        type(other_record) == GPUUsedMemory
+                        and record.device_uuid() == other_record.device_uuid()
+                    ):
+                        found = True
+                        record._value = record.value() - other_record.value()
+                        break
+                if not found:
+                    indexes_to_remove.append(i)
+        for i in reversed(indexes_to_remove):
+            del gpu_records[i]
+
+    def _is_metric_requested_and_in_row(
+        self,
+        metric: List[object],
+        requested_metrics: List[Record],
+        row_metrics: Dict[str, str],
+    ) -> bool:
+        tag_match = any(
+            metric[PerfAnalyzer.METRIC_TAG] in requested_metric.tag
+            for requested_metric in requested_metrics
+        )
+
+        return tag_match and metric[PerfAnalyzer.CSV_STRING] in row_metrics