triton_model_analyzer-1.48.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
model_analyzer/result/run_config_measurement.py
@@ -0,0 +1,658 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from copy import deepcopy
from functools import total_ordering
from typing import Any, Dict, List, Optional

from model_analyzer.constants import COMPARISON_SCORE_THRESHOLD, LOGGER_NAME
from model_analyzer.record.record import Record, RecordType
from model_analyzer.result.constraint_manager import ConstraintManager
from model_analyzer.result.model_config_measurement import ModelConfigMeasurement

logger = logging.getLogger(LOGGER_NAME)


@total_ordering
class RunConfigMeasurement:
    """
    Encapsulates the set of metrics obtained from all model configs
    in a single RunConfig
    """

    def __init__(
        self, model_variants_name: Optional[str], gpu_data: Dict[int, List[Record]]
    ):
        """
        model_variants_name: str
            Name of the model variants this measurement was collected for

        gpu_data : dict of list of Records
            Metrics from the monitors that have a GPU UUID
            associated with them
        """
        self._model_variants_name = model_variants_name

        self._gpu_data = gpu_data
        self._avg_gpu_data = self._average_list(list(self._gpu_data.values()))
        self._avg_gpu_data_from_tag = self._get_avg_gpu_data_from_tag()

        self._model_config_measurements: List[ModelConfigMeasurement] = []
        self._model_config_weights: List[float] = []
        self._constraint_manager: Optional[ConstraintManager] = None

    def to_dict(self):
        rcm_dict = deepcopy(self.__dict__)
        del rcm_dict["_model_config_weights"]
        del rcm_dict["_constraint_manager"]

        return rcm_dict

    @classmethod
    def from_dict(cls, run_config_measurement_dict: Dict) -> "RunConfigMeasurement":
        run_config_measurement = RunConfigMeasurement(None, {})

        run_config_measurement._model_variants_name = run_config_measurement_dict[
            "_model_variants_name"
        ]

        run_config_measurement._gpu_data = cls._deserialize_gpu_data(
            run_config_measurement, run_config_measurement_dict["_gpu_data"]
        )

        run_config_measurement._avg_gpu_data = cls._average_list(
            run_config_measurement, list(run_config_measurement._gpu_data.values())
        )

        run_config_measurement._avg_gpu_data_from_tag = cls._get_avg_gpu_data_from_tag(
            run_config_measurement
        )

        run_config_measurement._model_config_measurements = (
            cls._deserialize_model_config_measurements(
                run_config_measurement,
                run_config_measurement_dict["_model_config_measurements"],
            )
        )

        return run_config_measurement

    def set_model_config_weighting(self, model_config_weights: List[int]) -> None:
        """
        Sets the model config weightings used when calculating
        weighted metrics

        Parameters
        ----------
        model_config_weights: list of ints
            Weights are the relative importance of the model_configs
            with respect to one another
        """
        self._model_config_weights = [
            model_config_weight / sum(model_config_weights)
            for model_config_weight in model_config_weights
        ]

    def set_constraint_manager(self, constraint_manager: ConstraintManager) -> None:
        """
        Parameters
        ----------
        constraint_manager: ConstraintManager object
            Used to determine if a ModelConfigMeasurement passes or fails
        """
        self._constraint_manager = constraint_manager

    def add_model_config_measurement(
        self,
        model_config_name: str,
        model_specific_pa_params: Dict[str, int],
        non_gpu_data: List[Record],
    ) -> None:
        """
        Adds a measurement from a single model config in this PA's run

        model_config_name : string
            The model config name that was used for this PA run
        model_specific_pa_params: dict
            Dictionary of PA parameters that change between models
            in a multi-model run
        non_gpu_data : list of Records
            Metrics that do not have a GPU UUID associated with them,
            from either CPU or PA
        """
        self._model_config_measurements.append(
            ModelConfigMeasurement(
                model_config_name, model_specific_pa_params, non_gpu_data
            )
        )

        # By default setting all models to have equal weighting
        self._model_config_weights.append(1)

    def set_metric_weightings(self, metric_objectives: List[Dict[str, int]]) -> None:
        """
        Sets the metric weighting for all non-GPU measurements

        Parameters
        ----------
        metric_objectives : list of dict of RecordTypes
            One entry per ModelConfig
        """
        for index, measurement in enumerate(self._model_config_measurements):
            measurement.set_metric_weighting(metric_objectives[index])

    def model_variants_name(self) -> Optional[str]:
        """
        Returns: str
            The name of the model variants this measurement was collected for
        """

        return self._model_variants_name

    def model_name(self) -> Optional[str]:
        """
        Returns
        -------
        str: Model name for this RunConfigMeasurement
        """

        return self._model_variants_name

    def data(self) -> Dict[str, List[Record]]:
        """
        Returns
        -------
        dict
            keys are model names and values are list of Records per model
            All the metric values in each model's measurement
            for both GPU and non-GPU
        """

        return {
            mcm.model_name(): self._avg_gpu_data + mcm.non_gpu_data()
            for mcm in self._model_config_measurements
        }

    def gpu_data(self) -> Dict[int, List[Record]]:
        """
        Returns
        -------
        Dict of List of Records
            GPU specific measurements
        """

        return self._gpu_data

    def non_gpu_data(self) -> List[List[Record]]:
        """
        Returns
        -------
        per model list of a list of Records
            The non-GPU specific measurements
        """

        return [
            model_config_measurement.non_gpu_data()
            for model_config_measurement in self._model_config_measurements
        ]

    def get_gpu_metric(self, tag: str) -> Optional[Record]:
        """
        Returns the average of Records associated with this GPU metric

        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular GPU metric

        Returns
        -------
        Record:
            of average GPU metric Records corresponding to this tag,
            or None if tag not found
        """
        if tag in self._avg_gpu_data_from_tag:
            return self._avg_gpu_data_from_tag[tag]
        else:
            logger.warning(
                f"No GPU metric corresponding to tag '{tag}' "
                "found in the model's measurement. Possibly comparing "
                "measurements across devices."
            )
            return None

    def get_non_gpu_metric(self, tag: str) -> List[Record]:
        """
        Returns the Records associated with this non-GPU metric

        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular metric

        Returns
        -------
        list:
            of per model list:
            of non-GPU metric Records, or None if tag not found
        """
        return [
            model_config_measurement.get_metric(tag)
            for model_config_measurement in self._model_config_measurements
        ]

    def get_weighted_non_gpu_metric(self, tag: str) -> List[Record]:
        """
        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular non-GPU metric

        Returns
        -------
        list:
            of per model list:
            of weighted non-GPU metric Records,
            or None if tag not found
        """
        assert len(self._model_config_weights) == len(self._model_config_measurements)

        return [
            model_config_measurement.get_metric(tag) * self._model_config_weights[index]
            for index, model_config_measurement in enumerate(
                self._model_config_measurements
            )
        ]

    def get_non_gpu_metric_value(self, tag: str, default_value: Any = 0) -> float:
        """
        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular non-GPU metric
        default_value : any
            Value to return if tag is not found

        Returns
        -------
        float
            Computation of the values of the non-GPU metric Records
            corresponding to the tag, or default_value if tag not found,
            based on the supplied aggregation function (usually mean or sum).
        """
        return RecordType.get_all_record_types()[tag].value_function()(
            [
                default_value if m is None else m.value()
                for m in self.get_non_gpu_metric(tag)
            ]
        )

    def get_gpu_metric_value(self, tag: str, default_value: Any = 0) -> float:
        """
        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular GPU metric
        default_value : any
            Value to return if tag is not found

        Returns
        -------
        float :
            Average of the values of the GPU metric Records
            corresponding to the tag, default_value if tag not found.
        """
        metric = self.get_gpu_metric(tag)
        if metric is None:
            return default_value
        else:
            return metric.value()

    def get_weighted_non_gpu_metric_value(
        self,
        tag: str,
    ) -> List[float]:
        """
        Parameters
        ----------
        tag : str
            A human readable tag that corresponds
            to a particular metric

        Returns
        -------
        list of floats
            Weighted average of the values of the metric Record corresponding
            to the tag
        """
        assert len(self._model_config_weights) == len(self._model_config_measurements)

        weighted_non_gpu_metrics = [
            metric.value() * self._model_config_weights[index]
            for index, metric in enumerate(self.get_non_gpu_metric(tag))
        ]

        return RecordType.get_all_record_types()[tag].value_function()(
            weighted_non_gpu_metrics
        )

    def gpus_used(self) -> List[int]:
        """
        Returns
        -------
        list of ints
            list of device IDs used in this measurement
        """

        return list(self._gpu_data.keys())

    def model_specific_pa_params(self) -> List[Dict[str, int]]:
        """
        Returns
        -------
        list:
            of dicts:
            of model specific PA parameters
            used in this measurement
        """

        return [
            model_config_measurement.model_specific_pa_params()
            for model_config_measurement in self._model_config_measurements
        ]

    def is_better_than(self, other: "RunConfigMeasurement") -> bool:
        """
        Checks whether a measurement is better than another
        by using the weighted average across all model configs in the
        RunConfig

        If True, this means this RunConfig measurement is better
        than the other
        """
        # seems like this should be == -1 but we're using a min heap
        return self._compare_measurements(other) == 1

    def __eq__(self, other: object) -> bool:
        """
        Check whether two sets of measurements are equivalent
        """
        if not isinstance(other, RunConfigMeasurement):
            return NotImplemented
        return self._compare_measurements(other) == 0

    def __lt__(self, other: "RunConfigMeasurement") -> bool:
        """
        Checks whether a measurement is better than another
        by using the weighted average across all model configs in the
        RunConfig

        This is used when sorting

        Returns
        -------
        bool:
            True if other is better than or equal to self
        """

        return not self.is_better_than(other)

    def is_passing_constraints(self) -> bool:
        """
        Returns true if all model measurements pass
        their respective constraints
        """

        assert self._constraint_manager is not None
        return self._constraint_manager.satisfies_constraints(self)

    def compare_measurements(self, other: "RunConfigMeasurement") -> float:
        """
        Compares two RunConfigMeasurements based on each
        ModelConfig's weighted metric objectives and the
        ModelConfig's weighted value within the RunConfigMeasurement

        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        float
            Positive value if other is better
            Negative value if self is better
            Zero if they are equal
        """
        # Step 1: for each ModelConfig determine the weighted score
        weighted_mcm_scores = self._calculate_weighted_mcm_score(other)

        # Step 2: combine these using the ModelConfig weighting
        weighted_rcm_score = self._calculate_weighted_rcm_score(weighted_mcm_scores)

        # Step 3: Reverse the polarity to match what is expected in the docstring return
        return -1 * weighted_rcm_score

    def calculate_weighted_percentage_gain(
        self, other: "RunConfigMeasurement"
    ) -> float:
        """
        Calculates the weighted percentage gain between
        two RunConfigMeasurements based on each
        ModelConfig's weighted metric objectives and the
        ModelConfig's weighted value within the RunConfigMeasurement

        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        float
            The weighted percentage gain. A positive value indicates
            this ModelConfig measurement is better than the other
        """
        # for each ModelConfig determine the weighted percentage gain
        weighted_mcm_pct = self._calculate_weighted_mcm_percentage_gain(other)

        # combine these using the ModelConfig weighting
        weighted_rcm_pct = self._calculate_weighted_rcm_score(weighted_mcm_pct)

        return weighted_rcm_pct

    def compare_constraints(self, other: "RunConfigMeasurement") -> Optional[float]:
        """
        Compares two RunConfigMeasurements based on how close
        each RCM is to passing their constraints

        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        float
            Positive value if other is closer to passing constraints
            Negative value if self is closer to passing constraints
            Zero if they are equally close to passing constraints
            None if either RCM is passing constraints
        """

        assert (
            self._constraint_manager is not None
            and other._constraint_manager is not None
        )

        if self.is_passing_constraints() or other.is_passing_constraints():
            return None

        self_failing_pct = self._constraint_manager.constraint_failure_percentage(self)
        other_failing_pct = other._constraint_manager.constraint_failure_percentage(
            other
        )

        return (self_failing_pct - other_failing_pct) / 100

    def _compare_measurements(self, other: "RunConfigMeasurement") -> int:
        """
        Compares two RunConfigMeasurements based on each
        ModelConfig's weighted metric objectives and the
        ModelConfig's weighted value within the RunConfigMeasurement

        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        int
            0
                if the results are determined
                to be the same within a threshold
            1
                if self > other (is better than)
            -1
                if self < other (is worse than)
        """

        # Step 1: for each ModelConfig determine the weighted score
        weighted_mcm_scores = self._calculate_weighted_mcm_score(other)

        # Step 2: combine these using the ModelConfig weighting
        weighted_rcm_score = self._calculate_weighted_rcm_score(weighted_mcm_scores)

        # Step 3: check the polarity
        if weighted_rcm_score > COMPARISON_SCORE_THRESHOLD:
            return 1
        elif weighted_rcm_score < -COMPARISON_SCORE_THRESHOLD:
            return -1
        return 0

    def _calculate_weighted_mcm_score(
        self, other: "RunConfigMeasurement"
    ) -> List[float]:
        """
        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        list of floats
            A weighted score for each ModelConfig measurement in the RunConfig
        """
        return [
            self_mcm.get_weighted_score(other_mcm)
            for self_mcm, other_mcm in zip(
                self._model_config_measurements, other._model_config_measurements
            )
        ]

    def _calculate_weighted_mcm_percentage_gain(
        self, other: "RunConfigMeasurement"
    ) -> List[float]:
        """
        Parameters
        ----------
        other: RunConfigMeasurement

        Returns
        -------
        list of floats
            A weighted percentage gain for each ModelConfig measurement in the RunConfig
        """
        return [
            self_mcm.calculate_weighted_percentage_gain(other_mcm)
            for self_mcm, other_mcm in zip(
                self._model_config_measurements, other._model_config_measurements
            )
        ]

    def _calculate_weighted_rcm_score(self, weighted_mcm_scores: List[float]) -> float:
        """
        Parameters
        ----------
        weighted_mcm_scores: list of floats
            A weighted score for each ModelConfig measurement in the RunConfig

        Returns
        -------
        float
            The weighted score. A positive value indicates this
            RunConfig measurement is better than the other
        """

        assert len(self._model_config_weights) == len(weighted_mcm_scores)

        return sum(
            [
                weighted_mcm_score * model_config_weight
                for weighted_mcm_score, model_config_weight in zip(
                    weighted_mcm_scores, self._model_config_weights
                )
            ]
        )

    def _average_list(self, row_list):
        """
        Average a 2d list
        """

        if not row_list:
            return row_list
        else:
            N = len(row_list)
            d = len(row_list[0])
            avg = [0 for _ in range(d)]
            for i in range(d):
                avg[i] = (
                    sum([row_list[j][i] for j in range(1, N)], start=row_list[0][i])
                    * 1.0
                ) / N
            return avg

    def _deserialize_gpu_data(
        self, serialized_gpu_data: Dict
    ) -> Dict[int, List[Record]]:
        gpu_data = {}
        for gpu_uuid, gpu_data_list in serialized_gpu_data.items():
            metric_list = []
            for [tag, record_dict] in gpu_data_list:
                record_type = RecordType.get(tag)
                record = record_type.from_dict(record_dict)
                metric_list.append(record)
            gpu_data[gpu_uuid] = metric_list

        return gpu_data

    def _get_avg_gpu_data_from_tag(self) -> Dict[str, Record]:
        return {metric.tag: metric for metric in self._avg_gpu_data}

    def _deserialize_model_config_measurements(
        self, serialized_model_config_measurements: List[Dict]
    ) -> List[ModelConfigMeasurement]:
        model_config_measurements = []
        for mcm_dict in serialized_model_config_measurements:
            model_config_measurements.append(ModelConfigMeasurement.from_dict(mcm_dict))

        return model_config_measurements
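
To make the comparison logic above concrete, here is a minimal standalone sketch of the arithmetic performed by set_model_config_weighting, _calculate_weighted_rcm_score, and _compare_measurements. The per-model scores and the threshold value below are hypothetical stand-ins: in Model Analyzer the scores come from ModelConfigMeasurement.get_weighted_score and the threshold from model_analyzer.constants.COMPARISON_SCORE_THRESHOLD.

    # Standalone sketch of RunConfigMeasurement's comparison arithmetic.
    # The threshold value and per-model scores are hypothetical; the real
    # scores come from ModelConfigMeasurement.get_weighted_score() and the
    # real threshold from model_analyzer.constants.COMPARISON_SCORE_THRESHOLD.
    from typing import List

    COMPARISON_SCORE_THRESHOLD = 0.05  # assumed value, for illustration only

    def normalize_weights(model_config_weights: List[int]) -> List[float]:
        # Mirrors set_model_config_weighting(): relative importances are
        # normalized so they sum to 1.
        total = sum(model_config_weights)
        return [w / total for w in model_config_weights]

    def weighted_rcm_score(mcm_scores: List[float], weights: List[float]) -> float:
        # Mirrors _calculate_weighted_rcm_score(): a weighted sum of the
        # per-ModelConfig scores.
        return sum(score * weight for score, weight in zip(mcm_scores, weights))

    def compare(mcm_scores: List[float], weights: List[float]) -> int:
        # Mirrors _compare_measurements(): 1 if self is better, -1 if worse,
        # 0 if the two measurements fall within the threshold of each other.
        score = weighted_rcm_score(mcm_scores, weights)
        if score > COMPARISON_SCORE_THRESHOLD:
            return 1
        elif score < -COMPARISON_SCORE_THRESHOLD:
            return -1
        return 0

    # Two models in the RunConfig, model A twice as important as model B.
    weights = normalize_weights([2, 1])    # -> [0.667, 0.333]
    # Hypothetical per-model scores: A favors self (+0.3), B favors other (-0.1).
    print(compare([0.3, -0.1], weights))   # -> 1: self wins overall

Note the polarity flip in the source: the public compare_measurements method multiplies this internal score by -1 so that a positive return value means other is better, while the internal _compare_measurements keeps positive meaning self is better (the min-heap comment on is_better_than exists for the same reason).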