triton-model-analyzer 1.48.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
model_analyzer/monitor/dcgm/denylist_recommendations.py
@@ -0,0 +1,573 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import sys
import logging
import json
import os

try:
    import model_analyzer.monitor.dcgm.pydcgm as pydcgm
    import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors
    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
    import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem
except:
    # If we don't find the bindings, add the default path and try again
    if 'PYTHONPATH' in os.environ:
        os.environ['PYTHONPATH'] = os.environ[
            'PYTHONPATH'] + ":/usr/local/dcgm/bindings"
    else:
        os.environ['PYTHONPATH'] = '/usr/local/dcgm/bindings'

    import model_analyzer.monitor.dcgm.pydcgm as pydcgm
    import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
    import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem

BR_ST_HEALTHY = 0x0000
BR_ST_NOT_DETECTED = 0x0001
BR_ST_FAILED_PASSIVE_HEALTH = 0x0002
BR_ST_FAILED_ACTIVE_HEALTH = 0x0004

BR_HEALTH_WATCH_BITMAP = dcgm_structs.DCGM_HEALTH_WATCH_ALL

DIAG_SM_STRESS_DURATION = 90.0
DIAG_CONSTANT_POWER_DURATION = 120.0
DIAG_CONSTANT_STRESS_DURATION = 120.0
DIAG_DIAGNOSTIC_DURATION = 300.0

global g_gpus
global g_switches
g_gpus = []
g_switches = []


class Entity(object):

    def __init__(self,
                 entityId,
                 entityType=dcgm_fields.DCGM_FE_GPU,
                 uuid=None,
                 bdf=None):
        self.health = BR_ST_HEALTHY
        self.entityType = entityType
        self.entityId = entityId
        self.reasonsUnhealthy = []
        if uuid:
            self.uuid = uuid
        if bdf:
            self.bdf = bdf

    def IsHealthy(self):
        return self.health == BR_ST_HEALTHY

    def MarkUnhealthy(self, failCondition, reason):
        self.health = self.health | failCondition
        self.reasonsUnhealthy.append(reason)

    def WhyUnhealthy(self):
        return self.reasonsUnhealthy

    def SetEntityId(self, entityId):
        self.entityId = entityId

    def GetEntityId(self):
        return self.entityId

    def GetUUID(self):
        return self.uuid

    def GetBDF(self):
        return self.bdf


def mark_entity_unhealthy(entities, entityId, code, reason):
    found = False
    for entity in entities:
        if entityId == entity.GetEntityId():
            entity.MarkUnhealthy(code, reason)
            found = True

    return found


def addParamString(runDiagInfo, paramIndex, paramStr):
    strIndex = 0
    for c in paramStr:
        runDiagInfo.testParms[paramIndex][strIndex] = c
        strIndex = strIndex + 1


def setTestDurations(runDiagInfo, timePercentage):
    # We only are reducing the test time for the default case
    if runDiagInfo.validate != 3:
        return

    stressDuration = int(DIAG_SM_STRESS_DURATION * timePercentage)
    powerDuration = int(DIAG_CONSTANT_POWER_DURATION * timePercentage)
    constantStressDuration = int(DIAG_CONSTANT_STRESS_DURATION * timePercentage)
    diagDuration = int(DIAG_DIAGNOSTIC_DURATION * timePercentage)

    smParam = "sm stress.test_duration=%d" % (stressDuration)
    powerParam = "targeted power.test_duration=%d" % (powerDuration)
    constantStressParam = "targeted stress.test_duration=%d" % (
        constantStressDuration)
    diagParam = "diagnostic.test_duration=%d" % (diagDuration)

    addParamString(runDiagInfo, 0, diagParam)
    addParamString(runDiagInfo, 1, smParam)
    addParamString(runDiagInfo, 2, constantStressParam)
    addParamString(runDiagInfo, 3, powerParam)


def initialize_run_diag_info(settings):
    runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
    runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
    runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
    testNamesStr = settings['testNames']
    if testNamesStr == '1':
        runDiagInfo.validate = 1
    elif testNamesStr == '2':
        runDiagInfo.validate = 2
    elif testNamesStr == '3':
        runDiagInfo.validate = 3
    else:
        # Make sure no number other that 1-3 were submitted
        if testNamesStr.isdigit():
            raise ValueError("'%s' is not a valid test name" % testNamesStr)

        # Copy to the testNames portion of the object
        names = testNamesStr.split(',')
        testIndex = 0
        if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
            err = 'Aborting DCGM Diag because %d test names were specified exceeding the limit of %d' %\
                (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
            raise ValueError(err)

        for testName in names:
            testNameIndex = 0
            if len(testName) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
                err = 'Aborting DCGM Diag because test name %s exceeds max length %d' % \
                    (testName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
                raise ValueError(err)

            for c in testName:
                runDiagInfo.testNames[testIndex][testNameIndex] = c
                testNameIndex = testNameIndex + 1

            testIndex = testIndex + 1

    if 'timePercentage' in settings:
        setTestDurations(runDiagInfo, settings['timePercentage'])

    activeGpuIds = []

    first = True
    for gpuObj in g_gpus:
        if gpuObj.IsHealthy():
            activeGpuIds.append(gpuObj.GetEntityId())
            if first:
                runDiagInfo.gpuList = str(gpuObj.GetEntityId())
                first = False
            else:
                to_append = ',%s' % (str(gpuObj.GetEntityId()))
                runDiagInfo.gpuList = runDiagInfo.gpuList + to_append

    return runDiagInfo, activeGpuIds


def mark_all_unhealthy(activeGpuIds, reason):
    for gpuId in activeGpuIds:
        mark_entity_unhealthy(g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH, reason)


def result_to_str(result):
    if result == dcgm_structs.DCGM_DIAG_RESULT_PASS:
        return 'PASS'
    elif result == dcgm_structs.DCGM_DIAG_RESULT_SKIP:
        return 'SKIP'
    elif result == dcgm_structs.DCGM_DIAG_RESULT_WARN:
        return 'WARN'
    elif result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
        return 'FAIL'
    else:
        return 'NOT RUN'


def check_passive_health_checks(response, activeGpuIds):
    unhealthy = False
    for i in range(0, dcgm_structs.DCGM_SWTEST_COUNT):
        if response.levelOneResults[
                i].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
            mark_all_unhealthy(activeGpuIds,
                               response.levelOneResults[i].error.msg)
            unhealthy = True
            break

    return unhealthy


def check_gpu_diagnostic(handleObj, settings):
    runDiagInfo, activeGpuIds = initialize_run_diag_info(settings)
    if len(activeGpuIds) == 0:
        return

    response = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo)

    sysError = response.systemError
    if (sysError.code != dcgm_errors.DCGM_FR_OK):
        raise ValueError(sysError)

    if check_passive_health_checks(response, activeGpuIds) == False:
        for gpuIndex in range(response.gpuCount):
            for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT_V8):
                if response.perGpuResponses[gpuIndex].results[
                        testIndex].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
                    gpuId = response.perGpuResponses[gpuIndex].gpuId
                    mark_entity_unhealthy(
                        g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH,
                        response.perGpuResponses[gpuIndex].results[testIndex].
                        result.error.msg)

                    # NVVS marks all subsequent tests as failed so there's no point in continuing
                    break


def query_passive_health(handleObj, desired_watches):
    dcgmGroup = handleObj.GetSystem().GetDefaultGroup()
    watches = dcgmGroup.health.Get()

    # Check for the correct watches to be set and set them if necessary
    if watches != desired_watches:
        dcgmGroup.health.Set(desired_watches)

    return dcgmGroup.health.Check()


def denylist_from_passive_health_check(response):
    for incidentIndex in range(response.incidentCount):
        if response.incidents[
                incidentIndex].health != dcgm_structs.DCGM_HEALTH_RESULT_FAIL:
            # Only add to the denylist for failures; ignore warnings
            continue

        entityId = response.incidents[incidentIndex].entityInfo.entityId
        entityGroupId = response.incidents[
            incidentIndex].entityInfo.entityGroupId
        errorString = response.incidents[incidentIndex].error.msg

        if entityGroupId == dcgm_fields.DCGM_FE_GPU:
            mark_entity_unhealthy(g_gpus, entityId, BR_ST_FAILED_PASSIVE_HEALTH,
                                  errorString)
        else:
            mark_entity_unhealthy(g_switches, entityId,
                                  BR_ST_FAILED_PASSIVE_HEALTH, errorString)


def check_passive_health(handleObj, watches):
    response = query_passive_health(handleObj, watches)

    if response.overallHealth != dcgm_structs.DCGM_HEALTH_RESULT_PASS:
        denylist_from_passive_health_check(response)


def initialize_devices(handle, flags):
    gpuIds = dcgm_agent.dcgmGetEntityGroupEntities(handle,
                                                   dcgm_fields.DCGM_FE_GPU,
                                                   flags)
    switchIds = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_SWITCH, flags)

    i = 0
    for gpuId in gpuIds:
        attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)
        gpuObj = Entity(gpuId,
                        entityType=dcgm_fields.DCGM_FE_GPU,
                        uuid=attributes.identifiers.uuid,
                        bdf=attributes.identifiers.pciBusId)
        g_gpus.append(gpuObj)
        i = i + 1

    i = 0
    for switchId in switchIds:
        switchObj = Entity(switchId, entityType=dcgm_fields.DCGM_FE_SWITCH)
        g_switches.append(switchObj)
        i = i + 1


# Process command line arguments
def __process_command_line__(settings):
    parser = argparse.ArgumentParser()
    parser.add_argument('-g',
                        '--num-gpus',
                        dest='num_gpus',
                        type=int,
                        help='The expected number of GPUs.')
    parser.add_argument('-s',
                        '--num-switches',
                        dest='num_switches',
                        type=int,
                        help='The expected number of NvSwitches.')
    parser.add_argument(
        '-n',
        '--hostname',
        dest='hostname',
        type=str,
        help='The hostname of the nv-hostengine we want to query.')
    parser.add_argument(
        '-d',
        '--detect',
        dest='detect',
        action='store_true',
        help='Run on whatever GPUs can be detected. Do not check counts.')
    parser.add_argument(
        '-l',
        '--log-file',
        dest='logfileName',
        type=str,
        help=
        'The name of the log file where details should be stored. Default is stdout'
    )
    parser.add_argument(
        '-u',
        '--unsupported-too',
        dest='unsupported',
        action='store_true',
        help='Get unsupported devices in addition to the ones DCGM supports')
    parser.add_argument('-f',
                        '--full-report',
                        dest='fullReport',
                        action='store_true',
                        help='Print a health status for each GPU')
    parser.add_argument(
        '-c',
        '--csv',
        dest='outfmtCSV',
        action='store_true',
        help='Write output in csv format. By default, output is in json format.'
    )
    parser.add_argument(
        '-w',
        '--watches',
        dest='watches',
        type=str,
        help=
        'Specify which health watches to monitor. By default, all are watched. Any list of the following may be specified:\n\ta = All watches\n\tp = PCIE\n\tm = Memory\n\ti = Inforom\n\tt = Thermal and Power\n\tn = NVLINK'
    )

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '-r',
        '--specified-test',
        dest='testNames',
        type=str,
        help='Option to specify what tests are run in dcgmi diag.')
    group.add_argument(
        '-i',
        '--instantaneous',
        dest='instant',
        action='store_true',
        help='Specify to skip the longer tests and run instantaneously')
    group.add_argument(
        '-t',
        '--time-limit',
        dest='timeLimit',
        type=int,
        help=
        'The time limit in seconds that all the tests should not exceed. Diagnostics will be reduced in their time to meet this boundary.'
    )

    parser.set_defaults(instant=False, detect=False, fullReport=False)
    args = parser.parse_args()

    if args.num_gpus is not None and args.num_switches is not None:
        settings['numGpus'] = args.num_gpus
        settings['numSwitches'] = args.num_switches
    elif args.detect == False:
        raise ValueError(
            'Must specify either a number of gpus and switches with -g and -s or auto-detect with -d'
        )

    if args.hostname:
        settings['hostname'] = args.hostname
    else:
        settings['hostname'] = 'localhost'

    if args.unsupported:
        settings['entity_get_flags'] = 0
    else:
        settings[
            'entity_get_flags'] = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED

    settings['instant'] = args.instant
    settings['fullReport'] = args.fullReport

    if args.testNames:
        settings['testNames'] = args.testNames
    else:
        settings['testNames'] = '3'

    if args.timeLimit:
        settings['timePercentage'] = float(args.timeLimit) / 840.0

    if args.logfileName:
        logging.basicConfig(filename=args.logfileName)

    if args.outfmtCSV:
        settings['outfmtCSV'] = 1

    if args.watches:
        health_watches = 0
        for c in args.watches:
            if c == 'p':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_PCIE
            elif c == 'm':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_MEM
            elif c == 'i':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_INFOROM
            elif c == 't':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_THERMAL
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_POWER
            elif c == 'n':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_NVLINK
            elif c == 'a':
                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_ALL
            else:
                print(("Unrecognized character %s found in watch string '%s'" %
                       (c, args.watches)))
                sys.exit(-1)
        settings['watches'] = health_watches
    else:
        settings['watches'] = BR_HEALTH_WATCH_BITMAP


def get_entity_id_list(entities):
    ids = ""
    first = True
    for entity in entities:
        if first:
            ids = str(entity.GetEntityId())
        else:
            ids += ",%d" % (entity.GetEntityId())
        first = False

    return ids


def check_health(handleObj, settings, error_list):
    initialize_devices(handleObj.handle, settings['entity_get_flags'])

    if 'numGpus' in settings:
        if len(g_gpus) != settings['numGpus']:
            error_list.append(
                "%d GPUs were specified but only %d were detected with ids '%s'"
                %
                (settings['numGpus'], len(g_gpus), get_entity_id_list(g_gpus)))

    if 'numSwitches' in settings:
        if len(g_switches) != settings['numSwitches']:
            error_list.append(
                "%d switches were specified but only %d were detected with ids '%s'"
                % (settings['numSwitches'], len(g_switches),
                   get_entity_id_list(g_switches)))

    check_passive_health(handleObj, settings['watches'])  # quick check

    if settings['instant'] == False:
        check_gpu_diagnostic(handleObj, settings)


def process_command_line(settings):
    try:
        __process_command_line__(settings)
    except ValueError as e:
        return str(e)


def main():
    # Parse the command line
    settings = {}
    error_list = []

    exitCode = 0
    jsonTop = {}

    error = process_command_line(settings)
    if error:
        # If we had an error processing the command line, don't attempt to check anything
        error_list.append(error)
    else:
        try:
            handleObj = pydcgm.DcgmHandle(None, settings['hostname'],
                                          dcgm_structs.DCGM_OPERATION_MODE_AUTO)

            check_health(handleObj, settings, error_list)
        except dcgm_structs.DCGMError as e:
            # Catch any exceptions from DCGM and add them to the error_list so they'll be printed as JSON
            error_list.append(str(e))
        except ValueError as e:
            error_list.append(str(e))

    if 'outfmtCSV' in settings:  # show all health, then all un-healthy
        for gpuObj in g_gpus:
            if gpuObj.IsHealthy() == True:
                print("healthy,%s,%s" % (gpuObj.GetBDF(), gpuObj.GetUUID()))
        for gpuObj in g_gpus:
            if gpuObj.IsHealthy() == False:
                print("unhealthy,%s,%s,\"%s\"" %
                      (gpuObj.GetBDF(), gpuObj.GetUUID(),
                       gpuObj.WhyUnhealthy()))

    else:  # build obj that can be output in json
        denylistGpus = {}
        healthyGpus = {}
        for gpuObj in g_gpus:
            if gpuObj.IsHealthy() == False:
                details = {}
                details['UUID'] = gpuObj.GetUUID()
                details['BDF'] = gpuObj.GetBDF()
                details['Failure Explanation'] = gpuObj.WhyUnhealthy()
                denylistGpus[gpuObj.GetEntityId()] = details
            elif settings['fullReport']:
                details = {}
                details['UUID'] = gpuObj.GetUUID()
                details['BDF'] = gpuObj.GetBDF()
                healthyGpus[gpuObj.GetEntityId()] = details

        jsonTop['denylistedGpus'] = denylistGpus
        if settings['fullReport']:
            jsonTop['Healthy GPUs'] = healthyGpus

    if len(error_list):  # had error processing the command line
        exitCode = 1
        if 'outfmtCSV' in settings:  # json output
            if len(error_list):
                for errObj in error_list:
                    print("errors,\"%s\"" % (errObj))
        else:
            jsonTop['errors'] = error_list

    if 'outfmtCSV' in settings:  # show all health, then all un-healthy
        pass
    else:
        print(json.dumps(jsonTop, indent=4, separators=(',', ': ')))

    sys.exit(exitCode)


if __name__ == '__main__':
    main()
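The file above is the standalone DCGM health-check script that Model Analyzer bundles under its monitor package. For orientation, here is a minimal sketch of the active-diagnostic call path it drives (a c_dcgmRunDiag_v7 request passed to dcgmActionValidate_v2). It is illustrative only, not part of the packaged file, and the hostname, GPU list, and validation level are assumptions chosen for the example.

```python
# Illustrative sketch, not part of the package: mirrors the call path used by
# initialize_run_diag_info() / check_gpu_diagnostic() in the file above.
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs

# Connect to a running nv-hostengine (hostname assumed for the example)
handle = pydcgm.DcgmHandle(None, "localhost",
                           dcgm_structs.DCGM_OPERATION_MODE_AUTO)

# Build the same request object the script builds
runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
runDiagInfo.validate = 1      # shortest of the numbered validation levels
runDiagInfo.gpuList = "0"     # comma-separated GPU ids, assumed for the example

# Run the diagnostic and report any level-one failures, as
# check_passive_health_checks() does above
response = dcgm_agent.dcgmActionValidate_v2(handle.handle, runDiagInfo)
for i in range(dcgm_structs.DCGM_SWTEST_COUNT):
    if response.levelOneResults[i].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
        print(response.levelOneResults[i].error.msg)
```

The packaged script defaults to validation level 3, trims test durations only when a --time-limit is given, and additionally walks the per-GPU results to mark failing entities unhealthy.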
model_analyzer/monitor/dcgm/pydcgm.py
@@ -0,0 +1,47 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def _python_version_check():
    import sys
    python_version = sys.version.split(None, 1)[0]
    if python_version < '3':
        print(
            '[ERROR] Detected Python version {}. These bindings are for Python 3.5+. Please load the Python 2 bindings found at /usr/local/dcgm/bindings'
            .format(python_version))
        sys.exit(1)


_python_version_check()

#Bring classes into this namespace
from model_analyzer.monitor.dcgm.DcgmHandle import *
from model_analyzer.monitor.dcgm.DcgmGroup import *
from model_analyzer.monitor.dcgm.DcgmStatus import *
from model_analyzer.monitor.dcgm.DcgmSystem import *
from model_analyzer.monitor.dcgm.DcgmFieldGroup import *

import os
if '__DCGM_TESTING_FRAMEWORK_ACTIVE' in os.environ and os.environ[
        '__DCGM_TESTING_FRAMEWORK_ACTIVE'] == '1':
    import utils
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
'''
Define a unique exception type we will return so that callers can distinguish our exceptions from python standard ones
'''


class DcgmException(Exception):
    pass
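pydcgm is the import entry point for the bundled DCGM bindings: it checks the Python version, then re-exports the DcgmHandle, DcgmGroup, DcgmStatus, DcgmSystem, and DcgmFieldGroup wrappers. As a rough usage sketch (again illustrative, not part of the package; the hostname is an assumption), the passive health check that denylist_recommendations.py performs through these wrappers reduces to:

```python
# Illustrative sketch, not part of the package: mirrors query_passive_health()
# and denylist_from_passive_health_check() in denylist_recommendations.py above.
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs

# Connect to a local nv-hostengine (hostname assumed for the example)
handle = pydcgm.DcgmHandle(None, "localhost",
                           dcgm_structs.DCGM_OPERATION_MODE_AUTO)
group = handle.GetSystem().GetDefaultGroup()

# Watch every health subsystem, then poll the incident report
group.health.Set(dcgm_structs.DCGM_HEALTH_WATCH_ALL)
response = group.health.Check()
if response.overallHealth != dcgm_structs.DCGM_HEALTH_RESULT_PASS:
    for i in range(response.incidentCount):
        print(response.incidents[i].error.msg)
```

DcgmException is defined in pydcgm so that callers can distinguish binding-specific failures from standard Python exceptions, as the module docstring notes.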