triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,815 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
|
|
16
|
+
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
|
|
17
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
18
|
+
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
|
|
19
|
+
import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers
|
|
20
|
+
from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DcgmGroupConfig:
    '''
    Configuration operations (set / get / enforce) for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the collaborators used by the other methods.

        dcgmHandle: object whose .handle attribute is passed to dcgm_agent
        groupId: id of the group every call operates on
        dcgmGroup: owning group object; Get() calls its GetGpuIds()
        '''
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def _ThrowOnFailure(self, status, ret, pendingError):
        '''
        Raise the most specific error available for a finished agent call.

        status: pydcgm.DcgmStatus that was passed to the agent call
        ret: return code of the agent call (DCGM_ST_OK if it raised)
        pendingError: DCGMError raised by the agent call, or None
        '''
        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        #The agent call itself raised: surface that error instead of
        #dropping it when the status object carried no details
        if pendingError is not None:
            raise pendingError
        #Throw an appropriate exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def Set(self, config):
        '''
        Set configuration for this group

        config should be an instance of dcgm_structs.c_dcgmDeviceConfig_v1

        Will throw an exception on error. Errors recorded in the status
        object are raised first; otherwise the error raised or returned by
        dcgm_agent.dcgmConfigSet is raised.
        '''
        status = pydcgm.DcgmStatus()
        ret = dcgm_structs.DCGM_ST_OK
        pendingError = None

        try:
            ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle,
                                           self._groupId, config, status.handle)
        except dcgm_structs.DCGMError as err:
            # "err" is unbound when the except block ends, so keep a reference
            pendingError = err

        self._ThrowOnFailure(status, ret, pendingError)

    def Get(self, configType):
        '''
        Get configuration for this group

        configType is a DCGM_CONFIG_? constant

        Returns an array of dcgm_structs.c_dcgmDeviceConfig_v1 objects
        Throws an exception on error
        '''
        status = pydcgm.DcgmStatus()

        # The number of GPUs in the group is passed to the agent call
        gpuIds = self._dcgmGroup.GetGpuIds()
        configList = dcgm_agent.dcgmConfigGet(self._dcgmHandle.handle,
                                              self._groupId, configType,
                                              len(gpuIds), status.handle)
        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        return configList

    def Enforce(self):
        '''
        Enforce the configuration that has been set with Set()

        Throws an exception on error. Errors recorded in the status object
        are raised first; otherwise the error raised or returned by
        dcgm_agent.dcgmConfigEnforce is raised.
        '''
        status = pydcgm.DcgmStatus()
        ret = dcgm_structs.DCGM_ST_OK
        pendingError = None

        try:
            ret = dcgm_agent.dcgmConfigEnforce(self._dcgmHandle.handle,
                                               self._groupId, status.handle)
        except dcgm_structs.DCGMError as err:
            # "err" is unbound when the except block ends, so keep a reference
            pendingError = err

        self._ThrowOnFailure(status, ret, pendingError)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class DcgmGroupSamples:
    '''
    Field watching and sample retrieval for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the collaborators used by the other methods.

        dcgmHandle: object whose .handle attribute is passed to dcgm_agent
        groupId: id of the group every call operates on
        dcgmGroup: owning group object
        '''
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def WatchFields(self, fieldGroup, updateFreq, maxKeepAge, maxKeepSamples):
        '''
        Tell DCGM to start recording samples for the given field group

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
        updateFreq: How often to update these fields in usec
        maxKeepAge: How long to keep data for these fields in seconds
        maxKeepSamples: Maximum number of samples to keep per field. 0=no limit

        Once the field collection is watched, it will update whenever the next update
        loop occurs. If you want to query these values immediately, use
        handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
        '''
        ret = dcgm_agent.dcgmWatchFields(self._dcgmHandle.handle, self._groupId,
                                         fieldGroup.fieldGroupId, updateFreq,
                                         maxKeepAge, maxKeepSamples)
        dcgm_structs._dcgmCheckReturn(ret)

    def UnwatchFields(self, fieldGroup):
        '''
        Tell DCGM to stop recording samples for a given field group

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to unwatch.
        '''
        ret = dcgm_agent.dcgmUnwatchFields(self._dcgmHandle.handle,
                                           self._groupId,
                                           fieldGroup.fieldGroupId)
        dcgm_structs._dcgmCheckReturn(ret)

    def GetLatest(self, fieldGroup):
        '''
        Get the most recent values for each field in a field collection

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][0].value to access values
        '''
        dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetLatest_v2(self, fieldGroup):
        '''
        Get the most recent values for each field in a field collection

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][0].value to access values
        '''
        dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvec.GetLatestValues(fieldGroup)
        return dfvec

    def GetAllSinceLastCall(self, dfvc, fieldGroup):
        '''
        Get the new values for each field in a field collection since the last
        collection.

        dfvc: DcgmFieldValueCollection() instance. Will return a
              DcgmFieldValueCollection with values since the one passed in.
              Pass None for the first call to get one for subsequent calls.
              On subsequent calls, pass what was returned.
        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][*].value to access values
        '''
        if dfvc is None:
            # First call: create the collection and seed it with the latest values
            dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
                self._dcgmHandle.handle, self._groupId)
            dfvc.GetLatestValues(fieldGroup)
        else:
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            dfvc.GetAllSinceLastCall(fieldGroup)
            if len(dfvc.values) == 0:
                dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetAllSinceLastCall_v2(self, dvfec, fieldGroup):
        '''
        Gets more values for each field in a field entity collection

        dvfec: DcgmFieldValueEntityCollection() instance. Will return a
               DcgmFieldValueEntityCollection with values since the one passed
               in. Pass None for the first call to get one for subsequent
               calls. On subsequent calls, pass what was returned.
               (The parameter name is a historical misspelling of "dfvec"; it
               is kept so the signature does not change.)

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][*].value to access values
        '''
        # The body previously read "dfvec" without ever binding it from the
        # "dvfec" parameter, so every call failed with UnboundLocalError.
        dfvec = dvfec
        if dfvec is None:
            dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
                self._dcgmHandle.handle, self._groupId)
            # Was spelled "GetLastestValues_v2"; now matches the call below.
            # NOTE(review): GetLatest_v2() above calls the unsuffixed
            # GetLatestValues() on the same collection type - confirm the
            # *_v2 method names against dcgm_field_helpers.
            dfvec.GetLatestValues_v2(fieldGroup)
        else:
            dfvec.GetAllSinceLastCall_v2(fieldGroup)
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            if len(dfvec.values) == 0:
                dfvec.GetLatestValues_v2(fieldGroup)

        return dfvec

    def UpdateAllFields(self, waitForUpdate):
        '''
        Convenience alias for DcgmHandle.UpdateAllFields(). All fields on the system will be updated, not
        just this group's.
        '''
        self._dcgmHandle.UpdateAllFields(waitForUpdate)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class DcgmGroupHealth:
    '''
    Health-watch operations for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the handle wrapper, the group id and the owning group object.
        '''
        self._dcgmHandle, self._groupId, self._dcgmGroup = (dcgmHandle, groupId,
                                                            dcgmGroup)

    def Set(self, systems, updateInterval=None, maxKeepAge=None):
        '''
        Enable health checks for this group

        systems: A bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks to enable
        updateInterval: How often DCGM should request new health data from the driver in usec
        maxKeepAge: How long DCGM should keep health data around once it has been retrieved from the driver in seconds

        dcgmHealthSet_v2 is used only when both updateInterval and maxKeepAge
        are given; otherwise dcgmHealthSet is called without them.
        '''
        timingGiven = updateInterval is not None and maxKeepAge is not None
        if timingGiven:
            rc = dcgm_agent.dcgmHealthSet_v2(self._dcgmHandle.handle,
                                             self._groupId, systems,
                                             updateInterval, maxKeepAge)
        else:
            rc = dcgm_agent.dcgmHealthSet(self._dcgmHandle.handle,
                                          self._groupId, systems)
        dcgm_structs._dcgmCheckReturn(rc)

    def Get(self):
        '''
        Retrieve the current state of the DCGM health check system

        Returns a bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks are currently enabled
        '''
        return dcgm_agent.dcgmHealthGet(self._dcgmHandle.handle, self._groupId)

    def Check(self, version=dcgm_structs.dcgmHealthResponse_version4):
        '''
        Check the configured watches for any errors/failures/warnings that have occurred
        since the last time this check was invoked. On the first call, stateful information
        about all of the enabled watches within a group is created but no error results are
        provided. On subsequent calls, any error information will be returned.

        @param version IN: Allows the caller to use an older version of this request. Should be
                           dcgm_structs.dcgmHealthResponse_version4

        Returns a dcgm_structs.c_dcgmHealthResponse_* object that contains results for each GPU/entity
        '''
        return dcgm_agent.dcgmHealthCheck(self._dcgmHandle.handle,
                                          self._groupId, version)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
class DcgmGroupPolicy:
    '''
    Violation-policy operations for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the handle wrapper, the group id and the owning group object.
        '''
        self._dcgmHandle, self._groupId, self._dcgmGroup = (dcgmHandle, groupId,
                                                            dcgmGroup)

    def Get(self, statusHandle=None):
        '''
        Get the current violation policy inside the policy manager. Given a groupId, a number of
        policy structures are retrieved.

        @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status of the operation. Pass it as None
                                    if the detailed error information for the operation is not needed (default).

        Returns a list of dcgm_structs.c_dcgmPolicy_v1 with the same length as the number of GPUs in the group.
        The index of an entry corresponds to a given GPU ID in the group. Throws an exception on error.
        '''
        # Unwrap the status object to its raw handle when one was supplied
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        gpuCount = len(self._dcgmGroup.GetGpuIds())
        if gpuCount <= 0:
            raise pydcgm.DcgmException(
                "This group has no GPUs, cannot retrieve policies")
        return dcgm_agent.dcgmPolicyGet(self._dcgmHandle.handle, self._groupId,
                                        gpuCount, rawStatus)

    def Set(self, policy, statusHandle=None):
        '''
        Set the current violation policy inside the policy manager. Given the conditions within "policy",
        if a violation has occurred, subsequent action(s) may be performed to either
        report or contain the failure.

        This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.

        @param policy IN: dcgm_structs.c_dcgmPolicy_v1 that will be applied to all GPUs in the group

        @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status for the operation. Pass it as
                                    None if the detailed error information for the operation is not needed (default).

        Returns Nothing. Throws an exception on error
        '''
        # Unwrap the status object to its raw handle when one was supplied
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        dcgm_agent.dcgmPolicySet(self._dcgmHandle.handle, self._groupId, policy,
                                 rawStatus)

    def Register(self, condition, beginCallback=None, finishCallback=None):
        '''
        Register a function to be called when a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition)
        has been violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after
        DcgmPolicy.Trigger when in DCGM_OPERATION_MODE_MANUAL mode.
        All callbacks are made within a separate thread.

        This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.

        @param condition IN: The set of conditions specified as an OR'd list
                             (see dcgm_structs.DCGM_POLICY_COND_*)
                             for which to register a callback function

        @param beginCallback IN: A function that should be called should a violation occur. This
                                 function will be called prior to any actions specified by the policy are taken.

        @param finishCallback IN: A reference to a function that should be called should a violation occur.
                                  This function will be called after any action specified by the policy are completed.

        At least one callback must be provided that is not None.

        Returns Nothing. Throws an exception on error.
        '''
        if all(cb is None for cb in (beginCallback, finishCallback)):
            raise pydcgm.DcgmException(
                "At least 1 callback must be provided to register that is not None"
            )
        dcgm_agent.dcgmPolicyRegister(self._dcgmHandle.handle, self._groupId,
                                      condition, beginCallback, finishCallback)

    def Unregister(self, condition):
        '''
        Unregister a function to be called for a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition) .
        This function will unregister all callbacks for a given condition.

        @param condition IN: The set of conditions specified as an OR'd list
                             (see dcgm_structs.DCGM_POLICY_COND_*)
                             for which to unregister a callback function

        Returns Nothing. Throws an exception on error.
        '''
        dcgm_agent.dcgmPolicyUnregister(self._dcgmHandle.handle, self._groupId,
                                        condition)

    def Trigger(self):
        '''
        Inform the policy manager loop to perform an iteration and trigger the callbacks of any
        registered functions. Callback functions will be called from a separate thread as the calling function.

        Note: The GPU monitoring and management agent must call this method periodically if the operation
        mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization
        (see DcgmHandle.__init__).

        Returns Nothing. Throws an exception if there is a generic error that the
        policy manager was unable to perform another iteration.
        '''
        dcgm_agent.dcgmPolicyTrigger(self._dcgmHandle.handle)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
class DcgmGroupDiscovery:
    '''
    Topology discovery for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the handle wrapper, the group id and the owning group object.
        '''
        self._dcgmHandle, self._groupId, self._dcgmGroup = (dcgmHandle, groupId,
                                                            dcgmGroup)

    def GetTopology(self):
        '''
        Get the topology for this group

        Returns a c_dcgmGroupTopology_v1 object representing the topology for this group
        '''
        topology = dcgm_agent.dcgmGetGroupTopology(self._dcgmHandle.handle,
                                                   self._groupId)
        return topology
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
class DcgmGroupStats:
|
|
410
|
+
|
|
411
|
+
def __init__(self, dcgmHandle, groupId, dcgmGroup):
|
|
412
|
+
self._dcgmHandle = dcgmHandle
|
|
413
|
+
self._groupId = groupId
|
|
414
|
+
self._dcgmGroup = dcgmGroup
|
|
415
|
+
|
|
416
|
+
'''
|
|
417
|
+
Tell DCGM to start recording samples for fields returned from GetPidInfo()
|
|
418
|
+
|
|
419
|
+
updateFreq: How often to update these fields in usec
|
|
420
|
+
maxKeepAge: How long to keep data for these fields in seconds
|
|
421
|
+
maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
|
|
422
|
+
|
|
423
|
+
Once the field collection is watched, it will update whenever the next update
|
|
424
|
+
loop occurs. If you want to query these values immediately, use
|
|
425
|
+
handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
|
|
426
|
+
'''
|
|
427
|
+
|
|
428
|
+
def WatchPidFields(self, updateFreq, maxKeepAge, maxKeepSamples):
    '''
    Start recording the samples that GetPidInfo() reports on.

    updateFreq: How often to update these fields in usec
    maxKeepAge: How long to keep data for these fields in seconds
    maxKeepSamples: Maximum number of samples to keep per field. 0=no limit

    The agent's return code is checked with dcgm_structs._dcgmCheckReturn.
    '''
    rc = dcgm_agent.dcgmWatchPidFields(
        self._dcgmHandle.handle,
        self._groupId,
        updateFreq,
        maxKeepAge,
        maxKeepSamples,
    )
    dcgm_structs._dcgmCheckReturn(rc)
|
|
433
|
+
|
|
434
|
+
'''
|
|
435
|
+
Get process stats for a given PID on this GPU group
|
|
436
|
+
|
|
437
|
+
You must call WatchPidFields() before this query for this method to return any results
|
|
438
|
+
|
|
439
|
+
Returns a dcgm_structs.c_dcgmPidInfo_v2 structure
|
|
440
|
+
'''
|
|
441
|
+
|
|
442
|
+
def GetPidInfo(self, pid):
|
|
443
|
+
return dcgm_agent.dcgmGetPidInfo(self._dcgmHandle.handle, self._groupId,
|
|
444
|
+
pid)
|
|
445
|
+
|
|
446
|
+
'''
|
|
447
|
+
Tell DCGM to start recording samples for fields returned from GetJobStats()
|
|
448
|
+
|
|
449
|
+
updateFreq: How often to update these fields in usec
|
|
450
|
+
maxKeepAge: How long to keep data for these fields in seconds
|
|
451
|
+
maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
|
|
452
|
+
|
|
453
|
+
Once the fields are watched, they will update whenever the next update
|
|
454
|
+
loop occurs. If you want to query these values immediately, use
|
|
455
|
+
handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
|
|
456
|
+
'''
|
|
457
|
+
|
|
458
|
+
def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples):
|
|
459
|
+
ret = dcgm_agent.dcgmWatchJobFields(self._dcgmHandle.handle,
|
|
460
|
+
self._groupId, updateFreq,
|
|
461
|
+
maxKeepAge, maxKeepSamples)
|
|
462
|
+
dcgm_structs._dcgmCheckReturn(ret)
|
|
463
|
+
|
|
464
|
+
'''
|
|
465
|
+
Start collecting stats for a named job for this GPU group
|
|
466
|
+
|
|
467
|
+
Calling this will tell DCGM to start tracking stats for the given jobId. Stats tracking
|
|
468
|
+
will end when StopJobStats() is called
|
|
469
|
+
|
|
470
|
+
You must call WatchJobFields() before this call to tell DCGM to start sampling the fields
|
|
471
|
+
that are returned from GetJobStats().
|
|
472
|
+
|
|
473
|
+
jobId is a unique string identifier for this job. An exception will be thrown if this is not unique
|
|
474
|
+
|
|
475
|
+
Returns Nothing (Will throw exception on error)
|
|
476
|
+
'''
|
|
477
|
+
|
|
478
|
+
def StartJobStats(self, jobId):
|
|
479
|
+
ret = dcgm_agent.dcgmJobStartStats(self._dcgmHandle.handle,
|
|
480
|
+
self._groupId, jobId)
|
|
481
|
+
dcgm_structs._dcgmCheckReturn(ret)
|
|
482
|
+
|
|
483
|
+
'''
|
|
484
|
+
Stop collecting stats for a named job
|
|
485
|
+
|
|
486
|
+
Calling this will tell DCGM to stop collecting stats for a job that was previously started
|
|
487
|
+
with StartJobStats().
|
|
488
|
+
|
|
489
|
+
jobId is the unique string that was passed as jobId to StartJobStats.
|
|
490
|
+
|
|
491
|
+
Returns Nothing (Will throw exception on error)
|
|
492
|
+
'''
|
|
493
|
+
|
|
494
|
+
def StopJobStats(self, jobId):
|
|
495
|
+
ret = dcgm_agent.dcgmJobStopStats(self._dcgmHandle.handle, jobId)
|
|
496
|
+
dcgm_structs._dcgmCheckReturn(ret)
|
|
497
|
+
|
|
498
|
+
'''
|
|
499
|
+
Get stats for a job that was started with StartJobStats. If StopJobStats has not been called yet,
|
|
500
|
+
this will get stats from when the job started until now. If StopJob was called prior to
|
|
501
|
+
this, the returned Stats will go from when StartJobStats was called to when StopJobStats was called.
|
|
502
|
+
|
|
503
|
+
jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
|
|
504
|
+
|
|
505
|
+
Returns a dcgm_structs.c_dcgmJobInfo_v3 structure. Throws an exception on error
|
|
506
|
+
'''
|
|
507
|
+
|
|
508
|
+
def GetJobStats(self, jobId):
|
|
509
|
+
ret = dcgm_agent.dcgmJobGetStats(self._dcgmHandle.handle, jobId)
|
|
510
|
+
return ret
|
|
511
|
+
|
|
512
|
+
'''
|
|
513
|
+
This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer
|
|
514
|
+
be able to call GetJobStats() on this jobId. However, you will be able to reuse jobId after
|
|
515
|
+
this call.
|
|
516
|
+
|
|
517
|
+
jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
|
|
518
|
+
|
|
519
|
+
Returns Nothing (Will throw exception on error)
|
|
520
|
+
'''
|
|
521
|
+
|
|
522
|
+
def RemoveJob(self, jobId):
|
|
523
|
+
ret = dcgm_agent.dcgmJobRemove(self._dcgmHandle.handle, jobId)
|
|
524
|
+
return ret
|
|
525
|
+
|
|
526
|
+
'''
|
|
527
|
+
This API tells DCGM to stop tracking all jobs. After this call, you will no longer
|
|
528
|
+
be able to call dcgmJobGetStats() any jobs until you call StartJobStats() again.
|
|
529
|
+
You will be able to reuse any previously-used jobIds after this call.
|
|
530
|
+
|
|
531
|
+
Returns Nothing (Will throw exception on error)
|
|
532
|
+
'''
|
|
533
|
+
|
|
534
|
+
def RemoveAllJobs(self):
|
|
535
|
+
ret = dcgm_agent.dcgmJobRemoveAll(self._dcgmHandle.handle)
|
|
536
|
+
return ret
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
class DcgmGroupAction:
    '''
    Validation and diagnostic calls for one DCGM group.

    Every method forwards to dcgm_agent using the handle and group id given
    to the constructor and returns the agent's result unchanged.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        '''
        Store the DCGM handle, the group id and the owning group object.
        '''
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def _NewRunDiagRequest(self):
        '''
        Build a c_dcgmRunDiag_v7 request with its version and groupId filled in.
        '''
        request = dcgm_structs.c_dcgmRunDiag_v7()
        request.version = dcgm_structs.dcgmRunDiag_version7
        request.groupId = self._groupId
        return request

    def Validate(self, validate):
        '''
        Inform the action manager to perform a manual validation of a group of
        GPUs on the system

        validate is what sort of validation to do. See
        dcgm_structs.DCGM_POLICY_VALID_* defines.

        Returns the result of dcgm_agent.dcgmActionValidate_v2() unchanged
        (documented as a dcgm_structs.c_dcgmDiagResponse_v5 instance)
        '''
        request = self._NewRunDiagRequest()
        request.validate = validate
        return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                                request)

    def RunDiagnostic(self, diagLevel):
        '''
        Run a diagnostic on this group of GPUs.

        diagLevel is the level of diagnostic desired. See
        dcgm_structs.DCGM_DIAG_LVL_* constants.

        Returns the result of dcgm_agent.dcgmRunDiagnostic() unchanged
        (documented as a dcgm_structs.c_dcgmDiagResponse_v5 instance)
        '''
        rawHandle = self._dcgmHandle.handle
        return dcgm_agent.dcgmRunDiagnostic(rawHandle, self._groupId,
                                            diagLevel)

    def RunSpecificTest(self, testName):
        '''
        Run a specific diagnostic test on this group of GPUs.

        testName is the name of the specific test that should be invoked.

        Returns the result of dcgm_agent.dcgmActionValidate_v2() unchanged
        (documented as a dcgm_structs.c_dcgmDiagResponse_v5 instance)
        '''
        request = self._NewRunDiagRequest()
        request.validate = dcgm_structs.DCGM_POLICY_VALID_NONE

        # The name is copied element by element into the first testNames slot.
        # NOTE(review): no length or element-type check is done here; presumably
        # testNames[0] is a fixed-size character buffer - confirm against
        # dcgm_structs.c_dcgmRunDiag_v7 before passing long or non-bytes names.
        for position, character in enumerate(testName):
            request.testNames[0][position] = character

        return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                                request)
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
class DcgmGroupProfiling:
    """Profiling queries for the entities of one DCGM group."""

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """
        Store the objects the profiling calls need.

        Parameters
        ----------
        dcgmHandle : DcgmHandle
        groupId : int
        dcgmGroup : DcgmGroup
        """
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def GetSupportedMetricGroups(self):
        """
        Get a list of the profiling metric groups available for this group of entities

        The query is issued for the first GPU id reported by the owning
        group's GetGpuIds().

        :return: dcgm_structs.c_dcgmProfGetMetricGroups_v3
        :throws: dcgm_structs.DCGMError_ProfilingNotSupported if the group
                 contains no GPUs; dcgm_structs.DCGMError on error
        """
        memberGpuIds = self._dcgmGroup.GetGpuIds()
        if not memberGpuIds:
            # Nothing to ask the agent about when the group has no GPU members.
            raise dcgm_structs.DCGMError_ProfilingNotSupported

        firstGpuId = memberGpuIds[0]
        return dcgm_agent.dcgmProfGetSupportedMetricGroups(
            self._dcgmHandle.handle, firstGpuId)
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
class DcgmGroup:
    '''
    A DCGM entity group together with its namespace helper objects.

    Constructor arguments:

    Either groupId OR groupName must be provided as a parameter.
    This will set which GPU group this object is bound to

    groupId=DCGM_GROUP_ALL_GPUS creates a group with all GPUs. Passing an
            existing groupId will not create an additional group.
    If groupName is provided, an empty group (No GPUs) of name groupName will
            be created. This group will be destroyed when this object goes out
            of scope or is deleted with del().
    groupType is the type of group to create. See dcgm_structs.DCGM_GROUP_?
            constants.
    '''

    def __init__(self,
                 dcgmHandle,
                 groupId=None,
                 groupName=None,
                 groupType=dcgm_structs.DCGM_GROUP_EMPTY):
        '''
        Bind to an existing group (groupId) or create a new one (groupName),
        then build the per-area namespace objects.

        Raises pydcgm.DcgmException when neither groupId nor groupName is given.
        '''
        self._dcgmHandle = dcgmHandle

        if groupId is None and groupName is None:
            raise pydcgm.DcgmException(
                "Either groupId or groupName is required")

        # An explicit groupId wins; a group is only created when it is absent.
        if groupId is None:
            groupId = dcgm_agent.dcgmGroupCreate(self._dcgmHandle.handle,
                                                 groupType, groupName)
        self._groupId = groupId

        #Create namespace classes
        handle = self._dcgmHandle
        boundId = self._groupId
        self.config = DcgmGroupConfig(handle, boundId, self)
        self.samples = DcgmGroupSamples(handle, boundId, self)
        self.health = DcgmGroupHealth(handle, boundId, self)
        self.policy = DcgmGroupPolicy(handle, boundId, self)
        self.discovery = DcgmGroupDiscovery(handle, boundId, self)
        self.stats = DcgmGroupStats(handle, boundId, self)
        self.action = DcgmGroupAction(handle, boundId, self)
        self.profiling = DcgmGroupProfiling(handle, boundId, self)

    def Delete(self):
        '''
        Remove this group from DCGM. This object will no longer be valid after
        this call.

        The namespace attributes are reset to None, the group is destroyed in
        DCGM unless it is one of the predefined groups, and the stored group
        id is cleared.
        '''
        namespaceAttrs = ("config", "samples", "health", "policy",
                          "discovery", "stats", "action", "profiling")
        for attrName in namespaceAttrs:
            # Drop the helper object, then leave a None placeholder behind.
            delattr(self, attrName)
            setattr(self, attrName, None)

        #Delete the group we created if we're not using the special all-GPU group
        if self._groupId is not None and not self._IsGroupIdStatic():
            status = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
                                                 self._groupId)
            dcgm_structs._dcgmCheckReturn(status)

        self._groupId = None

    def _IsGroupIdStatic(self):
        '''
        Private method to determine if our groupId is a predefined one
        (DCGM_GROUP_ALL_GPUS or DCGM_GROUP_ALL_NVSWITCHES)
        '''
        return bool(self._groupId == dcgm_structs.DCGM_GROUP_ALL_GPUS
                    or self._groupId == dcgm_structs.DCGM_GROUP_ALL_NVSWITCHES)

    def AddGpu(self, gpuId):
        '''
        Add a GPU to this group

        gpuId is the GPU ID to add to our group

        Returns Nothing. Throws an exception on error
        (pydcgm.DcgmException if this is a predefined group)
        '''
        if self._IsGroupIdStatic():
            raise pydcgm.DcgmException("Can't add a GPU to a static group")

        status = dcgm_agent.dcgmGroupAddDevice(
            self._dcgmHandle.handle, self._groupId, gpuId)
        dcgm_structs._dcgmCheckReturn(status)

    def AddEntity(self, entityGroupId, entityId):
        '''
        Add an entity to this group

        entityGroupId is DCGM_FE_? constant of the entity group this entity
                      belongs to
        entityId is the entity to add to this group

        Returns Nothing. Throws an exception on error
        (pydcgm.DcgmException if this is a predefined group)
        '''
        if self._IsGroupIdStatic():
            raise pydcgm.DcgmException("Can't add an entity to a static group")

        status = dcgm_agent.dcgmGroupAddEntity(
            self._dcgmHandle.handle, self._groupId, entityGroupId, entityId)
        dcgm_structs._dcgmCheckReturn(status)

    def RemoveGpu(self, gpuId):
        '''
        Remove a GPU from this group

        gpuId is the GPU ID to remove from our group

        Returns Nothing. Throws an exception on error
        (pydcgm.DcgmException if this is a predefined group)
        '''
        if self._IsGroupIdStatic():
            raise pydcgm.DcgmException("Can't remove a GPU from a static group")

        status = dcgm_agent.dcgmGroupRemoveDevice(
            self._dcgmHandle.handle, self._groupId, gpuId)
        dcgm_structs._dcgmCheckReturn(status)

    def RemoveEntity(self, entityGroupId, entityId):
        '''
        Remove an entity from this group

        entityGroupId is DCGM_FE_? constant of the entity group this entity
                      belongs to
        entityId is the entity to remove from this group

        Returns Nothing. Throws an exception on error
        (pydcgm.DcgmException if this is a predefined group)
        '''
        if self._IsGroupIdStatic():
            raise pydcgm.DcgmException(
                "Can't remove an entity from a static group")

        status = dcgm_agent.dcgmGroupRemoveEntity(
            self._dcgmHandle.handle, self._groupId, entityGroupId, entityId)
        dcgm_structs._dcgmCheckReturn(status)

    def GetGpuIds(self):
        '''
        Get an array of GPU ids that are part of this group

        Note: this ignores non-GPU members of the group

        Returns a list of GPU ids. Throws an exception on error
        '''
        info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
                                           self._groupId)
        # Only the first info.count entries of entityList are considered.
        return [
            member.entityId
            for member in info.entityList[:info.count]
            if member.entityGroupId == dcgm_fields.DCGM_FE_GPU
        ]

    def GetEntities(self):
        '''
        Get an array of entities that are part of this group

        Returns a list of c_dcgmGroupEntityPair_t structs. Throws an exception
        on error
        '''
        info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
                                           self._groupId)
        return info.entityList[:info.count]

    def GetId(self):
        '''
        Get the groupId of this object

        Returns our groupId
        '''
        return self._groupId
|