triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
model_analyzer/monitor/dcgm/DcgmSystem.py
@@ -0,0 +1,412 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
import ctypes


class DcgmSystemDiscovery:
    '''
    Constructor
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    '''
    Get all IDs of the GPUs that DCGM knows about. To get only GPUs that DCGM supports,
    use GetAllSupportedGpuIds().

    Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
    '''

    def GetAllGpuIds(self):
        gpuIds = dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)
        return gpuIds

    '''
    Get all of the IDs of the GPUs that DCGM supports. This will exclude unsupported
    GPUs.

    Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
    '''

    def GetAllSupportedGpuIds(self):
        gpuIds = dcgm_agent.dcgmGetAllSupportedDevices(self._dcgmHandle.handle)
        return gpuIds

    '''
    Get some basic GPU attributes for a given GPU ID.

    Returns a dcgm_structs.c_dcgmDeviceAttributes_v3() object for the given GPU
    '''

    def GetGpuAttributes(self, gpuId):
        return dcgm_agent.dcgmGetDeviceAttributes(self._dcgmHandle.handle,
                                                  gpuId)

    '''
    Get topology information for a given GPU ID

    Returns a dcgm_structs.c_dcgmDeviceTopology_v1 structure representing the topology for the given GPU
    '''

    def GetGpuTopology(self, gpuId):
        return dcgm_agent.dcgmGetDeviceTopology(self._dcgmHandle.handle, gpuId)

    '''
    Get all entityIds of the entities that DCGM knows about.

    entityGroupId IN: DCGM_FE_? constant of the entity group to fetch the entities of
    onlySupported IN: Boolean as to whether to fetch only entities that are supported by DCGM (True)
                      or all entity IDs (False)

    Returns an array of entity IDs. Each of these can be passed to DcgmGroup::AddEntity()
    '''

    def GetEntityGroupEntities(self, entityGroupId, onlySupported):
        flags = 0
        if onlySupported:
            flags |= dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED
        entityIds = dcgm_agent.dcgmGetEntityGroupEntities(
            self._dcgmHandle.handle, entityGroupId, flags)
        return entityIds

    '''
    Get the status of all of the NvLink links in the system.

    Returns a dcgm_structs.c_dcgmNvLinkStatus_v3 object.
    '''

    def GetNvLinkLinkStatus(self):
        return dcgm_agent.dcgmGetNvLinkLinkStatus(self._dcgmHandle.handle)

    '''
    From a bitmask of input gpu ids, return a bitmask of numGpus GPUs which identifies the topologically
    closest GPUs to use for a single job. DCGM will consider CPU affinities and NVLink connection speeds
    to determine the closest.
    hintFlags can instruct DCGM to consider GPU health or not. By default, unhealthy GPUs are excluded from
    consideration.
    '''

    def SelectGpusByTopology(self, inputGpuIds, numGpus, hintFlags):
        return dcgm_agent.dcgmSelectGpusByTopology(self._dcgmHandle.handle,
                                                   inputGpuIds, numGpus,
                                                   hintFlags)


class DcgmSystemIntrospect:
    '''
    Class to access the system-wide introspection modules of DCGM
    '''

    def __init__(self, dcgmHandle):
        self._handle = dcgmHandle
        self.memory = DcgmSystemIntrospectMemory(dcgmHandle)
        self.cpuUtil = DcgmSystemIntrospectCpuUtil(dcgmHandle)

    def UpdateAll(self, waitForUpdate=True):
        dcgm_agent.dcgmIntrospectUpdateAll(self._handle.handle, waitForUpdate)


class DcgmSystemIntrospectMemory:
    '''
    Class to access information about the memory usage of DCGM itself
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Retrieve the total amount of virtual memory that the hostengine process is currently using.
        This measurement represents both the resident set size (what is currently in RAM) and
        the swapped memory that belongs to the process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectMemory_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False
        '''
        return dcgm_agent.dcgmIntrospectGetHostengineMemoryUsage(
            self._dcgmHandle.handle, waitIfNoData)


class DcgmSystemIntrospectCpuUtil:
    '''
    Class to access information about the CPU Utilization of DCGM
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Get the current CPU Utilization of the hostengine process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectCpuUtil_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False
        '''
        return dcgm_agent.dcgmIntrospectGetHostengineCpuUtilization(
            self._dcgmHandle.handle, waitIfNoData)


'''
Class to encapsulate DCGM field-metadata requests
'''


class DcgmSystemFields:

    def GetFieldById(self, fieldId):
        '''
        Get a field's metadata by its dcgm_fields.DCGM_FI_* field ID

        fieldId: dcgm_fields.DCGM_FI_* field ID of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        return dcgm_fields.DcgmFieldGetById(fieldId)

    def GetFieldByTag(self, tag):
        '''
        Get a field's metadata by its tag name. Ex: 'brand'

        tag: Tag name of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        return dcgm_fields.DcgmFieldGetByTag(tag)


'''
Class to encapsulate DCGM module management and introspection
'''


class DcgmSystemModules:
    '''
    Constructor
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    '''
    Denylist a module from being loaded by DCGM.

    moduleId a dcgm_structs.dcgmModuleId* ID of the module to denylist

    Returns: Nothing.
    Raises a DCGM_ST_IN_USE exception if the module was already loaded
    '''

    def Denylist(self, moduleId):
        dcgm_agent.dcgmModuleDenylist(self._dcgmHandle.handle, moduleId)

    '''
    Get the statuses of all of the modules in DCGM

    Returns: a dcgm_structs.c_dcgmModuleGetStatuses_v1 structure.
    '''

    def GetStatuses(self):
        return dcgm_agent.dcgmModuleGetStatuses(self._dcgmHandle.handle)


'''
Class to encapsulate DCGM profiling
'''


class DcgmSystemProfiling:
    '''
    Constructor
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    '''
    Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields
    from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute.
    Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012.

    Call this API before you launch one of those tools and Resume() after the tool has completed.

    DCGM will save BLANK values while profiling is paused.
    Calling this while profiling activities are already paused is fine and will be treated as a no-op.
    '''

    def Pause(self):
        return dcgm_agent.dcgmProfPause(self._dcgmHandle.handle)

    '''
    Resume profiling activities in DCGM that were previously paused with Pause().

    Call this API after you have completed running other NVIDIA developer tools to reenable DCGM
    profiling metrics.

    DCGM will save BLANK values while profiling is paused.

    Calling this while profiling activities have already been resumed is fine and will be treated as a no-op.
    '''

    def Resume(self):
        return dcgm_agent.dcgmProfResume(self._dcgmHandle.handle)


'''
Class to encapsulate global DCGM methods. These apply to a single DcgmHandle, provided to the constructor
'''


class DcgmSystem:
    '''
    Constructor

    dcgmHandle is a pydcgm.DcgmHandle instance of the connection that will be used by all methods of this class
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

        #Child classes
        self.discovery = DcgmSystemDiscovery(self._dcgmHandle)
        self.introspect = DcgmSystemIntrospect(self._dcgmHandle)
        self.fields = DcgmSystemFields()
        self.modules = DcgmSystemModules(self._dcgmHandle)
        self.profiling = DcgmSystemProfiling(self._dcgmHandle)

    '''
    Request that the host engine perform a field value update cycle. If the host
    engine was started in DCGM_OPERATION_MODE_MANUAL, calling this method is
    the only way that field values will be updated.

    Note that performing a field value update cycle does not update every field.
    It only updates fields that are newly watched or fields that haven't updated
    in enough time to warrant updating again, based on their update frequency.

    waitForUpdate specifies whether this function call should block until the
    field value update loop is complete or not. Use True if you intend to query
    values immediately after calling this.
    '''

    def UpdateAllFields(self, waitForUpdate):
        ret = dcgm_agent.dcgmUpdateAllFields(self._dcgmHandle.handle,
                                             waitForUpdate)
        #Throw an exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    '''
    Get a DcgmGroup instance for the default all-GPUs group. This object is used to
    perform operations on a group of GPUs. See DcgmGroup.py for details.

    AddGpu() and RemoveGpu() operations are not allowed on the default group
    '''

    def GetDefaultGroup(self):
        return pydcgm.DcgmGroup(self._dcgmHandle,
                                groupId=dcgm_structs.DCGM_GROUP_ALL_GPUS)

    '''
    Get an instance of DcgmGroup with no GPUs. Call AddGpu() on the returned
    object with GPU IDs from GetAllGpuIds() before performing actions on
    the returned DcgmGroup instance.

    groupName is the name of the group to create in the host engine. This name must be
    unique.

    Note: The group will be deleted from the host engine when the returned object goes out of scope
    '''

    def GetEmptyGroup(self, groupName):
        return pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)

    '''
    Get an instance of DcgmGroup populated with the gpuIds provided

    groupName is the name of the group to create in the host engine. This name must be
    unique.
    gpuIds is the list of GPU IDs to add to the group

    Note: The group will be deleted from the host engine when the returned object goes out of scope
    '''

    def GetGroupWithGpuIds(self, groupName, gpuIds):
        newGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for gpuId in gpuIds:
            newGroup.AddGpu(gpuId)
        return newGroup

    '''
    Get an instance of DcgmGroup populated with the provided entities

    groupName is the name of the group to create in the host engine. This name must be
    unique.
    entities is the list of entity pairs (type and id) to add to the group

    Note: The group will be deleted from the host engine when the returned object goes out of scope
    '''

    def GetGroupWithEntities(self, groupName, entities):
        group = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for entity in entities:
            group.AddEntity(entity.entityGroupId, entity.entityId)

        return group

    '''
    Get ids of all DcgmGroups of GPUs. This returns a list containing the ids of the DcgmGroups.
    '''

    def GetAllGroupIds(self):
        return dcgm_agent.dcgmGroupGetAllIds(self._dcgmHandle.handle)

    '''
    Get all of the field groups in the system
    '''

    def GetAllFieldGroups(self):
        return dcgm_agent.dcgmFieldGroupGetAll(self._dcgmHandle.handle)

    '''
    Get a field group's id by its name.

    Returns: Field group ID if found
             None if not found
    '''

    def GetFieldGroupIdByName(self, name):
        allGroups = self.GetAllFieldGroups()
        for i in range(0, allGroups.numFieldGroups):
            if allGroups.fieldGroups[i].fieldGroupName == name:
                return ctypes.c_void_p(allGroups.fieldGroups[i].fieldGroupId)

        return None

    def PauseTelemetryForDiag(self):
        """Pause DCGM modules from updating field values."""
        import dcgm_agent_internal
        dcgm_agent_internal.dcgmPauseTelemetryForDiag(self._dcgmHandle.handle)

    def ResumeTelemetryForDiag(self):
        """Resume previously paused DCGM modules so that they can update field values."""
        import dcgm_agent_internal
        dcgm_agent_internal.dcgmResumeTelemetryForDiag(self._dcgmHandle.handle)
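For orientation, here is a minimal usage sketch of the DcgmSystem wrapper above. It assumes DCGM is installed and an nv-hostengine is reachable on localhost, and that the vendored pydcgm.DcgmHandle keeps its usual ipAddress keyword; the group name is illustrative and not part of the package.

import model_analyzer.monitor.dcgm.pydcgm as pydcgm
from model_analyzer.monitor.dcgm.DcgmSystem import DcgmSystem

# Connect to a running nv-hostengine (hostname is an assumption for this sketch).
handle = pydcgm.DcgmHandle(ipAddress="localhost")
system = DcgmSystem(handle)

# Ask the host engine for a field-value update cycle, then enumerate supported GPUs.
system.UpdateAllFields(waitForUpdate=True)
gpu_ids = system.discovery.GetAllSupportedGpuIds()

# Build a named group from the discovered GPUs (the name is illustrative).
group = system.GetGroupWithGpuIds("example_group", gpu_ids)

# Pause DCGM profiling metrics around an external profiler run, then resume.
system.profiling.Pause()
# ... run nsight systems / nsight compute here ...
system.profiling.Resume()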
model_analyzer/monitor/dcgm/__init__.py
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
model_analyzer/monitor/dcgm/common/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py
@@ -0,0 +1,194 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from os import environ
import argparse
import logging
import sys


###############################################################################
def create_parser(
        publish_port=8000,
        interval=10,
        name='the monitoring tool',  # Replace with 'prometheus', 'telegraf', etc.
        field_ids=None,
        log_file=None,
        log_level='INFO',
        dcgm_hostname=environ.get('DCGM_HOSTNAME') or 'localhost',
):
    '''
    Create a parser that defaults to sane parameters.

    The default parameters can be overridden through keyword arguments.

    Note: if DCGM_HOSTNAME is set as an environment variable, it is used as
    the default instead of localhost
    '''

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p',
        '--publish-port',
        dest='publish_port',
        type=int,
        default=publish_port,
        help='TCP port that the client should publish to. Default={}.'.format(
            publish_port))
    parser.add_argument(
        '-i',
        '--interval',
        dest='interval',
        type=int,
        default=interval,
        help=
        'How often the client should retrieve new values from DCGM in seconds. Default={}.'
        .format(interval))
    parser.add_argument(
        '-f',
        '--field-ids',
        dest='field_ids',
        type=str,
        default=field_ids,
        help=
        'Comma-separated list of field IDs that should be retrieved from DCGM. '
        +
        'The full list of available field IDs can be obtained from dcgm_fields.h, dcgm_fields.py, '
        + 'or running \'dcgmi dmon -l\'.')
    parser.add_argument(
        '--log-file',
        dest='logfile',
        type=str,
        default=log_file,
        help=
        'A path to a log file for recording what information is being sent to {}'
        .format(name))
    parser.add_argument(
        '--log-level',
        dest='loglevel',
        type=str,
        default=log_level,
        help=
        'Specify a log level to use for logging.\n\tCRITICAL (0) - log only critical errors that drastically affect execution'
        +
        '\n\tERROR (1) - Log any error in execution\n\tWARNING (2) - Log all warnings and errors that occur'
        +
        '\n\tINFO (3) - Log informational messages about program execution in addition to warnings and errors'
        +
        '\n\tDEBUG (4) - Log debugging information in addition to all information about execution'
        + '\nDefault: {}'.format(log_level))

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '-n',
        '--hostname',
        dest='hostname',
        type=str,
        default=dcgm_hostname,
        help=
        'IP/hostname where the client should query DCGM for values. Default={} (all interfaces).'
        .format(dcgm_hostname))
    group.add_argument(
        '-e',
        '--embedded',
        dest='embedded',
        action='store_true',
        help=
        'Launch DCGM from within this process instead of connecting to nv-hostengine.'
    )

    return parser


def add_custom_argument(parser, *args, **kwargs):
    parser.add_argument(*args, **kwargs)


###############################################################################
def add_target_host_argument(name, parser, default_target='localhost'):
    parser.add_argument(
        '-t',
        '--publish-hostname',
        dest='publish_hostname',
        type=str,
        default=default_target,
        help='The hostname at which the client will publish the readings to {}'.
        format(name))


###############################################################################
def run_parser(parser):
    '''
    Run a parser created using create_parser
    '''
    return parser.parse_args()


###############################################################################
def get_field_ids(args):
    # This indicates the user supplied a string, so we should override the
    # default
    if isinstance(args.field_ids, str):
        tokens = args.field_ids.split(",")
        field_ids = [int(token) for token in tokens]
        return field_ids
    # The default object should already be an array of ints. Just return it
    else:
        return args.field_ids


###############################################################################
def get_log_level(args):
    levelStr = args.loglevel.upper()
    if levelStr == '0' or levelStr == 'CRITICAL':
        numeric_log_level = logging.CRITICAL
    elif levelStr == '1' or levelStr == 'ERROR':
        numeric_log_level = logging.ERROR
    elif levelStr == '2' or levelStr == 'WARNING':
        numeric_log_level = logging.WARNING
    elif levelStr == '3' or levelStr == 'INFO':
        numeric_log_level = logging.INFO
    elif levelStr == '4' or levelStr == 'DEBUG':
        numeric_log_level = logging.DEBUG
    else:
        print("Could not understand the specified --log-level '%s'" %
              (args.loglevel))
        args.print_help()
        sys.exit(2)
    return numeric_log_level


###############################################################################
def parse_command_line(name, default_port, add_target_host=False):
    # Fields we accept raw from the CLI
    FIELDS_AS_IS = ['publish_port', 'interval', 'logfile', 'publish_hostname']

    parser = create_parser(
        name=name,
        publish_port=default_port,
    )

    if add_target_host:
        add_target_host_argument(name, parser)

    args = run_parser(parser)
    field_ids = get_field_ids(args)
    log_level = get_log_level(args)

    args_as_dict = vars(args)
    settings = {i: args_as_dict[i] for i in FIELDS_AS_IS}
    settings['dcgm_hostname'] = None if args.embedded else args.hostname
    settings['field_ids'] = field_ids
    settings['log_level'] = log_level

    return settings
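As a usage sketch, a DCGM client script might drive this parser as follows; the client name and port below are illustrative choices, not values taken from the package.

from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli_parser

if __name__ == "__main__":
    # add_target_host=True also registers -t/--publish-hostname, which
    # parse_command_line expects when it copies 'publish_hostname' into settings.
    settings = cli_parser.parse_command_line("prometheus",
                                             default_port=8000,
                                             add_target_host=True)
    # Resulting keys: publish_port, interval, logfile, publish_hostname,
    # dcgm_hostname (None when --embedded is used), field_ids, log_level.
    print(settings)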