triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
##
|
|
15
|
+
# Python bindings for the internal API of DCGM library (dcgm_fields_internal.hpp)
|
|
16
|
+
##
|
|
17
|
+
|
|
18
|
+
from ctypes import *
|
|
19
|
+
from ctypes.util import find_library
|
|
20
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
21
|
+
|
|
22
|
+
# Provides access to functions
|
|
23
|
+
dcgmFP = dcgm_structs._dcgmGetFunctionPointer
|
|
24
|
+
|
|
25
|
+
#internal-only fields
|
|
26
|
+
DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 #Memory utilization samples
|
|
27
|
+
DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 #SM utilization samples
|
|
28
|
+
DCGM_FI_DEV_GRAPHICS_PIDS = 220 #Graphics processes running on the GPU.
|
|
29
|
+
DCGM_FI_DEV_COMPUTE_PIDS = 221 #Compute processes running on the GPU.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from model_analyzer.monitor.dcgm.common.dcgm_client_main import main
|
|
15
|
+
from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader
|
|
16
|
+
from socket import socket, AF_INET, SOCK_DGRAM
|
|
17
|
+
|
|
18
|
+
# Displayed to the user
|
|
19
|
+
FLUENTD_NAME = 'Fluentd'
|
|
20
|
+
DEFAULT_FLUENTD_PORT = 24225
|
|
21
|
+
|
|
22
|
+
# Fluentd Configuration
|
|
23
|
+
# =====================
|
|
24
|
+
# In order to use this client, Fluentd needs to accept json over udp.
|
|
25
|
+
# The default port is 24225
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DcgmFluentd(DcgmJsonReader):
    """Forward JSON documents to a Fluentd listener over UDP.

    All reader behaviour is inherited from ``DcgmJsonReader``; this subclass
    only owns the datagram socket and the destination address.
    """

    ###########################################################################
    def __init__(self, publish_hostname, publish_port, **kwargs):
        """Create the UDP socket and remember the destination.

        publish_hostname -- host the datagrams are sent to
        publish_port     -- UDP port the datagrams are sent to
        kwargs           -- passed through unchanged to ``DcgmJsonReader``
        """
        self.m_sock = socket(AF_INET, SOCK_DGRAM)
        self.m_dest = (publish_hostname, publish_port)
        super(DcgmFluentd, self).__init__(**kwargs)

    ###########################################################################
    def SendToFluentd(self, payload):
        """Send ``payload`` to the configured destination as one datagram.

        ``socket.sendto`` only accepts bytes-like objects on Python 3, so a
        text payload is UTF-8 encoded first; bytes are sent unchanged.
        """
        if isinstance(payload, str):
            payload = payload.encode('utf-8')
        self.m_sock.sendto(payload, self.m_dest)

    ###########################################################################
    def CustomJsonHandler(self, outJson):
        """Publish one JSON document to Fluentd.

        NOTE(review): presumably called by DcgmJsonReader for every document
        it produces - confirm against the base class.
        """
        self.SendToFluentd(outJson)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
if __name__ == '__main__': # pragma: no cover
|
|
45
|
+
main(DcgmFluentd, FLUENTD_NAME, DEFAULT_FLUENTD_PORT, add_target_host=True)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
|
|
18
|
+
import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers
|
|
19
|
+
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
|
|
20
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as structs
|
|
21
|
+
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
|
|
22
|
+
from model_analyzer.monitor.monitor import Monitor
|
|
23
|
+
from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
|
|
24
|
+
from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
|
|
25
|
+
from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
|
|
26
|
+
from model_analyzer.record.types.gpu_utilization import GPUUtilization
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DCGMMonitor(Monitor):
    """
    Monitor that samples GPU metrics through an embedded DCGM instance
    """

    # Record type -> DCGM field id that is watched to produce that record
    model_analyzer_to_dcgm_field = {
        GPUUsedMemory: dcgm_fields.DCGM_FI_DEV_FB_USED,
        GPUFreeMemory: dcgm_fields.DCGM_FI_DEV_FB_FREE,
        GPUUtilization: dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
        GPUPowerUsage: dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    }

    def __init__(self, gpus, frequency, metrics, dcgmPath=None):
        """
        Parameters
        ----------
        gpus : list of GPUDevice
            Devices whose metrics are collected
        frequency : int
            Sampling frequency, forwarded to the Monitor base class
        metrics : list
            Record types to collect; each must be a key of
            model_analyzer_to_dcgm_field
        dcgmPath : str (optional)
            Location of the DCGM installation, handed to _dcgmInit

        Raises
        ------
        TritonModelAnalyzerException
            If a requested metric has no DCGM field mapping
        """

        super().__init__(frequency, metrics)
        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        self._gpus = gpus

        # Embedded mode: DCGM runs inside this process via the shared library
        handle = dcgm_agent.dcgmStartEmbedded(structs.DCGM_OPERATION_MODE_MANUAL)
        self.dcgm_handle = handle

        # Start from an empty group, then register each requested GPU
        self.group_id = dcgm_agent.dcgmGroupCreate(
            handle, structs.DCGM_GROUP_EMPTY, "triton-monitor"
        )
        for device in self._gpus:
            dcgm_agent.dcgmGroupAddDevice(handle, self.group_id, device.device_id())

        # NOTE(review): self._frequency is presumably stored by Monitor;
        # the x1000 unit conversion should be confirmed against the watcher
        update_frequency = int(self._frequency * 1000)

        field_ids = []
        for metric in metrics:
            try:
                field_ids.append(self.model_analyzer_to_dcgm_field[metric])
            except KeyError:
                # Tear DCGM down before reporting the unsupported metric
                dcgm_agent.dcgmShutdown()
                raise TritonModelAnalyzerException(
                    f"{metric} is not supported by Model Analyzer DCGM Monitor"
                )

        self.dcgm_field_group_id = dcgm_agent.dcgmFieldGroupCreate(
            handle, field_ids, "triton-monitor"
        )

        # NOTE(review): the trailing 3600, 0, 0 are presumably keep-age,
        # keep-samples and start timestamp - confirm in dcgm_field_helpers
        self.group_watcher = dcgm_field_helpers.DcgmFieldGroupWatcher(
            handle,
            self.group_id,
            self.dcgm_field_group_id.value,
            structs.DCGM_OPERATION_MODE_MANUAL,
            update_frequency,
            3600,
            0,
            0,
        )

    def is_monitoring_connected(self) -> bool:
        """
        Always reports the monitor as connected
        """
        return True

    def _monitoring_iteration(self):
        """
        Ask the group watcher to fetch more values
        """
        self.group_watcher.GetMore()

    def _collect_records(self):
        """
        Turn every non-None watched value into a record of the
        corresponding metric type

        Returns
        -------
        list
            One record per measurement, ordered by GPU, then by metric
            type, then by measurement
        """
        collected = []
        for device in self._gpus:
            per_field = self.group_watcher.values[device.device_id()]

            # Skip GPUs whose value mapping is still empty
            if len(list(per_field)) == 0:
                continue

            for record_type in self._metrics:
                field_id = self.model_analyzer_to_dcgm_field[record_type]
                # Original note: DCGM timestamp is in nanoseconds
                # (TODO confirm the unit against dcgm_structs)
                collected += [
                    record_type(
                        value=float(sample.value),
                        device_uuid=device.device_uuid(),
                        timestamp=sample.ts,
                    )
                    for sample in per_field[field_id].values
                    if sample.value is not None
                ]

        return collected

    def destroy(self):
        """
        Shut DCGM down and then destroy the base Monitor. Must be
        called so that the resources are deallocated.
        """

        dcgm_agent.dcgmShutdown()
        super().destroy()
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
|
|
15
|
+
import time
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
import signal
|
|
21
|
+
|
|
22
|
+
dir_path = os.path.dirname(os.path.realpath(__file__))
|
|
23
|
+
parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
|
|
24
|
+
sys.path.insert(0, parent_dir_path)
|
|
25
|
+
|
|
26
|
+
from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
|
|
27
|
+
from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli
|
|
28
|
+
|
|
29
|
+
# Pick the Prometheus API implementation. When DCGM_TESTING_FRAMEWORK is set
# in the environment the test stand-in is used instead of the real client.
# Either way a missing module is fatal: log the reason and exit with status 3.
if 'DCGM_TESTING_FRAMEWORK' in os.environ:
    try:
        from prometheus_tester_api import start_http_server, Gauge
    except ImportError:
        logging.critical(
            "prometheus_tester_api missing, reinstall test framework.")
        sys.exit(3)
else:
    try:
        from prometheus_client import start_http_server, Gauge
    except ImportError:
        logging.critical(
            "prometheus_client not installed, please run: \"pip install prometheus_client\""
        )
        sys.exit(3)
|
|
45
|
+
|
|
46
|
+
DEFAULT_FIELDS = [
|
|
47
|
+
dcgm_fields.DCGM_FI_DEV_PCI_BUSID, #Needed for plugin_instance
|
|
48
|
+
dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
|
|
49
|
+
dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
|
|
50
|
+
dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
|
|
51
|
+
dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
|
|
52
|
+
dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
|
|
53
|
+
dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
|
|
54
|
+
dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
|
|
55
|
+
dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
|
|
56
|
+
dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
|
|
57
|
+
dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
|
|
58
|
+
dcgm_fields.DCGM_FI_DEV_FB_FREE,
|
|
59
|
+
dcgm_fields.DCGM_FI_DEV_FB_USED,
|
|
60
|
+
dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
|
|
61
|
+
dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
|
|
62
|
+
dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
|
|
63
|
+
dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
|
|
64
|
+
dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
|
|
65
|
+
dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
|
|
66
|
+
dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
|
|
67
|
+
dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
|
|
68
|
+
dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
|
|
69
|
+
dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class DcgmPrometheus(DcgmReader):
    """Publish DCGM field values as Prometheus gauges.

    Configuration is read from the module-level ``g_settings`` dict, which
    must be populated before an instance is created.

    NOTE(review): m_publishFields, m_dcgmIgnoreFields, m_fieldIdToInfo,
    m_gpuIdToUUId and m_gpuIdToBusId are not assigned here; presumably they
    are maintained by DcgmReader - confirm.
    """

    ###########################################################################
    def __init__(self):
        """Configure the underlying DcgmReader from ``g_settings``."""
        #Have DCGM update its watches twice as fast as our update interval so we don't get out of phase by our update interval
        updateIntervalUsec = int(
            (1000000 * g_settings['prometheusPublishInterval']) / 2)
        #Add our PID to our field group name so we can have multiple instances running
        fieldGroupName = 'dcgm_prometheus_' + str(os.getpid())

        DcgmReader.__init__(self,
                            ignoreList=g_settings['ignoreList'],
                            fieldIds=g_settings['publishFieldIds'],
                            updateFrequency=updateIntervalUsec,
                            fieldGroupName=fieldGroupName,
                            hostname=g_settings['dcgmHostName'])
        # fieldId -> Gauge, filled on first use by SetupGauges()
        self.m_existingGauge = {}

    ###########################################################################
    def _IterPublishedFieldIds(self):
        """Yield each field id in m_publishFields that is not ignored.

        Entries of m_publishFields whose value is None are skipped.
        """
        for fieldIds in self.m_publishFields.values():
            if fieldIds is None:
                continue

            for fieldId in fieldIds:
                if fieldId not in self.m_dcgmIgnoreFields:
                    yield fieldId

    ###########################################################################
    def CustomDataHandler(self, fvs):
        """Convert the latest value of every published field to a gauge sample.

        Per the original note this implements a hook of the base class
        DcgmReader.

        @params:
        fvs : dictionary keyed by gpuId, then by fieldId, holding a sequence
              of values; only the last value of each sequence is published.
        """
        if not self.m_existingGauge:
            self.SetupGauges()

        for fieldId in self._IterPublishedFieldIds():
            g = self.m_existingGauge[fieldId]

            # Iterate over a snapshot, as the original did via list(keys())
            for gpuId, gpuFv in list(fvs.items()):
                val = gpuFv[fieldId][-1]

                #Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId
                if val.isBlank:
                    continue

                gpuUuid = self.m_gpuIdToUUId[gpuId]
                gpuBusId = self.m_gpuIdToBusId[gpuId]
                gpuUniqueId = gpuUuid if g_settings['sendUuid'] else gpuBusId

                # pylint doesn't find the labels member for Gauge, but it exists. Ignore the warning
                g.labels(gpuId, gpuUniqueId).set(val.value)  # pylint: disable=no-member

                # Arguments are passed separately so the message is only
                # formatted when debug logging is enabled
                logging.debug('Sent GPU %d %s %s = %s', gpuId, gpuUniqueId,
                              self.m_fieldIdToInfo[fieldId].tag,
                              str(val.value))

    ###############################################################################
    def SetupGauges(self):
        """Create one Gauge per published field and store it by field id.

        NOTE: even though some fields are monotonically increasing and
        therefore fit the mold to be counters, all are published as gauges so
        that DCGM is the sole authority on the state of the system, preventing
        problems around down times, driver reboots, and the unlikely event of
        flashing the inforom.
        For specific information about which fields monotonically increase,
        see the API guide or dcgm_fields.h
        """
        for fieldId in self._IterPublishedFieldIds():
            # Second label is named after whichever id the user asked for
            uniqueIdName = 'GpuUuid' if g_settings['sendUuid'] else 'GpuBusID'

            fieldTag = self.m_fieldIdToInfo[fieldId].tag
            self.m_existingGauge[fieldId] = Gauge("dcgm_" + fieldTag,
                                                  'DCGM_PROMETHEUS',
                                                  ['GpuID', uniqueIdName])

    ###############################################################################
    def Scrape(self, data=None):
        """Scrape the field value data and publish it.

        Delegates to ``Process`` (not defined in this class) and returns its
        result; ``data`` is accepted but not used.
        """
        return self.Process()

    ###############################################################################
    def LogBasicInformation(self):
        """Reconnect, then log the start-up banner and the published tags."""
        # Reconnect causes everything to get initialized
        self.Reconnect()

        logging.info('Started prometheus client')

        fieldTagList = ", ".join(self.m_fieldIdToInfo[fieldId].tag
                                 for fieldId in self._IterPublishedFieldIds())

        logging.info("Publishing fields: '%s'", fieldTagList)

    ###############################################################################
    def LogError(self, msg):
        """Log ``msg`` at error level through the root logger."""
        logging.error(msg)

    ###############################################################################
    def LogInfo(self, msg):
        """Log ``msg`` at info level through the root logger."""
        logging.info(msg)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
###############################################################################
|
|
205
|
+
def exit_handler(signum, frame):
    """Signal handler that flags the publishing loop to stop.

    Both arguments are required by the ``signal`` handler signature and are
    not used; the only effect is setting ``g_settings['shouldExit']``.
    """
    g_settings.update(shouldExit=True)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
###############################################################################
|
|
210
|
+
def main_loop(prometheus_obj, publish_interval):
    """Scrape and publish repeatedly until a stop is requested.

    prometheus_obj   -- object providing ``Scrape`` and ``LogInfo``
    publish_interval -- seconds to sleep between two scrapes

    At least one scrape is always performed. The loop ends once
    ``g_settings['shouldExit']`` is true, or on CTRL-C.
    """
    try:
        while True:
            prometheus_obj.Scrape(prometheus_obj)

            # No point sleeping a full interval if the stop request already
            # arrived while scraping
            if not g_settings['shouldExit']:
                time.sleep(publish_interval)

            if g_settings['shouldExit']:
                prometheus_obj.LogInfo('Received a signal...shutting down')
                break
    except KeyboardInterrupt:
        print("Caught CTRL-C. Exiting")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
###############################################################################
|
|
224
|
+
def initialize_globals():
    """Create the module-level ``g_settings`` dict with its default values.

    Any previous ``g_settings`` is replaced. Settings that the CLI parser
    fills in later are pre-seeded so that every key read in this module
    exists.
    """
    global g_settings
    g_settings = {}

    g_settings['shouldExit'] = False

    # List of the ids that are present in g_settings['publishFieldIds'] but
    # ignored for watch.
    g_settings['ignoreList'] = [
        dcgm_fields.DCGM_FI_DEV_PCI_BUSID,
    ]

    # Those are initialized by the CLI parser. We only list them here for
    # clarity.
    for key in [
            'dcgmHostName',
            'prometheusPort',
            'prometheusPublishInterval',
            'publishFieldIds',
    ]:
        g_settings[key] = None

    # Also set by the CLI parser; seeded with the --send-uuid default so a
    # lookup before parsing does not raise KeyError.
    g_settings['sendUuid'] = False
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
###############################################################################
|
|
251
|
+
def parse_command_line():
    """Parse the command line into ``g_settings`` and configure logging.

    Builds the shared client parser with this module's DEFAULT_FIELDS, adds
    the ``--send-uuid`` flag, stores the parsed values in ``g_settings`` and
    sets up the root logger to write to the requested log file, or to stdout
    when no log file was given.
    """
    parser = cli.create_parser(
        name='Prometheus',
        field_ids=DEFAULT_FIELDS,
    )

    cli.add_custom_argument(parser,
                            '--send-uuid',
                            dest='send_uuid',
                            default=False,
                            action='store_true',
                            help='Send GPU UUID instead of bus id')

    args = cli.run_parser(parser)
    field_ids = cli.get_field_ids(args)
    numeric_log_level = cli.get_log_level(args)

    # Defaults to localhost, so we need to set it to None
    g_settings['dcgmHostName'] = None if args.embedded else args.hostname

    g_settings['prometheusPort'] = args.publish_port
    g_settings['prometheusPublishInterval'] = args.interval
    g_settings['publishFieldIds'] = field_ids
    g_settings['sendUuid'] = args.send_uuid

    log_format = '%(asctime)s %(levelname)s: %(message)s'
    logfile = args.logfile

    if logfile is not None:
        logging.basicConfig(level=numeric_log_level,
                            filename=logfile,
                            filemode='w+',
                            format=log_format)
    else:
        # filemode only applies when a filename is given, so it is omitted
        logging.basicConfig(level=numeric_log_level,
                            stream=sys.stdout,
                            format=log_format)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
###############################################################################
|
|
297
|
+
def initialize_signal_handlers():
    """Route SIGINT and SIGTERM to ``exit_handler``."""
    for stop_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(stop_signal, exit_handler)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
###############################################################################
|
|
303
|
+
def main():
    """Entry point: configure, start the HTTP endpoint and publish.

    Initializes the settings, signal handlers and command line, creates the
    DcgmPrometheus reader, starts the Prometheus HTTP server on the configured
    port and runs the publishing loop until a stop is requested.
    """
    initialize_globals()

    initialize_signal_handlers()

    parse_command_line()

    prometheus_obj = DcgmPrometheus()

    try:
        logging.info("Starting Prometheus server on port " +
                     str(g_settings['prometheusPort']))

        #start prometheus client server.
        start_http_server(g_settings['prometheusPort'])

        prometheus_obj.LogBasicInformation()

        main_loop(prometheus_obj, g_settings['prometheusPublishInterval'])
    finally:
        # Run the reader's Shutdown() even when start-up or the loop raised
        prometheus_obj.Shutdown()
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
if __name__ == '__main__':
|
|
326
|
+
main()
|