triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import sys
|
|
15
|
+
import subprocess
|
|
16
|
+
import signal
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
dir_path = os.path.dirname(os.path.realpath(__file__))
|
|
22
|
+
parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
|
|
23
|
+
sys.path.insert(0, parent_dir_path)
|
|
24
|
+
|
|
25
|
+
import model_analyzer.monitor.dcgm.dcgm_fields_collectd as dcgm_fields_collectd
|
|
26
|
+
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
|
|
27
|
+
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
|
|
28
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
29
|
+
import threading
|
|
30
|
+
from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
|
|
31
|
+
|
|
32
|
+
# When the DCGM testing framework is active (signalled by the
# DCGM_TESTING_FRAMEWORK environment variable), prefer the collectd test
# API; otherwise use the real collectd module.
if 'DCGM_TESTING_FRAMEWORK' in os.environ:
    try:
        import collectd_tester_api as collectd
    except ImportError:
        # Fall back only when the tester module is missing. A bare
        # "except:" would also hide KeyboardInterrupt/SystemExit and any
        # genuine error raised while importing the tester module.
        import collectd
else:
    import collectd
|
|
39
|
+
|
|
40
|
+
# Hostname and library path: each defaults to a hard-coded value and can be
# overridden through the environment (DCGM_HOSTNAME / DCGMLIBPATH).
g_dcgmLibPath = os.environ.get('DCGMLIBPATH', '/usr/lib')
g_dcgmHostName = os.environ.get('DCGM_HOSTNAME', 'localhost')

# Number of microseconds in one second; used to convert between the
# second-based collectd intervals and microsecond-based DCGM values.
c_ONE_SEC_IN_USEC = 1000000

# Default collectd sampling interval in seconds (overridden by the
# "Interval" configuration key).
g_intervalSec = 10
|
|
54
|
+
|
|
55
|
+
# Fields that are collected but never published as collectd values.
g_dcgmIgnoreFields = [dcgm_fields.DCGM_FI_DEV_UUID]

# Default set of DCGM fields handed to DcgmReader. The order is preserved
# exactly as originally declared.
g_publishFieldIds = [
    # Identification (needed for the collectd plugin instance).
    dcgm_fields.DCGM_FI_DEV_UUID,
    # Power, temperature, clocks and utilization.
    dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
    # Retired pages and ECC error counters.
    dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
    dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
    dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
    # Frame buffer.
    dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_FREE,
    dcgm_fields.DCGM_FI_DEV_FB_USED,
    # PCIe replay, violations and XID errors.
    dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
    dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    # NVLink error counters.
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
    # Memory clock/temperature, energy and copy utilization.
    dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
    dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
    dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
    # NVLink recovery/bandwidth and PCIe throughput.
    dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
    dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
    dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT,
]
|
|
89
|
+
|
|
90
|
+
# Filled in by parse_config(): maps a DCGM sampling interval (in
# microseconds) to the list of fields to sample at that interval.
g_fieldIntervalMap = None

# Grammar of a "FieldIds" configuration value
# -------------------------------------------
# A field is an integral number or a textual name. field_regex matches
# either form as a plain string; it also accepts names that start with
# digits, and this over-generation of valid IDs is deliberately ignored here.
#
# A field list is a single field, or several fields separated by commas and
# enclosed in parentheses. A field list may optionally be followed by a
# colon and a floating point interval value, which requests a non-default
# sampling interval. Together these form a complete field list, and
# multiple complete field lists may appear, separated by commas.
#
# For example: (1001,tensor_active):5,1002:10
#
# This specifies that fields 1001 and tensor_active are to be sampled
# every 5 seconds, and field 1002 every ten seconds.
#
# For example: (1001,tensor_active):5,1002:
#
# This is the same, except that field 1002 is sampled at the default rate
# (the trailing colon is entirely unnecessary, but not illegal).

field_regex = r"[0-9a-zA-Z_]+"

# One field followed by an optional comma; group 2 is the bare field.
g_fieldRegEx = re.compile("(({}),?)".format(field_regex))

# A parenthesized, comma-separated list of fields.
fields_regex = r"\({0}(,{0})*\)".format(field_regex)

# Optional interval specification: an optional ':' followed by an optional
# floating point DCGM sampling interval, then an optional separating comma.
# When the interval is absent the default collectd sampling interval is used.
interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?"

# A single field or a field list, combined with the optional interval.
# Several of these may appear in succession. Group 2 is the field or field
# list, group 5 is the interval including its leading ':'.
g_parseRegEx = re.compile("(({field}|({fields})){interval})".format(
    field=field_regex, fields=fields_regex, interval=interval_regex))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class DcgmCollectdPlugin(DcgmReader):
    """DcgmReader specialization that hands DCGM samples to collectd.

    The reader is configured from this module's globals, and the logging
    hooks are routed to collectd's own logging functions.
    """

    ###########################################################################
    def __init__(self):
        """Configure the underlying DcgmReader from the module-level settings.

        The update frequency is the collectd interval (seconds) converted
        to microseconds.
        """
        collectd.debug(
            'Initializing DCGM with interval={}s'.format(g_intervalSec))
        super().__init__(fieldIds=g_publishFieldIds,
                         ignoreList=g_dcgmIgnoreFields,
                         fieldGroupName='collectd_plugin',
                         updateFrequency=g_intervalSec * c_ONE_SEC_IN_USEC,
                         fieldIntervalMap=g_fieldIntervalMap)

    ###########################################################################

    def CustomDataHandler(self, fvs):
        """Dispatch every usable sample in ``fvs`` to collectd as a gauge.

        ``fvs`` is accessed as ``fvs[gpuId][fieldId]``, which yields a
        sequence of samples exposing ``isBlank``, ``ts`` and ``value``.
        Samples that are too close in time to a newer kept sample are
        marked blank in place and are therefore not dispatched.
        """
        value = collectd.Values(type='gauge')  # pylint: disable=no-member
        value.plugin = 'dcgm_collectd'

        for gpuId in list(fvs.keys()):
            gpuFv = fvs[gpuId]

            # NOTE(review): m_gpuIdToUUId, m_dcgmIgnoreFields and
            # m_fieldIdToInfo are not defined in this file; presumably they
            # are maintained by DcgmReader -- confirm against the base class.
            uuid = self.m_gpuIdToUUId[gpuId]
            collectd.debug('CustomDataHandler uuid: ' + '%s' % (uuid) + '\n')
            value.plugin_instance = '%s' % (uuid)

            typeInstance = str(gpuId)

            for fieldId in list(gpuFv.keys()):
                # Fields on the ignore list are never published.
                if fieldId in self.m_dcgmIgnoreFields:
                    continue

                fieldTag = self.m_fieldIdToInfo[fieldId].tag
                samples = gpuFv[fieldId]

                # Pass 1, newest to oldest: blank out any sample that lies
                # less than 1.0 second before the most recently kept (newer)
                # sample. Starting from +inf guarantees that the latest
                # non-blank sample is always kept.
                newerSampleTime = float("inf")
                for sample in samples[::-1]:
                    # Already-blank samples are skipped; publishing them
                    # would need a per-field placeholder value.
                    if sample.isBlank:
                        continue

                    # Timestamp in seconds. "/" is true division, so the
                    # result keeps its fractional part (it is not rounded).
                    sampleTime = sample.ts / c_ONE_SEC_IN_USEC
                    if (newerSampleTime - sampleTime) < 1.0:
                        collectd.debug(
                            "DCGM sample for field ID %d too soon at %f, last one sampled at %f"
                            % (fieldId, sampleTime, newerSampleTime))
                        sample.isBlank = True  # Filter this one out
                        continue

                    newerSampleTime = sampleTime

                # Pass 2, oldest to newest: dispatch what survived pass 1.
                dispatched = 0
                for sample in samples:
                    if sample.isBlank:
                        continue

                    value.dispatch(type=fieldTag,
                                   type_instance=typeInstance,
                                   time=sample.ts / c_ONE_SEC_IN_USEC,
                                   values=[sample.value],
                                   plugin=value.plugin)

                    collectd.debug(
                        " gpuId %d, tag %s, sample %d, value %s, time %s" %
                        (gpuId, fieldTag, dispatched, str(sample.value),
                         str(sample.ts)))  # pylint: disable=no-member
                    dispatched += 1

    ###########################################################################
    def LogInfo(self, msg):
        """Forward an informational message to collectd's logger."""
        collectd.info(msg)  # pylint: disable=no-member

    ###########################################################################
    def LogError(self, msg):
        """Forward an error message to collectd's logger."""
        collectd.error(msg)  # pylint: disable=no-member
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
###############################################################################
|
|
228
|
+
##### Parse supplied collectd configuration object.
|
|
229
|
+
###############################################################################
|
|
230
|
+
def parse_config(config):
    """Apply a collectd configuration object to the module-level settings.

    Each node in ``config.children`` is inspected by its ``key``:

    * ``Interval`` -- ``values[0]`` becomes the default collectd sampling
      interval in seconds (``g_intervalSec``). If it appears more than
      once, the last occurrence wins.
    * ``FieldIds`` -- ``values[0]`` is a field specification string such as
      ``"(1001,tensor_active):5,1002:10,1010:"``. Every field is added to
      ``g_fieldIntervalMap`` under its sampling interval in microseconds.

    All ``Interval`` entries are resolved before any ``FieldIds`` entry is
    parsed, so fields without an explicit interval get the configured
    default regardless of the order of the entries in the configuration.

    Other keys are ignored. ``g_fieldIntervalMap`` is reset to an empty
    dict on every call.
    """
    global g_intervalSec
    global g_fieldIntervalMap

    g_fieldIntervalMap = {}

    # Pass 1: settle the default interval first so that a FieldIds line
    # placed before the Interval line does not pick up a stale default.
    for node in config.children:
        if node.key == 'Interval':
            g_intervalSec = float(node.values[0])

    default_interval = int(g_intervalSec * c_ONE_SEC_IN_USEC)

    # Pass 2: parse every field specification.
    for node in config.children:
        if node.key != 'FieldIds':
            continue

        for field_set in g_parseRegEx.finditer(node.values[0]):
            # Group 2 is a single field or a parenthesized field list;
            # group 5 is the optional interval, including its leading ':'.
            fields = field_set.group(2)
            interval_str = field_set.group(5)

            # No interval, or a bare ':', selects the default interval.
            if interval_str is None or interval_str == ":":
                interval = default_interval
            else:
                interval = int(float(interval_str[1:]) *
                               c_ONE_SEC_IN_USEC)  # strip the ':'

            # One list of fields is kept per unique interval.
            interval_fields = g_fieldIntervalMap.setdefault(interval, [])

            if fields.startswith("("):  # a true field set
                names = [
                    match.group(2)
                    for match in g_fieldRegEx.finditer(fields[1:-1])
                ]
            else:  # just one field
                names = [fields]

            # Map each field name or number through GetFieldByName and
            # record the result under this interval.
            for name in names:
                interval_fields.append(
                    dcgm_fields_collectd.GetFieldByName(name))
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
###############################################################################
|
|
286
|
+
##### Wrap the class methods for collectd callbacks
|
|
287
|
+
###############################################################################
|
|
288
|
+
def config_dcgm(config=None):
    """
    collectd configuration callback for the DCGM plugin.

    The collectd configuration for dcgm lives in a dcgm.conf file, usually
    installed in /etc/collectd/collectd.conf.d/dcgm.conf.

    An example is:

    LoadPlugin python
    <Plugin python>
        ModulePath "/usr/lib64/collectd/dcgm"
        LogTraces true
        Interactive false
        Import "dcgm_collectd_plugin"
        <Module dcgm_collectd_plugin>
            Interval 2
            FieldIds "(1001,tensor_active):5,1002:10,1004:.1,1010:"
            FieldIds "1007"
        </Module>
    </Plugin>

    ModulePath indicates where the plugin and supporting files are installed
    (generally copied from /usr/local/dcgm/bindings/python3).

    Interval is the default collectd sampling interval in seconds.

    FieldIds may appear several times. A field ID is given either by name or
    by number. A field ID list is either a single field ID or several of
    them, separated by commas (,) and bounded by parentheses ( ( and ) ).
    Each field ID list can be followed by an optional colon (:) and a
    floating point DCGM sampling interval. If no sampling interval is
    specified the default collectd sampling interval is used (and the colon
    is redundant but not illegal). Multiple field ID lists can appear on one
    FieldIds entry, separated by commas (,). FieldIds values are strings and
    must be enclosed in quotes ("). Multiple FieldIds lines are permitted.

    DCGM will sample the fields at the interval(s) indicated, and collectd
    will collect the samples asynchronously at the Interval specified.
    Because this is asynchronous, sometimes one sample fewer than expected
    will be collected and other times one more than expected.
    """

    # An exception raised here makes collectd stop loading the plugin, so
    # configuration errors are deliberately left to propagate.
    if config is not None:
        parse_config(config)

    # The read callback is registered here rather than with the other
    # callbacks because its interval is only known once the configuration
    # has been parsed.
    collectd.register_read(  # pylint: disable=no-member
        read_dcgm, interval=g_intervalSec)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
###############################################################################
|
|
339
|
+
def init_dcgm():
    """collectd init callback: create and initialize the plugin instance.

    The instance is stored in the module-level ``g_dcgmCollectd`` before
    ``Init()`` is called on it.
    """
    global g_dcgmCollectd

    # Restore the default SIGCHLD behavior to avoid exceptions with new
    # processes.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    plugin = DcgmCollectdPlugin()
    g_dcgmCollectd = plugin
    plugin.Init()
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
###############################################################################
|
|
350
|
+
def shutdown_dcgm():
    """collectd shutdown callback: shut down the plugin instance, if any.

    ``g_dcgmCollectd`` only exists once init_dcgm() has created it. If
    initialization never ran, or failed before the assignment, there is
    nothing to shut down and this is a no-op instead of a NameError.
    """
    plugin = globals().get('g_dcgmCollectd')
    if plugin is None:
        return
    plugin.Shutdown()
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
###############################################################################
|
|
355
|
+
def read_dcgm(data=None):
    """collectd read callback: run one processing pass of the plugin.

    ``data`` is accepted for the callback signature but is not used.
    """
    plugin = g_dcgmCollectd
    plugin.Process()
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def register_collectd_callbacks():
    """Register the config, init and shutdown callbacks with collectd.

    The read callback is not registered here: config_dcgm registers it,
    since it needs to parse the sampling interval first.
    """
    collectd.register_config(  # pylint: disable=no-member
        config_dcgm, name="dcgm_collectd_plugin")
    collectd.register_init(init_dcgm)  # pylint: disable=no-member
    collectd.register_shutdown(shutdown_dcgm)  # pylint: disable=no-member
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
###############################################################################
|
|
367
|
+
##### Main
|
|
368
|
+
###############################################################################
|
|
369
|
+
# Executed at import time, so the callbacks are registered as soon as this
# module is loaded.
register_collectd_callbacks()
|