triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,369 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import sys
15
+ import subprocess
16
+ import signal
17
+ import os
18
+ import re
19
+ import sys
20
+
21
# Make the directory that contains this module's directory importable, so
# the absolute package imports that follow can be resolved.
_this_file = os.path.realpath(__file__)
dir_path = os.path.split(_this_file)[0]
parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
sys.path.insert(0, parent_dir_path)
24
+
25
+ import model_analyzer.monitor.dcgm.dcgm_fields_collectd as dcgm_fields_collectd
26
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
27
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
28
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
29
+ import threading
30
+ from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
31
+
32
# When DCGM_TESTING_FRAMEWORK is set in the environment, prefer the testing
# framework's stand-in for the collectd module; otherwise (or if the stand-in
# cannot be imported) use the real collectd module.
if 'DCGM_TESTING_FRAMEWORK' in os.environ:
    try:
        import collectd_tester_api as collectd
    except Exception:
        # Fall back on any ordinary import failure. Unlike a bare ``except:``
        # this lets KeyboardInterrupt and SystemExit propagate.
        import collectd
else:
    import collectd
39
+
40
# Library path and hostname used for DCGM. Each value is taken from the
# environment when the corresponding variable is set (DCGMLIBPATH and
# DCGM_HOSTNAME respectively) and falls back to a hard-coded default otherwise.
g_dcgmLibPath = os.environ.get('DCGMLIBPATH', '/usr/lib')
g_dcgmHostName = os.environ.get('DCGM_HOSTNAME', 'localhost')
50
+
51
# Number of microseconds in one second; used to convert between intervals
# expressed in seconds and the microsecond values used with DCGM.
c_ONE_SEC_IN_USEC = 1000000

# Sampling interval in seconds. parse_config() overwrites this when the
# plugin configuration contains an 'Interval' entry.
g_intervalSec = 10  # Default

g_dcgmIgnoreFields = [dcgm_fields.DCGM_FI_DEV_UUID]  # Fields not to publish

# Field IDs handed to DcgmReader as the set of fields to watch.
g_publishFieldIds = [
    dcgm_fields.DCGM_FI_DEV_UUID,  #Needed for plugin instance
    dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
    dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
    dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
    dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_FREE,
    dcgm_fields.DCGM_FI_DEV_FB_USED,
    dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
    dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
    dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
    dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
    dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
    dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
]

# Populated by parse_config(): maps a sampling interval in microseconds to
# the list of fields to be sampled at that interval. Stays None until a
# configuration has been parsed.
g_fieldIntervalMap = None
# Placeholders for the compiled regular expressions assigned further down in
# this module.
g_parseRegEx = None
g_fieldRegEx = None
93
+
94
+ # We build up a regex to match field IDs. These can be numeric IDs, or
95
+ # names. We start with field_regex that matches either as a string (as
96
+ # well as names that might start with digits, but we do not worry about
97
+ # this over-generation of valid IDs at this point).
98
+ #
99
+ # Basically a field is an integral number or a textual name. A field
100
+ # list is a field, or a list of fields separated by commas and enclosed
101
+ # in parentheses. A field list may be optionally followed by a colon,
102
+ # indicating a possible non-default interval if also followed by a
103
+ # floating point interval value. This is a complete field list.
104
+ # Multiple complete field lists may appear, separated by commas.
105
+ #
106
+ # For example: (1001,tensor_active):5,1002:10
107
+ #
108
+ # This specifies that fields 1001 and tensor_active are to be sampled
109
+ # at a rate of every 5 seconds, and 1002 every ten seconds.
110
+ #
111
+ # For example: (1001,tensor_active):5,1002:
112
+ #
113
+ # This is the same, but field 1002 is to be sampled at the default rate
114
+ # (and the colon is entirely unnecessary, but not illegal).
115
+
116
# A single field: a numeric ID or a textual name.
field_regex = r"[0-9a-zA-Z_]+"

# Matches one field plus an optional trailing comma; group 2 is the field
# itself. Used to split the inside of a parenthesized field list.
g_fieldRegEx = re.compile("(({}),?)".format(field_regex))

# A parenthesized, comma-separated list of one or more fields.
fields_regex = r"\({0}(,{0})*\)".format(field_regex)

# Optional interval specification: an optional colon, optionally followed by
# a floating point DCGM sampling interval, then an optional separating comma.
# When the number is absent the default collectd sampling interval applies.
interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?"

# A complete entry: a single field or a field list, followed by the optional
# interval. Group 2 holds the field or field list and group 5 holds the
# interval text including its leading colon (None when there is no colon).
# Several entries may follow one another in the same string.
g_parseRegEx = re.compile("(({}|({})){})".format(field_regex, fields_regex,
                                                 interval_regex))
135
+
136
+
137
class DcgmCollectdPlugin(DcgmReader):
    """DcgmReader subclass that hands DCGM field samples over to collectd.

    Construction takes its field list, ignore list, update frequency and
    per-interval field map from this module's globals. Samples are
    dispatched to collectd from CustomDataHandler(), and log messages are
    routed through collectd's logging functions.
    """

    ###########################################################################
    def __init__(self):
        """Initialize the reader from the module-level configuration."""
        collectd.debug(
            'Initializing DCGM with interval={}s'.format(g_intervalSec))
        super().__init__(fieldIds=g_publishFieldIds,
                         ignoreList=g_dcgmIgnoreFields,
                         fieldGroupName='collectd_plugin',
                         updateFrequency=g_intervalSec * c_ONE_SEC_IN_USEC,
                         fieldIntervalMap=g_fieldIntervalMap)

    ###########################################################################
    def _BlankOutCrowdedSamples(self, fieldId, samples):
        """Mark samples that follow too closely on a kept sample as blank.

        Walks the samples from last to first. The last non-blank sample is
        always kept; any earlier sample whose timestamp is less than 1.0
        second before the most recently kept one has its isBlank flag set
        so that it is skipped when dispatching.
        """
        keptTimeSec = float("inf")

        for sample in samples[::-1]:
            if sample.isBlank:
                continue

            # val.ts is divided by the microseconds-per-second constant to
            # obtain a timestamp in seconds.
            sampleTimeSec = sample.ts / c_ONE_SEC_IN_USEC
            if (keptTimeSec - sampleTimeSec) < 1.0:
                collectd.debug(
                    "DCGM sample for field ID %d too soon at %f, last one sampled at %f"
                    % (fieldId, sampleTimeSec, keptTimeSec))
                sample.isBlank = True  # Filter this one out
            else:
                keptTimeSec = sampleTimeSec

    ###########################################################################
    def CustomDataHandler(self, fvs):
        """Dispatch the collected field values to collectd.

        fvs is indexed by GPU ID and then by field ID, yielding a sequence
        of samples that expose isBlank, ts and value. For every GPU the
        plugin instance is set to the GPU's UUID; for every field not on the
        ignore list, samples less than one second apart are thinned out and
        each remaining non-blank sample is dispatched under the field's tag.
        """
        value = collectd.Values(type='gauge')  # pylint: disable=no-member
        value.plugin = 'dcgm_collectd'

        for gpuId in list(fvs.keys()):
            perGpu = fvs[gpuId]

            uuid = self.m_gpuIdToUUId[gpuId]
            collectd.debug('CustomDataHandler uuid: ' + '%s' % (uuid) + '\n')
            value.plugin_instance = '%s' % (uuid)

            typeInstance = str(gpuId)

            for fieldId in list(perGpu.keys()):
                # Fields on the ignore list are never published.
                if fieldId in self.m_dcgmIgnoreFields:
                    continue

                fieldTag = self.m_fieldIdToInfo[fieldId].tag
                samples = perGpu[fieldId]

                self._BlankOutCrowdedSamples(fieldId, samples)

                # Blank samples are skipped rather than replaced with a
                # placeholder; the index counts dispatched samples only.
                publishable = (s for s in samples if not s.isBlank)
                for sampleIndex, sample in enumerate(publishable):
                    sampleTimeSec = sample.ts / c_ONE_SEC_IN_USEC
                    value.dispatch(type=fieldTag,
                                   type_instance=typeInstance,
                                   time=sampleTimeSec,
                                   values=[sample.value],
                                   plugin=value.plugin)

                    collectd.debug(
                        " gpuId %d, tag %s, sample %d, value %s, time %s" %
                        (gpuId, fieldTag, sampleIndex, str(sample.value),
                         str(sample.ts)))  # pylint: disable=no-member

    ###########################################################################
    def LogInfo(self, msg):
        """Send an informational message to collectd's log."""
        collectd.info(msg)  # pylint: disable=no-member

    ###########################################################################
    def LogError(self, msg):
        """Send an error message to collectd's log."""
        collectd.error(msg)  # pylint: disable=no-member
225
+
226
+
227
+ ###############################################################################
228
+ ##### Parse supplied collectd configuration object.
229
+ ###############################################################################
230
def parse_config(config):
    """Read the plugin settings from a collectd configuration object.

    Recognized child keys of ``config``:

    * ``Interval`` - default sampling interval in seconds; stored in
      ``g_intervalSec``. If it appears more than once, the last one wins.
    * ``FieldIds`` - a string of field specifications, for example
      ``"(1001,tensor_active):5,1002:10,1010:"``. Each specification is a
      single field or a parenthesized field list, optionally followed by
      ``:`` and a floating point interval in seconds.

    ``g_fieldIntervalMap`` is rebuilt from scratch on every call. It maps an
    interval in microseconds to the list of fields (as returned by
    ``dcgm_fields_collectd.GetFieldByName``) sampled at that interval.

    All ``Interval`` entries are applied before any ``FieldIds`` entry is
    interpreted, so fields without an explicit interval get the configured
    default regardless of the order of entries in the configuration.

    Raises whatever ``float()`` raises if an interval value is not numeric.
    """
    global g_intervalSec
    global g_fieldIntervalMap

    g_fieldIntervalMap = {}

    # Pass 1: settle the default interval first, so that it does not matter
    # whether 'Interval' is written before or after 'FieldIds'.
    for node in config.children:
        if node.key == 'Interval':
            g_intervalSec = float(node.values[0])

    default_interval = int(g_intervalSec * c_ONE_SEC_IN_USEC)

    # Pass 2: expand every FieldIds string into the interval map.
    for node in config.children:
        if node.key != 'FieldIds':
            continue

        for field_set in g_parseRegEx.finditer(node.values[0]):
            # Group 2 is the field or parenthesized field list, group 5 the
            # optional interval text including its leading colon.
            fields = field_set.group(2)
            interval_str = field_set.group(5)

            # No interval, or a bare colon, selects the default interval.
            if interval_str is None or interval_str == ":":
                interval = default_interval
            else:
                interval = int(float(interval_str[1:]) *
                               c_ONE_SEC_IN_USEC)  # strip :

            # One list of fields is kept per distinct interval.
            interval_fields = g_fieldIntervalMap.setdefault(interval, [])

            if fields.startswith("("):  # several fields sharing an interval
                names = [
                    match.group(2)
                    for match in g_fieldRegEx.finditer(fields[1:-1])
                ]
            else:  # just one field
                names = [fields]

            # Field names are mapped to field numbers before being stored.
            for name in names:
                interval_fields.append(
                    dcgm_fields_collectd.GetFieldByName(name))
283
+
284
+
285
+ ###############################################################################
286
+ ##### Wrap the class methods for collectd callbacks
287
+ ###############################################################################
288
def config_dcgm(config=None):
    """
    collectd configuration callback for the DCGM plugin.

    The configuration normally lives in a dcgm.conf file, usually installed
    in /etc/collectd/collectd.conf.d/dcgm.conf. For example:

    LoadPlugin python
    <Plugin python>
        ModulePath "/usr/lib64/collectd/dcgm"
        LogTraces true
        Interactive false
        Import "dcgm_collectd_plugin"
        <Module dcgm_collectd_plugin>
            Interval 2
            FieldIds "(1001,tensor_active):5,1002:10,1004:.1,1010:"
            FieldIds "1007"
        </Module>
    </Plugin>

    ModulePath is where the plugin and its supporting files are installed
    (generally copied from /usr/local/dcgm/bindings/python3).

    Interval is the default collectd sampling interval in seconds.

    FieldIds may be given several times. A field ID is a name or a number.
    A field ID list is a single field ID, or several separated by commas (,)
    and enclosed in parentheses. A field ID list may be followed by an
    optional colon (:) and a floating point DCGM sampling interval; without
    one the default collectd sampling interval is used (a colon with no
    number is redundant but allowed). Several field ID lists may share one
    FieldIds entry, separated by commas (,). The FieldIds value is a string
    and must be enclosed in quotes (").

    DCGM samples the fields at the interval(s) given and collectd collects
    the samples asynchronously at the specified Interval, so a collection
    may occasionally contain one sample fewer or one more than expected.
    """

    # An exception raised from here makes collectd stop loading the plugin.
    config_supplied = config is not None
    if config_supplied:
        parse_config(config)

    # The read callback is registered here, after parsing, so that it uses
    # the default collectd sampling interval from the configuration.
    collectd.register_read(read_dcgm, interval=g_intervalSec)  # pylint: disable=no-member
336
+
337
+
338
+ ###############################################################################
339
def init_dcgm():
    """collectd init callback: create and initialize the plugin instance.

    The instance is stored in the module-level ``g_dcgmCollectd`` before its
    Init() method is called.
    """
    global g_dcgmCollectd

    # Put SIGCHLD back to its default disposition to avoid exceptions with
    # new processes.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    plugin = DcgmCollectdPlugin()
    g_dcgmCollectd = plugin
    plugin.Init()
347
+
348
+
349
+ ###############################################################################
350
def shutdown_dcgm():
    """collectd shutdown callback: shut down the plugin instance."""
    plugin = g_dcgmCollectd
    plugin.Shutdown()
352
+
353
+
354
+ ###############################################################################
355
def read_dcgm(data=None):
    """collectd read callback: run one processing pass of the plugin.

    ``data`` is accepted for the callback signature and is not used.
    """
    plugin = g_dcgmCollectd
    plugin.Process()
357
+
358
+
359
def register_collectd_callbacks():
    """Register this module's config, init and shutdown callbacks.

    The read callback is not registered here: config_dcgm does that itself
    because it first needs to parse the sampling interval.
    """
    plugin_name = "dcgm_collectd_plugin"
    collectd.register_config(config_dcgm, name=plugin_name)  # pylint: disable=no-member
    collectd.register_init(init_dcgm)  # pylint: disable=no-member
    collectd.register_shutdown(shutdown_dcgm)  # pylint: disable=no-member
364
+
365
+
366
+ ###############################################################################
367
+ ##### Main
368
+ ###############################################################################
369
+ register_collectd_callbacks()