triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,29 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ ##
15
+ # Python bindings for the internal API of DCGM library (dcgm_fields_internal.hpp)
16
+ ##
17
+
18
+ from ctypes import *
19
+ from ctypes.util import find_library
20
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
21
+
22
# Provides access to functions: module-level alias for
# dcgm_structs._dcgmGetFunctionPointer.
dcgmFP = dcgm_structs._dcgmGetFunctionPointer

# Internal-only field ids. These are defined here rather than in the public
# dcgm_fields module.
DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210  # Memory utilization samples
DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211  # SM utilization samples
DCGM_FI_DEV_GRAPHICS_PIDS = 220  # Graphics processes running on the GPU.
DCGM_FI_DEV_COMPUTE_PIDS = 221  # Compute processes running on the GPU.
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from model_analyzer.monitor.dcgm.common.dcgm_client_main import main
15
+ from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader
16
+ from socket import socket, AF_INET, SOCK_DGRAM
17
+
18
# Client name displayed to the user (passed to main() below).
FLUENTD_NAME = 'Fluentd'
# Default destination port (passed to main() below).
DEFAULT_FLUENTD_PORT = 24225

# Fluentd Configuration
# =====================
# In order to use this client, Fluentd needs to accept JSON over UDP.
# The default port is 24225.
26
+
27
+
28
class DcgmFluentd(DcgmJsonReader):
    """DcgmJsonReader that forwards each JSON record to Fluentd over UDP.

    A single datagram socket is created at construction time and every
    payload handed to CustomJsonHandler() is sent to the configured
    (hostname, port) destination.
    """

    ###########################################################################
    def __init__(self, publish_hostname, publish_port, **kwargs):
        """
        Parameters
        ----------
        publish_hostname : str
            Host that receives the datagrams
        publish_port : int
            UDP port on that host
        **kwargs
            Forwarded unchanged to DcgmJsonReader.__init__
        """
        # The socket and destination are set up before the base class
        # initializer runs, matching the original initialization order.
        self.m_sock = socket(AF_INET, SOCK_DGRAM)
        self.m_dest = (publish_hostname, publish_port)
        super(DcgmFluentd, self).__init__(**kwargs)

    ###########################################################################
    def SendToFluentd(self, payload):
        """Send one payload to the configured destination as a UDP datagram.

        Parameters
        ----------
        payload : str or bytes-like
            Data to send. Text is encoded as UTF-8 first because
            socket.sendto() only accepts bytes-like objects on Python 3;
            bytes-like payloads are sent unchanged.
        """
        if isinstance(payload, str):
            payload = payload.encode('utf-8')
        self.m_sock.sendto(payload, self.m_dest)

    ###########################################################################
    def CustomJsonHandler(self, outJson):
        """Forward a JSON record to Fluentd.

        NOTE(review): presumably invoked by DcgmJsonReader with the
        serialized JSON for each update -- confirm against the base class.
        """
        self.SendToFluentd(outJson)
42
+
43
+
44
# Script entry point: hand the client class, its display name and the default
# port to the shared DCGM client main().
if __name__ == '__main__':  # pragma: no cover
    main(DcgmFluentd, FLUENTD_NAME, DEFAULT_FLUENTD_PORT, add_target_host=True)
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
18
+ import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers
19
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
20
+ import model_analyzer.monitor.dcgm.dcgm_structs as structs
21
+ from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
22
+ from model_analyzer.monitor.monitor import Monitor
23
+ from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
24
+ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
25
+ from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
26
+ from model_analyzer.record.types.gpu_utilization import GPUUtilization
27
+
28
+
29
class DCGMMonitor(Monitor):
    """
    Monitor implementation that samples GPU metrics through DCGM
    """

    # Record type -> DCGM field id used to collect it
    model_analyzer_to_dcgm_field = {
        GPUUsedMemory: dcgm_fields.DCGM_FI_DEV_FB_USED,
        GPUFreeMemory: dcgm_fields.DCGM_FI_DEV_FB_FREE,
        GPUUtilization: dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
        GPUPowerUsage: dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    }

    def __init__(self, gpus, frequency, metrics, dcgmPath=None):
        """
        Parameters
        ----------
        gpus : list of GPUDevice
            The gpus to be monitored
        frequency : int
            Sampling frequency for the metric
        metrics : list
            List of Record types to monitor
        dcgmPath : str (optional)
            DCGM installation path

        Raises
        ------
        TritonModelAnalyzerException
            If a requested metric has no entry in
            model_analyzer_to_dcgm_field. DCGM is shut down first.
        """

        super().__init__(frequency, metrics)
        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        self._gpus = gpus

        # Embedded mode: DCGM runs from the shared library in this process
        handle = dcgm_agent.dcgmStartEmbedded(structs.DCGM_OPERATION_MODE_MANUAL)
        self.dcgm_handle = handle

        # Start from an empty group and add each monitored GPU to it
        self.group_id = dcgm_agent.dcgmGroupCreate(
            handle, structs.DCGM_GROUP_EMPTY, "triton-monitor"
        )
        for device in self._gpus:
            dcgm_agent.dcgmGroupAddDevice(handle, self.group_id, device.device_id())

        # NOTE(review): units of self._frequency and of the scaled value are
        # not visible here -- confirm against Monitor and DcgmFieldGroupWatcher
        update_frequency = int(self._frequency * 1000)

        # Translate the requested record types into DCGM field ids
        field_ids = []
        for metric in metrics:
            try:
                field_ids.append(self.model_analyzer_to_dcgm_field[metric])
            except KeyError:
                dcgm_agent.dcgmShutdown()
                raise TritonModelAnalyzerException(
                    f"{metric} is not supported by Model Analyzer DCGM Monitor"
                )

        self.dcgm_field_group_id = dcgm_agent.dcgmFieldGroupCreate(
            handle, field_ids, "triton-monitor"
        )

        # NOTE(review): the trailing 3600, 0, 0 are positional watcher
        # arguments whose meaning is defined in dcgm_field_helpers -- confirm
        self.group_watcher = dcgm_field_helpers.DcgmFieldGroupWatcher(
            handle,
            self.group_id,
            self.dcgm_field_group_id.value,
            structs.DCGM_OPERATION_MODE_MANUAL,
            update_frequency,
            3600,
            0,
            0,
        )

    def is_monitoring_connected(self) -> bool:
        """
        Always reports the monitor as connected.
        """
        return True

    def _monitoring_iteration(self):
        """
        Ask the group watcher to fetch more values.
        """
        self.group_watcher.GetMore()

    def _collect_records(self):
        """
        Convert everything held by the group watcher into records.

        Returns
        -------
        list
            One record per non-None measurement, for every monitored GPU
            and every requested metric type
        """
        collected = []
        for device in self._gpus:
            device_values = self.group_watcher.values[device.device_id()]

            # Nothing has been gathered for this GPU
            if len(list(device_values)) == 0:
                continue

            for record_type in self._metrics:
                field_id = self.model_analyzer_to_dcgm_field[record_type]
                # The timestamp is passed through unchanged.
                # NOTE(review): the previous comment stated the DCGM
                # timestamp is in nanoseconds -- confirm the unit against
                # the DCGM field value documentation
                collected.extend(
                    record_type(
                        value=float(sample.value),
                        device_uuid=device.device_uuid(),
                        timestamp=sample.ts,
                    )
                    for sample in device_values[field_id].values
                    if sample.value is not None
                )

        return collected

    def destroy(self):
        """
        Shut DCGM down, then destroy the base Monitor. This function
        must be called in order to appropriately deallocate the resources.
        """

        dcgm_agent.dcgmShutdown()
        super().destroy()
@@ -0,0 +1,326 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
15
+ import time
16
+ import logging
17
+ import os
18
+ import argparse
19
+ import sys
20
+ import signal
21
+
22
# Put the parent of this file's directory at the front of sys.path.
# NOTE(review): presumably so sibling modules resolve when this file is run
# directly as a script -- confirm it is still needed with the absolute
# model_analyzer.* imports below.
dir_path = os.path.dirname(os.path.realpath(__file__))
parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir))
sys.path.insert(0, parent_dir_path)
25
+
26
+ from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader
27
+ from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli
28
+
29
# Pick the Prometheus implementation: the test framework's stand-in when
# DCGM_TESTING_FRAMEWORK is set in the environment, the real client otherwise.
# Either way a failed import is fatal: log it and exit with status 3.
if 'DCGM_TESTING_FRAMEWORK' in os.environ:
    try:
        from prometheus_tester_api import start_http_server, Gauge
    except ImportError:
        # Only a failed import means "missing"; anything else (including
        # SystemExit / KeyboardInterrupt) must not be reported as such.
        logging.critical(
            "prometheus_tester_api missing, reinstall test framework.")
        sys.exit(3)
else:
    try:
        from prometheus_client import start_http_server, Gauge
    except ImportError:
        logging.critical(
            "prometheus_client not installed, please run: \"pip install prometheus_client\""
        )
        sys.exit(3)
45
+
46
# Field ids handed to cli.create_parser() as field_ids by
# parse_command_line().
# NOTE(review): presumably the set published when no fields are selected on
# the command line -- confirm against dcgm_client_cli_parser.
DEFAULT_FIELDS = [
    dcgm_fields.DCGM_FI_DEV_PCI_BUSID,  #Needed for plugin_instance
    dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
    dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
    dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
    dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_FREE,
    dcgm_fields.DCGM_FI_DEV_FB_USED,
    dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
]
71
+
72
+
73
class DcgmPrometheus(DcgmReader):
    """DcgmReader that publishes DCGM field values as Prometheus gauges.

    All configuration is read from the module-level ``g_settings``
    dictionary, so it must be populated (see initialize_globals() and
    parse_command_line()) before an instance is created or used.
    """

    ###########################################################################
    def __init__(self):
        """Initialize the underlying DcgmReader from ``g_settings``."""
        #Have DCGM update its watches twice as fast as our update interval so we don't get out of phase by our update interval
        updateIntervalUsec = int(
            (1000000 * g_settings['prometheusPublishInterval']) / 2)
        #Add our PID to our field group name so we can have multiple instances running
        fieldGroupName = 'dcgm_prometheus_' + str(os.getpid())

        DcgmReader.__init__(self,
                            ignoreList=g_settings['ignoreList'],
                            fieldIds=g_settings['publishFieldIds'],
                            updateFrequency=updateIntervalUsec,
                            fieldGroupName=fieldGroupName,
                            hostname=g_settings['dcgmHostName'])
        # fieldId -> Gauge, filled lazily by SetupGauges()
        self.m_existingGauge = {}

    ###########################################################################
    def _IterPublishedFieldIds(self):
        """Yield each field id in m_publishFields that is not in m_dcgmIgnoreFields."""
        for fieldIds in self.m_publishFields.values():
            if fieldIds is None:
                continue

            for fieldId in fieldIds:
                if fieldId not in self.m_dcgmIgnoreFields:
                    yield fieldId

    ###########################################################################
    def CustomDataHandler(self, fvs):
        '''
        This function is implemented from the base class : DcgmReader. It converts each
        field / value from the fvs dictionary to a gauge and publishes the gauge to the
        prometheus client server.

        The last entry of fvs[gpuId][fieldId] is the value that is published
        (presumably the most recent sample -- confirm against DcgmReader).

        @params:
        fvs : The fieldvalue dictionary that contains info about the values of field Ids for each gpuId.
        '''
        if not self.m_existingGauge:
            self.SetupGauges()

        for fieldId in self._IterPublishedFieldIds():
            g = self.m_existingGauge[fieldId]

            for gpuId in fvs.keys():
                gpuFv = fvs[gpuId]
                val = gpuFv[fieldId][-1]

                #Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId
                if val.isBlank:
                    continue

                gpuUuid = self.m_gpuIdToUUId[gpuId]
                gpuBusId = self.m_gpuIdToBusId[gpuId]
                gpuUniqueId = gpuUuid if g_settings['sendUuid'] else gpuBusId

                # pylint doesn't find the labels member for Gauge, but it exists. Ignore the warning
                g.labels(gpuId, gpuUniqueId).set(val.value)  # pylint: disable=no-member

                # Lazy %-arguments: the message is only built when DEBUG
                # logging is actually enabled.
                logging.debug('Sent GPU %d %s %s = %s', gpuId, gpuUniqueId,
                              self.m_fieldIdToInfo[fieldId].tag, val.value)

    ###############################################################################
    def SetupGauges(self):
        '''
        Create one gauge per published field and store it in m_existingGauge.

        NOTE: even though some fields are monotonically increasing and therefore fit the mold to be
        counters, all are published as gauges so that DCGM is the sole authority on the state of the
        system, preventing problems around down times, driver reboots, and the unlikely event of
        flashing the inforom.
        For specific information about which fields monotonically increase, see the API guide or
        dcgm_fields.h
        '''
        for fieldId in self._IterPublishedFieldIds():
            # Second label name follows the identifier the user asked for
            uniqueIdName = 'GpuUuid' if g_settings['sendUuid'] else 'GpuBusID'

            fieldTag = self.m_fieldIdToInfo[fieldId].tag
            self.m_existingGauge[fieldId] = Gauge("dcgm_" + fieldTag,
                                                  'DCGM_PROMETHEUS',
                                                  ['GpuID', uniqueIdName])

    ###############################################################################
    def Scrape(self, data=None):
        '''
        Scrape the fieldvalue data and publish. This function calls the process function of
        the base class DcgmReader.

        @params:
        data : Accepted for call compatibility; not used.
        '''
        return self.Process()

    ###############################################################################
    def LogBasicInformation(self):
        '''
        Reconnect, then log that the client started and which field tags are published.
        '''
        # Reconnect causes everything to get initialized
        self.Reconnect()

        logging.info('Started prometheus client')

        fieldTagList = ', '.join(
            str(self.m_fieldIdToInfo[fieldId].tag)
            for fieldId in self._IterPublishedFieldIds())

        logging.info("Publishing fields: '%s'" % (fieldTagList))

    ###############################################################################
    def LogError(self, msg):
        '''Log msg at ERROR level through the root logger.'''
        logging.error(msg)

    ###############################################################################
    def LogInfo(self, msg):
        '''Log msg at INFO level through the root logger.'''
        logging.info(msg)
202
+
203
+
204
+ ###############################################################################
205
def exit_handler(signum, frame):
    '''
    Signal handler that asks the main loop to stop.

    Only records the request in g_settings; both arguments are ignored.

    @params:
    signum : Number of the received signal.
    frame : Stack frame that was interrupted.
    '''
    g_settings.update(shouldExit=True)
207
+
208
+
209
+ ###############################################################################
210
def main_loop(prometheus_obj, publish_interval):
    '''
    Scrape and publish repeatedly until g_settings['shouldExit'] is set.

    Each iteration scrapes once and then sleeps for publish_interval. The
    sleep is skipped when the exit flag was raised while scraping, so shutdown
    is not delayed by a full interval.

    @params:
    prometheus_obj : Object providing Scrape() and LogInfo().
    publish_interval : Seconds to sleep between two scrapes.
    '''
    try:
        while True:
            prometheus_obj.Scrape(prometheus_obj)

            if not g_settings['shouldExit']:
                time.sleep(publish_interval)

            if g_settings['shouldExit']:
                prometheus_obj.LogInfo('Received a signal...shutting down')
                break
    except KeyboardInterrupt:
        print("Caught CTRL-C. Exiting")
221
+
222
+
223
+ ###############################################################################
224
def initialize_globals():
    '''
    (Re)create the module-level g_settings dictionary with its defaults.

    Keys:
    shouldExit : Set to True by exit_handler() to stop the main loop.
    ignoreList : List of the ids that are present in g_settings['publishFieldIds']
                 but ignored for watch.
    dcgmHostName, prometheusPort, prometheusPublishInterval, publishFieldIds,
    sendUuid : Those are initialized by the CLI parser. We only list them here
               for clarity. sendUuid defaults to False (the CLI default) so it
               can be read before the parser has run.
    '''
    global g_settings
    g_settings = {
        'shouldExit': False,
        'ignoreList': [
            dcgm_fields.DCGM_FI_DEV_PCI_BUSID,
        ],
        'dcgmHostName': None,
        'prometheusPort': None,
        'prometheusPublishInterval': None,
        'publishFieldIds': None,
        'sendUuid': False,
    }
248
+
249
+
250
+ ###############################################################################
251
def parse_command_line():
    '''
    Parse the command line, store the result in g_settings and configure logging.

    Sets g_settings['dcgmHostName'], ['prometheusPort'],
    ['prometheusPublishInterval'], ['publishFieldIds'] and ['sendUuid'].
    Logging goes to args.logfile (truncated on open) when one is given,
    otherwise to stdout.
    '''
    parser = cli.create_parser(
        name='Prometheus',
        field_ids=DEFAULT_FIELDS,
    )

    cli.add_custom_argument(parser,
                            '--send-uuid',
                            dest='send_uuid',
                            default=False,
                            action='store_true',
                            help='Send GPU UUID instead of bus id')

    args = cli.run_parser(parser)
    field_ids = cli.get_field_ids(args)
    numeric_log_level = cli.get_log_level(args)

    # Defaults to localhost, so we need to set it to None
    g_settings['dcgmHostName'] = None if args.embedded else args.hostname
    g_settings['prometheusPort'] = args.publish_port
    g_settings['prometheusPublishInterval'] = args.interval
    g_settings['publishFieldIds'] = field_ids
    g_settings['sendUuid'] = args.send_uuid

    logfile = args.logfile
    log_format = '%(asctime)s %(levelname)s: %(message)s'

    if logfile is not None:
        logging.basicConfig(level=numeric_log_level,
                            filename=logfile,
                            filemode='w+',
                            format=log_format)
    else:
        # filemode only applies together with filename, so it is not passed
        # when logging to a stream.
        logging.basicConfig(level=numeric_log_level,
                            stream=sys.stdout,
                            format=log_format)
294
+
295
+
296
+ ###############################################################################
297
def initialize_signal_handlers():
    '''
    Install exit_handler for SIGINT and SIGTERM, in that order.
    '''
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, exit_handler)
300
+
301
+
302
+ ###############################################################################
303
def main():
    '''
    Entry point of the exporter.

    Resets the global settings, installs the signal handlers, parses the
    command line, then creates the DcgmPrometheus object, starts the
    Prometheus HTTP server, runs the publish loop and finally shuts the
    object down.
    '''
    initialize_globals()
    initialize_signal_handlers()
    parse_command_line()

    exporter = DcgmPrometheus()

    port = g_settings['prometheusPort']
    logging.info("Starting Prometheus server on port " + str(port))

    #start prometheus client server.
    start_http_server(port)

    exporter.LogBasicInformation()

    interval = g_settings['prometheusPublishInterval']
    main_loop(exporter, interval)

    exporter.Shutdown()
323
+
324
+
325
# Run the exporter only when this file is executed as a script.
if __name__ == '__main__':
    main()