triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
|
|
17
|
+
import model_analyzer.monitor.dcgm.dcgm_fields_internal as dcgm_fields_internal
|
|
18
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
19
|
+
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
|
|
20
|
+
import ctypes
|
|
21
|
+
import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
|
|
22
|
+
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
|
|
23
|
+
import json
|
|
24
|
+
'''
|
|
25
|
+
Helper class that makes a python-friendly field value from one returned from the python bindings
|
|
26
|
+
'''
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DcgmFieldValue():
    '''
    Python-friendly copy of one raw field value returned from the python bindings

    Attributes set by the constructor:
    ts is copied from rawValue.ts
    fieldId is copied from rawValue.fieldId
    fieldType is rawValue.fieldType converted to a one-character string
    isBlank is True when rawValue.status is not DCGM_ST_OK or when the matching
            dcgmvalue.DCGM_*_IS_BLANK helper reports the value as blank
    value is the converted value (float, int, str or a ctypes structure), or
          None when there is nothing to expose
    '''

    def __init__(self, rawValue):
        '''
        Constructor

        rawValue is the latest dcgm_structs.c_dcgmFieldValue_v? structure of a field value returned from the raw APIs

        Raises Exception if rawValue is not a c_dcgmFieldValue_v1, if the field
        type is not handled, or if a binary field has an unhandled field ID.
        '''
        #Make sure the class passed in is an expected type. isinstance also
        #accepts subclasses of the structure, which an exact type comparison rejected
        if not isinstance(rawValue, dcgm_structs.c_dcgmFieldValue_v1):
            raise Exception("Unexpected rawValue type %s" % str(type(rawValue)))

        self.ts = rawValue.ts
        self.fieldId = rawValue.fieldId
        self.fieldType = chr(rawValue.fieldType)
        self.isBlank = False
        self.value = None

        #A failed read carries no usable value. Report it as blank and stop
        if rawValue.status != dcgm_structs.DCGM_ST_OK:
            self.isBlank = True
            return

        if self.fieldType == dcgm_fields.DCGM_FT_DOUBLE:
            self.value = float(rawValue.value.dbl)
            self.isBlank = dcgmvalue.DCGM_FP64_IS_BLANK(self.value)
        elif self.fieldType in (dcgm_fields.DCGM_FT_INT64,
                                dcgm_fields.DCGM_FT_TIMESTAMP):
            self.value = int(rawValue.value.i64)
            self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(self.value)
        elif self.fieldType == dcgm_fields.DCGM_FT_STRING:
            self.value = str(rawValue.value.str)
            self.isBlank = dcgmvalue.DCGM_STR_IS_BLANK(self.value)
        elif self.fieldType == dcgm_fields.DCGM_FT_BINARY:
            if self.fieldId == dcgm_fields.DCGM_FI_DEV_ACCOUNTING_DATA:
                accStats = dcgm_structs.c_dcgmDevicePidAccountingStats_v1()
                ctypes.memmove(ctypes.addressof(accStats), rawValue.value.blob,
                               accStats.FieldsSizeof())
                #Keep the decoded structure. Previously it was decoded, thrown
                #away and followed by the "Blobs not handled yet" exception
                self.value = accStats
            elif self.fieldId in (
                    dcgm_fields_internal.DCGM_FI_DEV_COMPUTE_PIDS,
                    dcgm_fields_internal.DCGM_FI_DEV_GRAPHICS_PIDS):
                processStats = dcgm_structs.c_dcgmRunningProcess_t()
                ctypes.memmove(ctypes.addressof(processStats),
                               rawValue.value.blob, processStats.FieldsSizeof())
                self.value = processStats
                # This should always be false
                self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(processStats.pid)
            elif self.fieldId == dcgm_fields.DCGM_FI_SYNC_BOOST:
                #Not exposed publicly for now
                self.value = None
            else:
                raise Exception("Blobs not handled yet for fieldId %d" %
                                self.fieldId)
        else:
            raise Exception("Unhandled fieldType: %s" % self.fieldType)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class DcgmFieldValueTimeSeries:
    '''
    List-like container of field values kept in ascending order of their .ts attribute
    '''

    def __init__(self):
        #Values in timestamp order
        self.values = []

    def __len__(self):
        return len(self.values)

    def __getitem__(self, key):
        return self.values[key]

    def InsertValue(self, value):
        '''
        Add value to the series, keeping it ordered by .ts

        A value whose timestamp equals stored ones is placed after them.
        '''
        series = self.values

        #Usual case: nothing stored yet, or the new value is not older than the newest one
        if not series or value.ts >= series[-1].ts:
            series.append(value)
            return

        #Otherwise locate the first stored value that is newer than the new one
        position = next(
            (index for index, stored in enumerate(series)
             if value.ts < stored.ts), None)
        if position is None:
            raise Exception("Unexpected no place to insert ts %d" % value.ts)

        series.insert(position, value)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class FieldValueEncoder(json.JSONEncoder):
    '''
    JSON encoder that serializes an iterable of DcgmFieldValue objects

    Each non-blank value becomes a dictionary with the keys 'Timestamp',
    'FieldId' and 'Value'. Blank values are left out.
    '''

    # Pylint does not like overriding the default method, so the comment below is a WAR for the linting problem
    def default(self, obj):  # pylint: disable=E0202
        '''
        Return a list of dictionaries for an iterable of DcgmFieldValue objects

        Raises TypeError (from the default encoder) when obj is not iterable
        or contains anything other than DcgmFieldValue objects.
        '''
        try:
            entries = iter(obj)
        except TypeError:
            #Not iterable, so let the default encoder throw its usual exception
            return json.JSONEncoder.default(self, obj)

        nested_json = []
        for entry in entries:
            if not isinstance(entry, DcgmFieldValue):
                return json.JSONEncoder.default(
                    self, obj)  # Let default encoder throw exception
            if entry.isBlank:
                continue
            nested_json.append({
                'Timestamp': entry.ts,
                'FieldId': entry.fieldId,
                'Value': entry.value
            })
        return nested_json
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues,
                                               userData):
    '''
    Callback that forwards field values for one GPU to a python object

    userData is converted back into the python object it refers to, and that
    object's _ProcessValues() is called with gpuId and the first numValues
    entries of values. Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    reported = values[0:numValues]
    collection._ProcessValues(gpuId, reported)
    return 0
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
#Wrap the python callback in the dcgm_agent.dcgmFieldValueEnumeration_f
#callback type so it can be handed to the dcgm_agent calls
helper_dcgm_field_values_since_callback = (
    dcgm_agent.dcgmFieldValueEnumeration_f(
        py_helper_dcgm_field_values_since_callback))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def py_helper_dcgm_field_values_since_callback_v2(entityGroupId, entityId,
                                                  values, numValues, userData):
    '''
    Callback that forwards field values for one entity to a python object

    userData is converted back into the python object it refers to, and that
    object's _ProcessValuesV2() is called with entityGroupId, entityId and the
    first numValues entries of values. Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    reported = values[0:numValues]
    collection._ProcessValuesV2(entityGroupId, entityId, reported)
    return 0
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
#Wrap the python callback in the dcgm_agent.dcgmFieldValueEntityEnumeration_f
#callback type so it can be handed to the dcgm_agent calls
helper_dcgm_field_values_since_callback_v2 = (
    dcgm_agent.dcgmFieldValueEntityEnumeration_f(
        py_helper_dcgm_field_values_since_callback_v2))
|
|
152
|
+
'''
|
|
153
|
+
Helper class for handling field value update callbacks and storing them in a .values member variable
|
|
154
|
+
'''
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class DcgmFieldValueCollection:
    '''
    Stores field values delivered by the field value callbacks

    Values delivered per GPU are stored in .values and values delivered per
    entity are stored in .entityValues.
    '''

    def __init__(self, handle, groupId):
        '''
        handle and groupId are stored and passed unchanged to the dcgm_agent calls
        '''
        #2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries)
        self.values = {}
        #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
        self.entityValues = {}
        self._handle = handle
        self._groupId = groupId
        self._numValuesSeen = 0
        self._nextSinceTimestamp = 0

    def _ProcessValues(self, gpuId, values):
        '''
        Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values
        '''
        self._numValuesSeen += len(values)

        #The per-GPU dictionary is created the first time a GPU is seen,
        #even if no values were delivered for it
        gpuFields = self.values.setdefault(gpuId, {})

        for rawValue in values:
            #Convert to python-friendly value
            value = DcgmFieldValue(rawValue)

            if value.fieldId not in gpuFields:
                gpuFields[value.fieldId] = DcgmFieldValueTimeSeries()

            gpuFields[value.fieldId].InsertValue(value)

    def _ProcessValuesV2(self, entityGroupId, entityId, values):
        '''
        Helper function called by the callback py_helper_dcgm_field_values_since_callback_v2 to process individual field values
        '''
        self._numValuesSeen += len(values)

        entityFields = self.entityValues.setdefault(entityGroupId,
                                                    {}).setdefault(entityId, {})

        for rawValue in values:
            #Convert to python-friendly value
            value = DcgmFieldValue(rawValue)

            if value.fieldId not in entityFields:
                entityFields[value.fieldId] = DcgmFieldValueTimeSeries()

            entityFields[value.fieldId].InsertValue(value)

    def GetLatestValues(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .values member variable

        Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
        '''
        ret = dcgm_agent.dcgmGetLatestValues(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_callback, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def GetAllSinceLastCall(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved.
        '''
        beforeCount = self._numValuesSeen
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp, helper_dcgm_field_values_since_callback,
            self)
        afterCount = self._numValuesSeen
        return afterCount - beforeCount

    def GetLatestValues_v2(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .entityValues member variable
        '''
        ret = dcgm_agent.dcgmGetLatestValues_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_callback_v2, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def GetAllSinceLastCall_v2(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the number of field values that were retrieved
        '''
        beforeCount = self._numValuesSeen
        #Use the _v2 callback, which calls _ProcessValuesV2. The entity callback
        #calls _ProcessValues with three arguments, which does not match this
        #class's two-argument _ProcessValues
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp,
            helper_dcgm_field_values_since_callback_v2, self)
        afterCount = self._numValuesSeen
        return afterCount - beforeCount

    def EmptyValues(self):
        '''
        Empty .values{} and .entityValues{} so that old data is no longer present in this structure.
        This can be used to prevent them from growing over time
        '''
        self.values = {}
        #_numValuesSeen counts values from both dictionaries, so both are cleared together
        self.entityValues = {}
        self._numValuesSeen = 0
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
'''
|
|
270
|
+
Helper class for watching a field group and storing fields values returned from it
|
|
271
|
+
'''
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class DcgmFieldGroupWatcher(DcgmFieldValueCollection):
    '''
    Watches a field group and stores the field values returned from it

    Constructor arguments:
    handle is a DCGM handle from dcgm_agent.dcgmInit()
    groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
    fieldGroup is the DcgmFieldGroup() instance to watch fields for
    operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
    updateFreq is how often to update each field in usec
    maxKeepAge is how long DCGM should keep values for in seconds
    maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
    startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
                   previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
                   0=start with all cached data
    '''

    def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
                 maxKeepAge, maxKeepSamples, startTimestamp):
        super().__init__(handle, groupId)

        self._fieldGroup = fieldGroup
        self._operationMode = operationMode
        self._updateFreq = updateFreq
        self._maxKeepAge = maxKeepAge
        self._maxKeepSamples = maxKeepSamples

        #Anything that is not a positive timestamp means start from beginning of time
        self._nextSinceTimestamp = startTimestamp if startTimestamp > 0 else 0

        #Start watches
        self._WatchFieldGroup()

    def _WatchFieldGroup(self):
        '''
        Initiate the host engine watch on the fields
        '''
        watchStatus = dcgm_agent.dcgmWatchFields(
            self._handle, self._groupId, self._fieldGroup.fieldGroupId,
            self._updateFreq, self._maxKeepAge, self._maxKeepSamples)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(watchStatus)

        # Force an update of the fields so that we can fetch initial values.
        updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(updateStatus)

        # Initial update will fetch from startTimestamp.
        self.GetAllSinceLastCall()

    def GetAllSinceLastCall(self):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved
        '''
        manualMode = (
            self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL)

        #If we're in manual mode, force an update
        if manualMode:
            updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
            #Will throw exception on error
            dcgm_structs._dcgmCheckReturn(updateStatus)

        return super().GetAllSinceLastCall(self._fieldGroup)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def py_helper_dcgm_field_values_since_entity_callback(entityGroupId, entityId,
                                                      values, numValues,
                                                      userData):
    '''
    Callback that forwards field values for one entity to a python object

    userData is converted back into the python object it refers to, and that
    object's _ProcessValues() is called with entityGroupId, entityId and the
    first numValues entries of values. Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    reported = values[0:numValues]
    collection._ProcessValues(entityGroupId, entityId, reported)
    return 0
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
#Wrap the python callback in the dcgm_agent.dcgmFieldValueEntityEnumeration_f
#callback type so it can be handed to the dcgm_agent calls
helper_dcgm_field_values_since_entity_callback = (
    dcgm_agent.dcgmFieldValueEntityEnumeration_f(
        py_helper_dcgm_field_values_since_entity_callback))
|
|
349
|
+
'''
|
|
350
|
+
Helper class for handling field value update callbacks and storing them in a .values member variable
|
|
351
|
+
'''
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
class DcgmFieldValueEntityCollection:
    '''
    Stores field values delivered per entity by the field value callbacks in .values
    '''

    def __init__(self, handle, groupId):
        '''
        handle and groupId are stored and passed unchanged to the dcgm_agent calls
        '''
        self._handle = handle
        self._groupId = groupId
        self._numValuesSeen = 0
        self._nextSinceTimestamp = 0
        #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
        self.values = {}

    def _ProcessValues(self, entityGroupId, entityId, values):
        '''
        Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values
        '''
        self._numValuesSeen += len(values)

        #Both dictionary levels are created on first sight of an entity,
        #even if no values were delivered for it
        groupEntities = self.values.setdefault(entityGroupId, {})
        entityFields = groupEntities.setdefault(entityId, {})

        for rawValue in values:
            #Convert to python-friendly value
            converted = DcgmFieldValue(rawValue)

            if converted.fieldId not in entityFields:
                entityFields[converted.fieldId] = DcgmFieldValueTimeSeries()

            entityFields[converted.fieldId].InsertValue(converted)

    def GetLatestValues(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .values member variable

        Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
        '''
        status = dcgm_agent.dcgmGetLatestValues_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_entity_callback, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(status)

    def GetAllSinceLastCall(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved.
        '''
        seenBefore = self._numValuesSeen
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp,
            helper_dcgm_field_values_since_entity_callback, self)
        #The callback increases _numValuesSeen while the call above is running
        return self._numValuesSeen - seenBefore

    def EmptyValues(self):
        '''
        Empty .values{} so that old data is no longer present in this structure.
        This can be used to prevent .values from growing over time
        '''
        self._numValuesSeen = 0
        self.values = {}
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
'''
|
|
426
|
+
Helper class for watching a field group and storing fields values returned from it
|
|
427
|
+
'''
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
class DcgmFieldGroupEntityWatcher(DcgmFieldValueEntityCollection):
    '''
    Watches a field group and stores the per-entity field values returned from it

    Constructor arguments:
    handle is a DCGM handle from dcgm_agent.dcgmInit()
    groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
    fieldGroup is the DcgmFieldGroup() instance to watch fields for
    operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
    updateFreq is how often to update each field in usec
    maxKeepAge is how long DCGM should keep values for in seconds
    maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
    startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
                   previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
                   0=start with all cached data
    '''

    def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
                 maxKeepAge, maxKeepSamples, startTimestamp):
        super().__init__(handle, groupId)

        self._fieldGroup = fieldGroup
        self._operationMode = operationMode
        self._updateFreq = updateFreq
        self._maxKeepAge = maxKeepAge
        self._maxKeepSamples = maxKeepSamples

        #Anything that is not a positive timestamp means start from beginning of time
        self._nextSinceTimestamp = startTimestamp if startTimestamp > 0 else 0

        #Start watches
        self._WatchFieldGroup()

    def _WatchFieldGroup(self):
        '''
        Initiate the host engine watch on the fields
        '''
        watchStatus = dcgm_agent.dcgmWatchFields(
            self._handle, self._groupId, self._fieldGroup.fieldGroupId,
            self._updateFreq, self._maxKeepAge, self._maxKeepSamples)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(watchStatus)

        # Force an update of the fields so that we can fetch initial values.
        updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(updateStatus)

        # Initial update will fetch from startTimestamp.
        self.GetAllSinceLastCall()

    def GetAllSinceLastCall(self):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved
        '''
        manualMode = (
            self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL)

        #If we're in manual mode, force an update
        if manualMode:
            updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
            #Will throw exception on error
            dcgm_structs._dcgmCheckReturn(updateStatus)

        return super().GetAllSinceLastCall(self._fieldGroup)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
#Test program for demonstrating how this module works
|
|
495
|
+
def main():
    """
    Demonstration loop for this module.

    Starts an embedded host engine in auto mode, watches the SM and memory
    clock fields on all GPUs with both a DcgmFieldGroupWatcher and a
    DcgmFieldGroupEntityWatcher, and once per timeStep prints how many new
    values each watcher received plus a per-field summary. Loops forever.
    """
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  # Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)
    maxKeepAge = 3600.0  # 1 hour
    maxKeepSamples = 0  # unlimited. maxKeepAge will enforce quota
    startTimestamp = 0  # beginning of time

    # Both watchers take exactly the same positional arguments.
    watcherArgs = (handle, groupId, fieldGroup, operationMode, updateFreq,
                   maxKeepAge, maxKeepSamples, startTimestamp)
    dfcw = DcgmFieldGroupWatcher(*watcherArgs)
    dfcw2 = DcgmFieldGroupEntityWatcher(*watcherArgs)

    fieldSummary = " fieldId %d: %d values. latest timestamp %d"

    while True:
        gpuUpdateCount = dfcw.GetAllSinceLastCall()
        entityUpdateCount = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (gpuUpdateCount, entityUpdateCount))

        # dfcw.values is indexed [gpuId][fieldId]
        for gpuId, gpuFields in list(dfcw.values.items()):
            print("gpuId %d" % gpuId)
            for fieldId, series in list(gpuFields.items()):
                print(fieldSummary % (fieldId, len(series), series[-1].ts))

        # dfcw2.values is indexed [entityGroupId][entityId][fieldId]
        for entityGroupId, entities in list(dfcw2.values.items()):
            print("entityGroupId %d" % entityGroupId)
            for entityId, entityFields in list(entities.items()):
                print(" entityId %d" % entityId)
                for fieldId, series in list(entityFields.items()):
                    print(fieldSummary %
                          (fieldId, len(series), series[-1].ts))

        time.sleep(timeStep)
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
# Run the demonstration only when this file is executed as a script;
# importing the module must not start the watch loop.
if __name__ == "__main__":
    main()
|