PyPI - triton-model-analyzer - Versions diffs - 1.48.0__py3-none-any.whl - Mend

triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (204) hide show

model_analyzer/__init__.py +15 -0
model_analyzer/analyzer.py +448 -0
model_analyzer/cli/__init__.py +15 -0
model_analyzer/cli/cli.py +193 -0
model_analyzer/config/__init__.py +15 -0
model_analyzer/config/generate/__init__.py +15 -0
model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
model_analyzer/config/generate/base_model_config_generator.py +352 -0
model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
model_analyzer/config/generate/brute_run_config_generator.py +154 -0
model_analyzer/config/generate/concurrency_sweeper.py +75 -0
model_analyzer/config/generate/config_generator_interface.py +52 -0
model_analyzer/config/generate/coordinate.py +143 -0
model_analyzer/config/generate/coordinate_data.py +86 -0
model_analyzer/config/generate/generator_utils.py +116 -0
model_analyzer/config/generate/manual_model_config_generator.py +187 -0
model_analyzer/config/generate/model_config_generator_factory.py +92 -0
model_analyzer/config/generate/model_profile_spec.py +74 -0
model_analyzer/config/generate/model_run_config_generator.py +154 -0
model_analyzer/config/generate/model_variant_name_manager.py +150 -0
model_analyzer/config/generate/neighborhood.py +536 -0
model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
model_analyzer/config/generate/quick_run_config_generator.py +753 -0
model_analyzer/config/generate/run_config_generator_factory.py +329 -0
model_analyzer/config/generate/search_config.py +112 -0
model_analyzer/config/generate/search_dimension.py +73 -0
model_analyzer/config/generate/search_dimensions.py +85 -0
model_analyzer/config/generate/search_parameter.py +49 -0
model_analyzer/config/generate/search_parameters.py +388 -0
model_analyzer/config/input/__init__.py +15 -0
model_analyzer/config/input/config_command.py +483 -0
model_analyzer/config/input/config_command_profile.py +1747 -0
model_analyzer/config/input/config_command_report.py +267 -0
model_analyzer/config/input/config_defaults.py +236 -0
model_analyzer/config/input/config_enum.py +83 -0
model_analyzer/config/input/config_field.py +216 -0
model_analyzer/config/input/config_list_generic.py +112 -0
model_analyzer/config/input/config_list_numeric.py +151 -0
model_analyzer/config/input/config_list_string.py +111 -0
model_analyzer/config/input/config_none.py +71 -0
model_analyzer/config/input/config_object.py +129 -0
model_analyzer/config/input/config_primitive.py +81 -0
model_analyzer/config/input/config_status.py +75 -0
model_analyzer/config/input/config_sweep.py +83 -0
model_analyzer/config/input/config_union.py +113 -0
model_analyzer/config/input/config_utils.py +128 -0
model_analyzer/config/input/config_value.py +243 -0
model_analyzer/config/input/objects/__init__.py +15 -0
model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
model_analyzer/config/input/objects/config_plot.py +198 -0
model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
model_analyzer/config/input/yaml_config_validator.py +82 -0
model_analyzer/config/run/__init__.py +15 -0
model_analyzer/config/run/model_run_config.py +313 -0
model_analyzer/config/run/run_config.py +168 -0
model_analyzer/constants.py +76 -0
model_analyzer/device/__init__.py +15 -0
model_analyzer/device/device.py +24 -0
model_analyzer/device/gpu_device.py +87 -0
model_analyzer/device/gpu_device_factory.py +248 -0
model_analyzer/entrypoint.py +307 -0
model_analyzer/log_formatter.py +65 -0
model_analyzer/model_analyzer_exceptions.py +24 -0
model_analyzer/model_manager.py +255 -0
model_analyzer/monitor/__init__.py +15 -0
model_analyzer/monitor/cpu_monitor.py +69 -0
model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
model_analyzer/monitor/dcgm/__init__.py +15 -0
model_analyzer/monitor/dcgm/common/__init__.py +13 -0
model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
model_analyzer/monitor/dcgm/pydcgm.py +47 -0
model_analyzer/monitor/monitor.py +143 -0
model_analyzer/monitor/remote_monitor.py +137 -0
model_analyzer/output/__init__.py +15 -0
model_analyzer/output/file_writer.py +63 -0
model_analyzer/output/output_writer.py +42 -0
model_analyzer/perf_analyzer/__init__.py +15 -0
model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
model_analyzer/perf_analyzer/perf_config.py +479 -0
model_analyzer/plots/__init__.py +15 -0
model_analyzer/plots/detailed_plot.py +266 -0
model_analyzer/plots/plot_manager.py +224 -0
model_analyzer/plots/simple_plot.py +213 -0
model_analyzer/record/__init__.py +15 -0
model_analyzer/record/gpu_record.py +68 -0
model_analyzer/record/metrics_manager.py +887 -0
model_analyzer/record/record.py +280 -0
model_analyzer/record/record_aggregator.py +256 -0
model_analyzer/record/types/__init__.py +15 -0
model_analyzer/record/types/cpu_available_ram.py +93 -0
model_analyzer/record/types/cpu_used_ram.py +93 -0
model_analyzer/record/types/gpu_free_memory.py +96 -0
model_analyzer/record/types/gpu_power_usage.py +107 -0
model_analyzer/record/types/gpu_total_memory.py +96 -0
model_analyzer/record/types/gpu_used_memory.py +96 -0
model_analyzer/record/types/gpu_utilization.py +108 -0
model_analyzer/record/types/inter_token_latency_avg.py +60 -0
model_analyzer/record/types/inter_token_latency_base.py +74 -0
model_analyzer/record/types/inter_token_latency_max.py +60 -0
model_analyzer/record/types/inter_token_latency_min.py +60 -0
model_analyzer/record/types/inter_token_latency_p25.py +60 -0
model_analyzer/record/types/inter_token_latency_p50.py +60 -0
model_analyzer/record/types/inter_token_latency_p75.py +60 -0
model_analyzer/record/types/inter_token_latency_p90.py +60 -0
model_analyzer/record/types/inter_token_latency_p95.py +60 -0
model_analyzer/record/types/inter_token_latency_p99.py +60 -0
model_analyzer/record/types/output_token_throughput.py +105 -0
model_analyzer/record/types/perf_client_response_wait.py +97 -0
model_analyzer/record/types/perf_client_send_recv.py +97 -0
model_analyzer/record/types/perf_latency.py +111 -0
model_analyzer/record/types/perf_latency_avg.py +60 -0
model_analyzer/record/types/perf_latency_base.py +74 -0
model_analyzer/record/types/perf_latency_p90.py +60 -0
model_analyzer/record/types/perf_latency_p95.py +60 -0
model_analyzer/record/types/perf_latency_p99.py +60 -0
model_analyzer/record/types/perf_server_compute_infer.py +97 -0
model_analyzer/record/types/perf_server_compute_input.py +97 -0
model_analyzer/record/types/perf_server_compute_output.py +97 -0
model_analyzer/record/types/perf_server_queue.py +97 -0
model_analyzer/record/types/perf_throughput.py +105 -0
model_analyzer/record/types/time_to_first_token_avg.py +60 -0
model_analyzer/record/types/time_to_first_token_base.py +74 -0
model_analyzer/record/types/time_to_first_token_max.py +60 -0
model_analyzer/record/types/time_to_first_token_min.py +60 -0
model_analyzer/record/types/time_to_first_token_p25.py +60 -0
model_analyzer/record/types/time_to_first_token_p50.py +60 -0
model_analyzer/record/types/time_to_first_token_p75.py +60 -0
model_analyzer/record/types/time_to_first_token_p90.py +60 -0
model_analyzer/record/types/time_to_first_token_p95.py +60 -0
model_analyzer/record/types/time_to_first_token_p99.py +60 -0
model_analyzer/reports/__init__.py +15 -0
model_analyzer/reports/html_report.py +195 -0
model_analyzer/reports/pdf_report.py +50 -0
model_analyzer/reports/report.py +86 -0
model_analyzer/reports/report_factory.py +62 -0
model_analyzer/reports/report_manager.py +1376 -0
model_analyzer/reports/report_utils.py +42 -0
model_analyzer/result/__init__.py +15 -0
model_analyzer/result/constraint_manager.py +150 -0
model_analyzer/result/model_config_measurement.py +354 -0
model_analyzer/result/model_constraints.py +105 -0
model_analyzer/result/parameter_search.py +246 -0
model_analyzer/result/result_manager.py +430 -0
model_analyzer/result/result_statistics.py +159 -0
model_analyzer/result/result_table.py +217 -0
model_analyzer/result/result_table_manager.py +646 -0
model_analyzer/result/result_utils.py +42 -0
model_analyzer/result/results.py +277 -0
model_analyzer/result/run_config_measurement.py +658 -0
model_analyzer/result/run_config_result.py +210 -0
model_analyzer/result/run_config_result_comparator.py +110 -0
model_analyzer/result/sorted_results.py +151 -0
model_analyzer/state/__init__.py +15 -0
model_analyzer/state/analyzer_state.py +76 -0
model_analyzer/state/analyzer_state_manager.py +215 -0
model_analyzer/triton/__init__.py +15 -0
model_analyzer/triton/client/__init__.py +15 -0
model_analyzer/triton/client/client.py +234 -0
model_analyzer/triton/client/client_factory.py +57 -0
model_analyzer/triton/client/grpc_client.py +104 -0
model_analyzer/triton/client/http_client.py +107 -0
model_analyzer/triton/model/__init__.py +15 -0
model_analyzer/triton/model/model_config.py +556 -0
model_analyzer/triton/model/model_config_variant.py +29 -0
model_analyzer/triton/server/__init__.py +15 -0
model_analyzer/triton/server/server.py +76 -0
model_analyzer/triton/server/server_config.py +269 -0
model_analyzer/triton/server/server_docker.py +229 -0
model_analyzer/triton/server/server_factory.py +306 -0
model_analyzer/triton/server/server_local.py +158 -0
triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0

model_analyzer/monitor/dcgm/DcgmReader.py ADDED Viewed

@@ -0,0 +1,623 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+import signal, os
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+import threading
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
+import sys
+import logging
+defaultFieldIds = [
+    dcgm_fields.DCGM_FI_DEV_POWER_USAGE, dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
+    dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
+    dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
+    dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
+    dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_FB_FREE, dcgm_fields.DCGM_FI_DEV_FB_USED,
+    dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
+    dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
+    dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
+    dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
+    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_MEM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
+    dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
+    dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
+    dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
+    dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
+    dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
+]
+def entity_group_id_to_string(entityGroupId):
+    if entityGroupId == dcgm_fields.DCGM_FE_GPU:
+        return 'GPU'
+    elif entityGroupId == dcgm_fields.DCGM_FE_VGPU:
+        return 'VGPU'
+    elif entityGroupId == dcgm_fields.DCGM_FE_SWITCH:
+        return 'NVSWITCH'
+    elif entityGroupId == dcgm_fields.DCGM_FE_GPU_I:
+        return 'GPU INSTANCE'
+    elif entityGroupId == dcgm_fields.DCGM_FE_GPU_CI:
+        return 'COMPUTE INSTANCE'
+    elif entityGroupId == dcgm_fields.DCGM_FE_LINK:
+        return 'LINK'
+    else:
+        return ''
+class DcgmReader(object):
+    ###########################################################################
+    '''
+    This function can be implemented as a callback in the class that inherits from DcgmReader
+    to handle each field individually.
+    By default, it prints a string with the gpu, field tag, and value to stdout
+    @params:
+    gpuId : the id of the GPU this field is reporting on
+    fieldId : the id of the field (ignored by default, may be useful for children)
+    fieldTag : the string representation of the field id
+    val : the value class that comes from DCGM (v.value is the value for the field)
+    '''
+    def CustomFieldHandler(self, gpuId, fieldId, fieldTag, val):
+        print("GPU %s field %s=%s" % (str(gpuId), fieldTag, str(val.value)))
+    ###########################################################################
+    '''
+    This function can be implemented as a callback in the class that inherits from DcgmReader
+    to handle each field individually.
+    By default, it prints a string with the entity group, entity id, field tag, and value to stdout
+    @params:
+    entityGroupId : the type of entity this field is reporting on
+    entityId : the id of the entity this field is reporting on
+    fieldId : the id of the field (ignored by default, may be useful for children)
+    fieldTag : the string representation of the field id
+    val : the value class that comes from DCGM (v.value is the value for the field)
+    '''
+    def CustomFieldHandler_v2(self, entityGroupId, entityId, fieldId, fieldTag,
+                              val):
+        print("%s %s field %s=%s" % (entity_group_id_to_string(entityGroupId),
+                                     str(entityId), fieldTag, str(val.value)))
+    ###########################################################################
+    '''
+    This function can be implemented as a callback in the class that inherits from DcgmReader
+    to handle all of the data queried from DCGM.
+    By default, it will simply print the field tags and values for each GPU
+    @params:
+    fvs : Data in the format entityGroupId -> entityId -> values (dictionary of dictionaries)
+    '''
+    def CustomDataHandler_v2(self, fvs):
+        for entityGroupId in list(fvs.keys()):
+            entityGroup = fvs[entityGroupId]
+            for entityId in list(entityGroup.keys()):
+                entityFv = entityGroup[entityId]
+                for fieldId in list(entityFv.keys()):
+                    if fieldId in self.m_dcgmIgnoreFields:
+                        continue
+                    val = entityFv[fieldId][-1]
+                    if val.isBlank:
+                        continue
+                    fieldTag = self.m_fieldIdToInfo[fieldId].tag
+                    self.CustomFieldHandler_v2(entityGroupId, entityId, fieldId,
+                                               fieldTag, val)
+    ###########################################################################
+    '''
+    This function can be implemented as a callback in the class that inherits from DcgmReader
+    to handle all of the data queried from DCGM.
+    By default, it will simply print the field tags and values for each GPU
+    @params:
+    fvs : Dictionary with gpuID as key and values as Value
+    '''
+    def CustomDataHandler(self, fvs):
+        for gpuId in list(fvs.keys()):
+            gpuFv = fvs[gpuId]
+            for fieldId in list(gpuFv.keys()):
+                if fieldId in self.m_dcgmIgnoreFields:
+                    continue
+                val = gpuFv[fieldId][-1]
+                if val.isBlank:
+                    continue
+                fieldTag = self.m_fieldIdToInfo[fieldId].tag
+                self.CustomFieldHandler(gpuId, fieldId, fieldTag, val)
+    ###########################################################################
+    def SetupGpuIdUUIdMappings(self):
+        '''
+        Populate the m_gpuIdToUUId map
+        '''
+        gpuIds = self.m_dcgmGroup.GetGpuIds()
+        for gpuId in gpuIds:
+            gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
+            self.m_gpuIdToUUId[gpuId] = gpuInfo.identifiers.uuid
+    ###########################################################################
+    '''
+    Constructor
+    @params:
+    hostname        : Address:port of the host to connect. Defaults to localhost
+    fieldIds        : List of the field ids to publish. If it isn't specified, our default list is used.
+    updateFrequency : Frequency of update in microseconds. Defaults to 10 seconds or 10000000 microseconds
+    maxKeepAge      : Max time to keep data from NVML, in seconds. Default is 3600.0 (1 hour)
+    ignoreList      : List of the field ids we want to query but not publish.
+    gpuIds          : List of GPU IDs to monitor. If not provided, DcgmReader will monitor all GPUs on the system
+    fieldIntervalMap: Map of intervals to list of field numbers to monitor. Takes precedence over fieldIds and updateFrequency if not None.
+    '''
+    def __init__(self,
+                 hostname='localhost',
+                 fieldIds=None,
+                 updateFrequency=10000000,
+                 maxKeepAge=3600.0,
+                 ignoreList=None,
+                 fieldGroupName='dcgm_fieldgroupData',
+                 gpuIds=None,
+                 entities=None,
+                 fieldIntervalMap=None):
+        fieldIds = fieldIds or defaultFieldIds
+        ignoreList = ignoreList or []
+        self.m_dcgmHostName = hostname
+        self.m_updateFreq = updateFrequency  # default / redundant
+        self.m_fieldGroupName = fieldGroupName
+        self.m_publishFields = {}
+        if fieldIntervalMap is not None:
+            self.m_publishFields = fieldIntervalMap
+        else:
+            self.m_publishFields[self.m_updateFreq] = fieldIds
+        self.m_requestedGpuIds = gpuIds
+        self.m_requestedEntities = entities
+        self.m_dcgmIgnoreFields = ignoreList  #Fields not to publish
+        self.m_maxKeepAge = maxKeepAge
+        self.m_dcgmHandle = None
+        self.m_dcgmSystem = None
+        self.m_dcgmGroup = None
+        self.m_closeHandle = False
+        self.m_gpuIdToBusId = {}  #GpuID => PCI-E busId string
+        self.m_gpuIdToUUId = {}  # FieldId => dcgm_fields.dcgm_field_meta_t
+        self.m_fieldIdToInfo = {}  #FieldId => dcgm_fields.dcgm_field_meta_t
+        self.m_lock = threading.Lock(
+        )  #DCGM connection start-up/shutdown is not thread safe. Just lock pessimistically
+        self.m_debug = False
+        # For GetAllSinceLastCall* calls. We cache the value for these objects
+        # after first retrieval, so initializing them to None lets us know if
+        # we've made a first retrieval. The first retrieval is based on a
+        # "since" timestamp of 0, so it gets data in which we are not
+        # interested in. The second retrieval gets data since the first one, in
+        # which we ARE interested. The practical upshot of this is that actual
+        # reporting of data is delayed one collectd sampling interval -- as if
+        # the sampling was actually started one collectd sampling interval
+        # later. We expect this is not an issue.
+        self.fvs = None
+        self.dfvc = None
+        self.dfvec = None
+    ###########################################################################
+    '''
+    Define what should happen to this object at the beginning of a with
+    block. In this case, nothing more is needed since the constructor should've
+    been called.
+    '''
+    def __enter__(self):
+        """Return this reader so it can be bound by a `with ... as` clause."""
+        return self
+    ###########################################################################
+    '''
+    Define the cleanup
+    '''
+    def __exit__(self, type, value, traceback):
+        """Call Shutdown() when the `with` block ends.
+
+        The exception details are ignored and nothing is returned, so an
+        exception raised inside the block keeps propagating.
+        """
+        self.Shutdown()
+    ###########################################################################
+    '''
+    This function initializes DCGM from the specified directory and connects to
+    the host engine.
+    '''
+    def InitWrapped(self, path=None):
+        """Initialize the DCGM library, then connect via Reconnect().
+
+        path : forwarded to dcgm_structs._dcgmInit as libDcgmPath.
+
+        This method does not take m_lock itself; Init() calls it with the lock
+        held.
+        """
+        dcgm_structs._dcgmInit(libDcgmPath=path)
+        self.Reconnect()
+    ###########################################################################
+    '''
+    This function tries to connect to hostengine and calls initwrapped to initialize
+    the dcgm.
+    '''
+    def Init(self, libpath=None):
+        """Initialize DCGM and connect to the host engine while holding m_lock.
+
+        libpath : forwarded to InitWrapped() as its path argument.
+
+        A DCGM_ST_CONNECTION_NOT_VALID exception is not propagated: it is
+        reported through LogError() and the reader is reset with
+        SetDisconnected().
+        """
+        with self.m_lock:
+            try:
+                self.InitWrapped(path=libpath)
+            except dcgm_structs.dcgmExceptionClass(
+                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+                self.LogError("Can't connect to nv-hostengine. Is it down?")
+                self.SetDisconnected()
+    ###########################################################################
+    '''
+    Delete the DCGM group, DCGM system and DCGM handle and clear the attributes
+    on shutdown.
+    '''
+    def SetDisconnected(self):
+        #Force destructors since DCGM currently doesn't support more than one client connection per process
+        if self.m_dcgmGroup is not None:
+            del (self.m_dcgmGroup)
+            self.m_dcgmGroup = None
+        if self.m_dcgmSystem is not None:
+            del (self.m_dcgmSystem)
+            self.m_dcgmSystem = None
+        if self.m_dcgmHandle is not None:
+            del (self.m_dcgmHandle)
+            self.m_dcgmHandle = None
+    ##########################################################################
+    '''
+    This function calls the SetDisconnected function which disconnects from
+    DCGM and clears DCGM handle and DCGM group.
+    '''
+    def Shutdown(self):
+        with self.m_lock:
+            if self.m_closeHandle == True:
+                self.SetDisconnected()
+    ############################################################################
+    '''
+    Turns debugging output on
+    '''
+    def AddDebugOutput(self):
+        """Set the m_debug flag to True."""
+        # NOTE(review): no code visible in this part of the file reads m_debug;
+        # confirm where the flag is consumed.
+        self.m_debug = True
+    ############################################################################
+    '''
+    '''
+    def InitializeFromHandle(self):
+        self.m_dcgmSystem = self.m_dcgmHandle.GetSystem()
+        if not self.m_requestedGpuIds and not self.m_requestedEntities:
+            self.m_dcgmGroup = self.m_dcgmSystem.GetDefaultGroup()
+        else:
+            groupName = "dcgmreader_%d" % os.getpid()
+            if self.m_requestedGpuIds:
+                self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithGpuIds(
+                    groupName, self.m_requestedGpuIds)
+                if self.m_requestedEntities:
+                    for entity in self.m_requestedEntities:
+                        self.m_dcgmGroup.AddEntity(entity.entityGroupId,
+                                                   entity.entityId)
+            else:
+                self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithEntities(
+                    groupName, self.m_requestedEntities)
+        self.SetupGpuIdBusMappings()
+        self.SetupGpuIdUUIdMappings()
+        self.GetFieldMetadata()
+        self.AddFieldWatches()
+    ############################################################################
+    '''
+    Has DcgmReader use but not own a handle. Currently for the unit tests.
+    '''
+    def SetHandle(self, handle):
+        """Wrap an externally supplied handle and initialize from it.
+
+        handle : passed as the only argument to pydcgm.DcgmHandle.
+
+        m_closeHandle is not changed here, so with its initial value of False
+        Shutdown() will not disconnect a handle set this way.
+        """
+        self.m_dcgmHandle = pydcgm.DcgmHandle(handle)
+        self.InitializeFromHandle()
+    ############################################################################
+    '''
+    Reconnect function checks if connection handle is present. If the handle is
+    none, it creates the handle and gets the default DCGM group. It then maps
+    gpuIds to BusID, set the meta data of the field ids and adds watches to the
+    field Ids mentioned in the idToWatch list.
+    '''
+    def Reconnect(self):
+        if self.m_dcgmHandle is not None:
+            return
+        self.LogDebug("Connection handle is None. Trying to reconnect")
+        self.m_dcgmHandle = pydcgm.DcgmHandle(
+            None, self.m_dcgmHostName, dcgm_structs.DCGM_OPERATION_MODE_AUTO)
+        self.m_closeHandle = True
+        self.LogDebug("Connected to nv-hostengine")
+        self.InitializeFromHandle()
+    ###########################################################################
+    '''
+    Populate the m_gpuIdToBusId map. This map contains mapping from
+    gpuID to the BusID.
+    '''
+    def SetupGpuIdBusMappings(self):
+        self.m_gpuIdToBusId = {}
+        gpuIds = self.m_dcgmGroup.GetGpuIds()
+        for gpuId in gpuIds:
+            gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
+            self.m_gpuIdToBusId[gpuId] = gpuInfo.identifiers.pciBusId
+    ###########################################################################
+    '''
+    Add watches to the fields which are passed in init function in idToWatch
+    list. It also updates the field values for the first time.
+    '''
+    def AddFieldWatches(self):
+        maxKeepSamples = 0  #No limit. Handled by m_maxKeepAge
+        for interval, fieldGroup in self.m_fieldGroups.items():
+            self.LogDebug("AddWatchFields: interval = " + str(interval) + "\n")
+            self.m_dcgmGroup.samples.WatchFields(fieldGroup, interval,
+                                                 self.m_maxKeepAge,
+                                                 maxKeepSamples)
+        self.m_dcgmSystem.UpdateAllFields(1)
+        self.LogDebug("AddWatchFields exit\n")
+    ###########################################################################
+    '''
+    If the groupID already exists, we delete that group and create a new fieldgroup with
+    the fields mentioned in idToWatch. Then information of each field is acquired from its id.
+    '''
+    def GetFieldMetadata(self):
+        self.m_fieldIdToInfo = {}
+        self.m_fieldGroups = {}
+        self.m_fieldGroup = None
+        allFieldIds = []
+        # Initialize groups for all field intervals.
+        self.LogDebug("GetFieldMetaData:\n")
+        intervalIndex = 0
+        for interval, fieldIds in self.m_publishFields.items():
+            self.LogDebug("sampling interval = " + str(interval) + ":\n")
+            for fieldId in fieldIds:
+                self.LogDebug("   fieldId: " + str(fieldId) + "\n")
+            intervalIndex += 1
+            fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
+            findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
+                fieldGroupName)
+            self.LogDebug("fieldGroupName: " + fieldGroupName + "\n")
+            # Remove our field group if it exists already
+            if findByNameId is not None:
+                self.LogDebug("fieldGroupId: " + findByNameId + "\n")
+                delFieldGroup = pydcgm.DcgmFieldGroup(
+                    dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId)
+                delFieldGroup.Delete()
+                del (delFieldGroup)
+            self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
+                self.m_dcgmHandle, fieldGroupName, fieldIds)
+            for fieldId in fieldIds:
+                if fieldId not in allFieldIds:
+                    allFieldIds += [fieldId]
+                self.m_fieldIdToInfo[
+                    fieldId] = self.m_dcgmSystem.fields.GetFieldById(fieldId)
+                if self.m_fieldIdToInfo[fieldId] == 0 or self.m_fieldIdToInfo[
+                        fieldId] == None:
+                    self.LogError(
+                        "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
+                        % (fieldId))
+                    raise dcgm_structs.DCGMError(
+                        dcgm_structs.DCGM_ST_UNKNOWN_FIELD)
+        # Initialize a field group of ALL fields.
+        fieldGroupName = self.m_fieldGroupName
+        findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)
+        # Remove our field group if it exists already
+        if findByNameId is not None:
+            delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
+                                                  fieldGroupId=findByNameId)
+            delFieldGroup.Delete()
+            del (delFieldGroup)
+        self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
+                                                  fieldGroupName, allFieldIds)
+    ###########################################################################
+    '''
+    This function attempts to connect to DCGM and calls the implemented
+    CustomDataHandler in the child class with field values.
+    @params:
+    self.m_dcgmGroup.samples.GetLatest(self.m_fieldGroup).values : The field
+    values for each field. This dictionary contains fieldInfo for each field id
+    requested to be watched.
+    '''
+    def Process(self):
+        with self.m_lock:
+            try:
+                self.Reconnect()
+                # The first call just clears the collection set.
+                if not self.m_requestedEntities:
+                    self.dfvc = self.m_dcgmGroup.samples.GetAllSinceLastCall(
+                        self.dfvc, self.m_fieldGroup)
+                    self.CustomDataHandler(self.dfvc.values)
+                    self.dfvc.EmptyValues()
+                else:
+                    self.dfvec = self.m_dcgmGroup.samples.GetAllSinceLastCall_v2(
+                        self.dfvec, self.m_fieldGroup)
+                    self.CustomDataHandler_v2(self.dfvec.values)
+                    self.dfvec.EmptyValues()
+            except dcgm_structs.dcgmExceptionClass(
+                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+                self.LogError("Can't connect to nv-hostengine. Is it down?")
+                self.SetDisconnected()
+    ###########################################################################
+    def LogInfo(self, msg):
+        """Forward msg to the module-level logging.info()."""
+        logging.info(msg)
+    ###########################################################################
+    def LogDebug(self, msg):
+        """Forward msg to the module-level logging.debug()."""
+        logging.debug(msg)
+    ###########################################################################
+    def LogError(self, msg):
+        """Forward msg to the module-level logging.error()."""
+        logging.error(msg)
+    ###########################################################################
+    '''
+    This function gets each value as a dictionary of dictionaries. The dictionary
+    returned is each gpu id mapped to a dictionary of its field values. Each
+    field value dictionary is the field name mapped to the value or the field
+    id mapped to the value depending on the parameter mapById.
+    '''
+    def GetLatestGpuValuesAsDict(self, mapById):
+        systemDictionary = {}
+        with self.m_lock:
+            try:
+                self.Reconnect()
+                fvs = self.m_dcgmGroup.samples.GetLatest(
+                    self.m_fieldGroup).values
+                for gpuId in list(fvs.keys()):
+                    systemDictionary[gpuId] = {
+                    }  # initialize the gpu's dictionary
+                    gpuFv = fvs[gpuId]
+                    for fieldId in list(gpuFv.keys()):
+                        val = gpuFv[fieldId][-1]
+                        if val.isBlank:
+                            continue
+                        if mapById == False:
+                            fieldTag = self.m_fieldIdToInfo[fieldId].tag
+                            systemDictionary[gpuId][
+                                fieldTag] = val.value if isinstance(
+                                    val.value, bytes) else val.value
+                        else:
+                            systemDictionary[gpuId][
+                                fieldId] = val.value if isinstance(
+                                    val.value, bytes) else val.value
+            except dcgm_structs.dcgmExceptionClass(
+                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+                self.LogError(
+                    "Can't connection to nv-hostengine. Please verify that it is running."
+                )
+                self.SetDisconnected()
+        return systemDictionary
+    ###########################################################################
+    '''
+    This function gets value as a dictionary of dictionaries of lists. The
+    dictionary returned is each gpu id mapped to a dictionary of its field
+    value lists. Each field value dictionary is the field name mapped to the
+    list of values or the field id mapped to the list of values depending on
+    the parameter mapById. The list of values are the values for each field
+    since the last retrieval.
+    '''
+    def GetAllGpuValuesAsDictSinceLastCall(self, mapById):
+        systemDictionary = {}
+        with self.m_lock:
+            try:
+                self.Reconnect()
+                report = self.fvs is not None
+                self.fvs = self.m_dcgmGroup.samples.GetAllSinceLastCall(
+                    self.fvs, self.m_fieldGroup)
+                if report:
+                    for gpuId in list(self.fvs.values.keys()):
+                        systemDictionary[gpuId] = {
+                        }  # initialize the gpu's dictionary
+                        gpuFv = self.fvs.values[gpuId]
+                        for fieldId in list(gpuFv.keys()):
+                            for val in gpuFv[fieldId]:
+                                if val.isBlank:
+                                    continue
+                                if mapById == False:
+                                    fieldTag = self.m_fieldIdToInfo[fieldId].tag
+                                    if not fieldTag in systemDictionary[gpuId]:
+                                        systemDictionary[gpuId][fieldTag] = []
+                                    systemDictionary[gpuId][fieldTag].append(
+                                        val)
+                                else:
+                                    if not fieldId in systemDictionary[gpuId]:
+                                        systemDictionary[gpuId][fieldId] = []
+                                    systemDictionary[gpuId][fieldId].append(val)
+            except dcgm_structs.dcgmExceptionClass(
+                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+                self.LogError(
+                    "Can't connection to nv-hostengine. Please verify that it is running."
+                )
+                self.SetDisconnected()
+        if self.fvs is not None:
+            self.fvs.EmptyValues()
+        return systemDictionary
+    ###########################################################################
+    def GetLatestGpuValuesAsFieldIdDict(self):
+        """Return GetLatestGpuValuesAsDict(True): latest values keyed by field id."""
+        return self.GetLatestGpuValuesAsDict(True)
+    ###########################################################################
+    def GetLatestGpuValuesAsFieldNameDict(self):
+        """Return GetLatestGpuValuesAsDict(False): latest values keyed by field tag."""
+        return self.GetLatestGpuValuesAsDict(False)
+    ###########################################################################
+    def GetAllGpuValuesAsFieldIdDictSinceLastCall(self):
+        """Return GetAllGpuValuesAsDictSinceLastCall(True): lists keyed by field id."""
+        return self.GetAllGpuValuesAsDictSinceLastCall(True)
+    ###########################################################################
+    def GetAllGpuValuesAsFieldNameDictSinceLastCall(self):
+        """Return GetAllGpuValuesAsDictSinceLastCall(False): lists keyed by field tag."""
+        return self.GetAllGpuValuesAsDictSinceLastCall(False)

model_analyzer/monitor/dcgm/DcgmStatus.py ADDED Viewed

@@ -0,0 +1,57 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import model_analyzer.monitor.dcgm.pydcgm as pydcgm
+import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+class DcgmStatus:
+    """Holds a DCGM status handle and the errors popped from it so far.
+
+    The handle is created with dcgm_agent.dcgmStatusCreate() in __init__ and
+    passed to dcgm_agent.dcgmStatusDestroy() in __del__.
+    """
+    def __init__(self):
+        # Status handle obtained from dcgm_agent; destroyed again in __del__.
+        self.handle = dcgm_agent.dcgmStatusCreate()
+        # Errors popped from the handle so far (appended by UpdateErrors).
+        self.errors = []
+    def __del__(self):
+        # NOTE(review): if dcgmStatusCreate() raised in __init__, self.handle
+        # was never assigned and this lookup fails during finalization -
+        # confirm whether that case needs guarding.
+        dcgm_agent.dcgmStatusDestroy(self.handle)
+    '''
+    Take any errors stored in our handle and update self.errors with them
+    '''
+    def UpdateErrors(self):
+        # Number of errors currently stored in the handle.
+        errorCount = dcgm_agent.dcgmStatusGetCount(self.handle)
+        if errorCount < 1:
+            return
+        # Pop exactly errorCount entries, appending each to self.errors.
+        for i in range(errorCount):
+            self.errors.append(dcgm_agent.dcgmStatusPopError(self.handle))
+    '''
+    Throw an exception if any errors are stored in our status handle
+    The exception text will contain all of the errors
+    '''
+    def ThrowExceptionOnErrors(self):
+        #Make sure we've captured all errors before looking at them
+        self.UpdateErrors()
+        if len(self.errors) < 1:
+            return
+        # NOTE(review): the raise sits inside the loop, so the exception is
+        # raised for the first stored error only and errorString is built but
+        # never used - this does not match the "contain all of the errors"
+        # description above; confirm the intended behaviour.
+        errorString = "Errors: "
+        for value in self.errors:
+            errorString += "\"%s\"" % value
+            raise dcgm_structs.DCGMError(value.status)