PyPI - mindstudio-probe - Versions diffs - 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl - Mend

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/METADATA +2 -2
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/RECORD +90 -79
msprobe/README.md +7 -5
msprobe/core/common/const.py +6 -0
msprobe/core/common/db_manager.py +35 -4
msprobe/core/common/file_utils.py +105 -27
msprobe/core/common/framework_adapter.py +7 -6
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/utils.py +14 -3
msprobe/core/compare/find_first/analyzer.py +8 -7
msprobe/core/compare/find_first/graph.py +11 -3
msprobe/core/compare/find_first/utils.py +2 -1
msprobe/core/compare/highlight.py +13 -6
msprobe/core/compare/multiprocessing_compute.py +17 -10
msprobe/core/compare/utils.py +14 -5
msprobe/core/data_dump/data_collector.py +18 -21
msprobe/core/data_dump/data_processor/pytorch_processor.py +43 -20
msprobe/core/data_dump/json_writer.py +18 -8
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +37 -3
msprobe/core/service.py +18 -5
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +7 -5
msprobe/docs/02.config_introduction.md +14 -1
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/06.data_dump_MindSpore.md +1 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +295 -0
msprobe/docs/10.accuracy_compare_PyTorch.md +46 -5
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/15.free_benchmarking_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +2 -0
msprobe/docs/21.visualization_PyTorch.md +15 -80
msprobe/docs/22.visualization_MindSpore.md +20 -104
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/mindspore/cell_processor.py +33 -5
msprobe/mindspore/compare/common_dir_compare.py +22 -26
msprobe/mindspore/compare/utils.py +1 -2
msprobe/mindspore/debugger/precision_debugger.py +1 -1
msprobe/mindspore/dump/cell_dump_process.py +73 -62
msprobe/mindspore/dump/graph_mode_cell_dump.py +21 -10
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +2 -0
msprobe/msprobe.py +6 -4
msprobe/pytorch/api_accuracy_checker/common/config.py +36 -3
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +24 -0
msprobe/pytorch/api_accuracy_checker/compare/compare.py +12 -2
msprobe/pytorch/api_accuracy_checker/config.yaml +6 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +132 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/__init__.py +0 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +205 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +378 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +239 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +250 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +198 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/common/utils.py +22 -2
msprobe/pytorch/compare/utils.py +3 -3
msprobe/pytorch/debugger/debugger_config.py +10 -0
msprobe/pytorch/dump/module_dump/hook_wrapper.py +34 -7
msprobe/pytorch/dump/module_dump/module_processer.py +23 -10
msprobe/pytorch/hook_module/api_register.py +6 -1
msprobe/pytorch/monitor/module_hook.py +28 -9
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/pt_config.py +57 -2
msprobe/pytorch/pytorch_service.py +11 -2
msprobe/visualization/builder/graph_builder.py +170 -64
msprobe/visualization/builder/graph_merger.py +0 -1
msprobe/visualization/builder/msprobe_adapter.py +1 -1
msprobe/visualization/db_utils.py +25 -2
msprobe/visualization/graph/base_node.py +0 -24
msprobe/visualization/graph/graph.py +5 -14
msprobe/visualization/graph_service.py +29 -53
msprobe/visualization/utils.py +11 -1
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.2.0.dist-info → mindstudio_probe-8.3.0.dist-info}/top_level.txt +0 -0

msprobe/core/common/file_utils.py CHANGED Viewed

@@ -12,29 +12,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import atexit
 import csv
 import fcntl
 import io
+import json
+import multiprocessing
 import os
 import pickle
-from multiprocessing import shared_memory
-import stat
-import json
 import re
 import shutil
+import stat
 import sys
 import zipfile
-import multiprocessing
-import yaml
+from multiprocessing import shared_memory
 import numpy as np
 import pandas as pd
+import yaml
+from msprobe.core.common.const import FileCheckConst, CompareConst, Const
 from msprobe.core.common.decorator import recursion_depth_decorator
-from msprobe.core.common.log import logger
 from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.common.const import FileCheckConst, CompareConst
 from msprobe.core.common.global_lock import global_lock, is_main_process
+from msprobe.core.common.log import logger
 proc_lock = multiprocessing.Lock()
@@ -46,16 +48,15 @@ class FileChecker:
     Attributes:
         file_path: The file or dictionary path to be verified.
         path_type: file or dictionary
-        ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability
+        ability(str): one of [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
         file_type(str): The correct file type for file
     """
-    def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True):
+    def __init__(self, file_path, path_type, ability=None, file_type=None):
         self.file_path = file_path
         self.path_type = self._check_path_type(path_type)
-        self.ability = ability
+        self.ability = self._check_ability_type(ability)
         self.file_type = file_type
-        self.is_script = is_script
     @staticmethod
     def _check_path_type(path_type):
@@ -64,9 +65,17 @@ class FileChecker:
             raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
         return path_type
+    @staticmethod
+    def _check_ability_type(ability):
+        ability_list = [FileCheckConst.READ_ABLE, FileCheckConst.WRITE_ABLE, FileCheckConst.READ_WRITE_ABLE]
+        if ability and ability not in ability_list:
+            logger.error(f'The ability must be one of {ability_list}.')
+            raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR)
+        return ability
     def common_check(self):
         """
-        功能：用户校验基本文件权限：软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符
+        功能：基本文件权限校验，包括文件存在性、软连接、文件长度、文件类型、文件读写权限、文件属组、文件路径特殊字符、文件后缀等
         注意：文件后缀的合法性，非通用操作，可使用其他独立接口实现
         """
         check_path_exists(self.file_path)
@@ -75,13 +84,13 @@ class FileChecker:
         check_path_length(self.file_path)
         check_path_type(self.file_path, self.path_type)
         self.check_path_ability()
-        if self.is_script:
-            check_path_owner_consistent(self.file_path)
+        check_path_owner_consistent(self.file_path)
         check_path_pattern_valid(self.file_path)
         check_common_file_size(self.file_path)
         check_file_suffix(self.file_path, self.file_type)
+        check_path_no_others_write(self.file_path)
         if self.path_type == FileCheckConst.FILE:
-            check_dirpath_before_read(self.file_path)
+            check_dirpath_permission(self.file_path)
         return self.file_path
     def check_path_ability(self):
@@ -137,7 +146,8 @@ class FileOpen:
         check_path_pattern_valid(self.file_path)
         if os.path.exists(self.file_path):
             check_common_file_size(self.file_path)
-            check_dirpath_before_read(self.file_path)
+            check_path_no_others_write(self.file_path)
+            check_dirpath_permission(self.file_path)
     def check_ability_and_owner(self):
         if self.mode in self.SUPPORT_READ_MODE:
@@ -172,7 +182,7 @@ def check_path_exists(path):
     if not os.path.exists(path):
         logger.error('The file path %s does not exist.' % path)
         raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR)
 def check_path_not_exists(path):
     if os.path.exists(path):
@@ -256,12 +266,15 @@ def check_path_type(file_path, file_type):
             raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)
-def check_others_writable(directory):
-    dir_stat = os.stat(directory)
-    is_writable = (
-        bool(dir_stat.st_mode & stat.S_IWGRP) or  # 组可写
-        bool(dir_stat.st_mode & stat.S_IWOTH)     # 其他用户可写
-    )
+def check_group_writable(file_path):
+    path_stat = os.stat(file_path)
+    is_writable = bool(path_stat.st_mode & stat.S_IWGRP)
+    return is_writable
+def check_others_writable(file_path):
+    path_stat = os.stat(file_path)
+    is_writable = bool(path_stat.st_mode & stat.S_IWOTH)
     return is_writable
@@ -309,7 +322,7 @@ def check_path_before_create(path):
                                  'The file path {} contains special characters.'.format(path))
-def check_dirpath_before_read(path):
+def check_dirpath_permission(path):
     path = os.path.realpath(path)
     dirpath = os.path.dirname(path)
     if dedup_log('check_dirpath_before_read', dirpath):
@@ -319,15 +332,16 @@ def check_dirpath_before_read(path):
             check_path_owner_consistent(dirpath)
         except FileCheckException:
             logger.warning(f"The directory {dirpath} is not yours.")
-def check_file_or_directory_path(path, isdir=False):
+def check_file_or_directory_path(path, isdir=False, is_strict=False):
     """
     Function Description:
         check whether the path is valid
     Parameter:
         path: the path to check
         isdir: the path is dir or file
+        is_strict: whether to perform stricter validation (e.g., verify group cannot write to path)
     Exception Description:
         when invalid data throw exception
     """
@@ -337,6 +351,33 @@ def check_file_or_directory_path(path, isdir=False):
         path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE)
     path_checker.common_check()
+    if is_strict:
+        if check_group_writable(path):
+            raise FileCheckException(
+                FileCheckException.FILE_PERMISSION_ERROR,
+                f"The directory/file must not allow write access to group. Directory/File path: {path}"
+            )
+def check_path_no_others_write(file_path):
+    if dedup_log('check_path_no_others_write', file_path):
+        if check_group_writable(file_path):
+            logger.warning(f"The directory/file path is writable by group: {file_path}.")
+    if check_others_writable(file_path):
+        raise FileCheckException(
+            FileCheckException.FILE_PERMISSION_ERROR,
+            f"The directory/file must not allow write access to others. Directory/File path: {file_path}"
+        )
+def check_path_no_group_others_write(file_path):
+    if check_group_writable(file_path) or check_others_writable(file_path):
+        raise FileCheckException(
+            FileCheckException.FILE_PERMISSION_ERROR,
+            f"The directory/file must not allow write access to group or others. Directory/File path: {file_path}"
+        )
 def change_mode(path, mode):
     if not os.path.exists(path) or os.path.islink(path):
@@ -388,6 +429,14 @@ def check_file_type(path):
         raise FileCheckException(FileCheckException.INVALID_FILE_ERROR)
+def root_privilege_warning():
+    if os.getuid() == 0:
+        logger.warning(
+            "msprobe is being run as root. "
+            "To avoid security risks, it is recommended to switch to a regular user to run it."
+        )
 def load_yaml(yaml_path):
     path_checker = FileChecker(yaml_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.YAML_SUFFIX)
     checked_path = path_checker.common_check()
@@ -422,6 +471,26 @@ def load_json(json_path):
     return data
+def load_construct_json(json_path):
+    construct_dict_o = load_json(json_path)
+    if Const.MEGATRON_MICRO_STEP_NUMBER in construct_dict_o:
+        construct_dict = {}
+        micro_step_dict = {Const.MEGATRON_MICRO_STEP_NUMBER: construct_dict_o.get(Const.MEGATRON_MICRO_STEP_NUMBER)}
+        del construct_dict_o[Const.MEGATRON_MICRO_STEP_NUMBER]
+        for key, value in construct_dict_o.items():
+            if isinstance(value, list):
+                if len(value) != 2:
+                    logger.error(f'Parse construct json file "{os.path.basename(json_path)}" failed.')
+                    raise RuntimeError()
+                construct_dict[key] = value[0]
+                micro_step_dict[key] = value[1]
+            else:
+                construct_dict[key] = value
+                micro_step_dict[key] = 0
+        return construct_dict, micro_step_dict
+    return construct_dict_o, {}
 def save_json(json_path, data, indent=None, mode="w"):
     check_path_before_create(json_path)
     json_path = os.path.realpath(json_path)
@@ -520,6 +589,9 @@ def move_directory(src_path, dst_path):
     check_file_or_directory_path(src_path, isdir=True)
     check_path_before_create(dst_path)
     try:
+        if os.path.exists(dst_path):
+            logger.warning(f"The destination directory {dst_path} already exists, it will be removed.")
+            shutil.rmtree(dst_path)
         shutil.move(src_path, dst_path)
     except Exception as e:
         logger.error(f"move directory {src_path} to {dst_path} failed")
@@ -945,7 +1017,13 @@ class SharedDict:
     def _safe_load(self):
         with io.BytesIO(self._shm.buf[:]) as buff:
             try:
-                self._dict = SafeUnpickler(buff).load()
+                data = SafeUnpickler(buff).load()
+                if not isinstance(data, dict):
+                    logger.debug(f"Data from shared memory is '{type(data)}' type, expected 'dict'.")
+                    self._dict = {}
+                    self._changed = True
+                else:
+                    self._dict = data
             except Exception as e:
                 logger.debug(f'shared dict is unreadable, reason: {e}, create new dict.')
                 self._dict = {}

msprobe/core/common/framework_adapter.py CHANGED Viewed

@@ -12,10 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.import functools
 import functools
 from msprobe.core.common.const import Const
-from msprobe.core.common.file_utils import check_file_or_directory_path
-from msprobe.core.common.file_utils import save_npy
+from msprobe.core.common.file_utils import check_file_or_directory_path, save_npy
 class FrameworkDescriptor:
@@ -103,7 +104,7 @@ class FmkAdp:
     @classmethod
     def tensor_norm(cls, tensor):
         return cls.process_tensor(tensor, lambda x: x.norm())
     @classmethod
     def save_tensor(cls, tensor, filepath):
         if cls.fmk == Const.PT_FRAMEWORK:
@@ -151,7 +152,7 @@ class FmkAdp:
     @classmethod
     def load_checkpoint(cls, path, to_cpu=True, weights_only=True):
-        check_file_or_directory_path(path)
+        check_file_or_directory_path(path, is_strict=not weights_only)
         if cls.fmk == Const.PT_FRAMEWORK:
             try:
                 if to_cpu:
@@ -161,9 +162,9 @@ class FmkAdp:
             except Exception as e:
                 raise RuntimeError(f"load pt file {path} failed: {e}") from e
         return mindspore.load_checkpoint(path)
     @classmethod
     def asnumpy(cls, tensor):
         if cls.fmk == Const.PT_FRAMEWORK:
             return tensor.float().numpy()
-        return tensor.float().asnumpy()
+        return tensor.float().asnumpy()

msprobe/core/common/megatron_utils.py ADDED Viewed

@@ -0,0 +1,59 @@
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import wraps
+class MegatronStepInfo:
+    is_megatron = False
+    is_forward = False
+    is_backward = False
+    forward_micro_step = -1
+    backward_micro_step = -1
+    @classmethod
+    def reset(cls):
+        """重置所有类属性到初始状态"""
+        cls.is_megatron = False
+        cls.is_forward = False
+        cls.is_backward = False
+        cls.forward_micro_step = -1
+        cls.backward_micro_step = -1
+def wrap_megatron_step(func, is_forward=True):
+    @wraps(func)
+    def wrapped_func(*args, **kwargs):
+        if not MegatronStepInfo.is_megatron:
+            MegatronStepInfo.is_megatron = True
+        if is_forward:
+            MegatronStepInfo.is_forward = True
+            MegatronStepInfo.is_backward = False
+            MegatronStepInfo.forward_micro_step += 1
+        else:
+            MegatronStepInfo.is_forward = False
+            MegatronStepInfo.is_backward = True
+            MegatronStepInfo.backward_micro_step += 1
+        return func(*args, **kwargs)
+    return wrapped_func
+def get_micro_step():
+    return MegatronStepInfo.forward_micro_step if MegatronStepInfo.is_forward else MegatronStepInfo.backward_micro_step
+def is_megatron():
+    return MegatronStepInfo.is_megatron

msprobe/core/common/utils.py CHANGED Viewed

@@ -28,7 +28,7 @@ import numpy as np
 from msprobe.core.common.const import Const, CompareConst
 from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.common.exceptions import MsprobeException
-from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json)
+from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_path, load_json, load_construct_json)
 from msprobe.core.common.log import logger
 device = collections.namedtuple('device', ['type', 'index'])
@@ -83,7 +83,8 @@ class MsprobeBaseException(Exception):
     INVALID_API_NAME_ERROR = 36
     CROSS_FRAME_ERROR = 37
     MISSING_THRESHOLD_ERROR = 38
-    WRONG_THRESHOLD_ERROR = 38
+    WRONG_THRESHOLD_ERROR = 39
+    MULTIPROCESS_ERROR = 40
     def __init__(self, code, error_info: str = ""):
         super(MsprobeBaseException, self).__init__()
@@ -348,8 +349,18 @@ def get_stack_construct_by_dump_json_path(dump_json_path):
     stack_json = os.path.join(directory, "stack.json")
     construct_json = os.path.join(directory, "construct.json")
+    stack_json_exist = os.path.exists(stack_json)
+    construct_json_exist = os.path.exists(construct_json)
+    if not stack_json_exist and not construct_json_exist:
+        logger.info("stack.json and construct.json not found")
+        return {}, {}
+    if not stack_json_exist or not construct_json_exist:
+        logger.error("stack.json or construct.json not found, please check.")
+        raise CompareException(CompareException.INVALID_PATH_ERROR)
     stack = load_json(stack_json)
-    construct = load_json(construct_json)
+    construct, _ = load_construct_json(construct_json)
     return stack, construct

msprobe/core/compare/find_first/analyzer.py CHANGED Viewed

@@ -47,7 +47,6 @@ class DiffAnalyzer:
             analyze_func()
             if self._diff_nodes:
                 self._gen_analyze_info()
-                self._post_process()
                 return
         logger.info('Cannot find any diff node, no need to generate analyze file.')
@@ -56,12 +55,6 @@ class DiffAnalyzer:
         self._resolve_input_path(self._output_path)
         logger.info("Pre Process completed.")
-    def _post_process(self):
-        for rank_path in self._paths.values():
-            dump_path = rank_path.dump_path
-            logger.debug(f"Remove {dump_path} success")
-        logger.info("Post Process completed.")
     """
     这里需要生成stack，但是直接用dict中自带就行，在op_items.NPU_Stack_Info中
     """
@@ -105,6 +98,8 @@ class DiffAnalyzer:
                 logger.warning(f'Rank {path.rank} has no dump data!')
                 continue
             for op_name, op_data in dump_data.items():
+                if is_ignore_op(op_name):
+                    continue
                 if is_communication_op(op_name):
                     self._first_comm_nodes[path.rank] = op_name
                     break
@@ -131,10 +126,16 @@ class DiffAnalyzer:
         for rank, nodes in list(self._rank_comm_nodes_dict.items())[:-1]:
             searched_ranks.add(rank)
             seen_nodes = set()
+            last_node = None
             for cur_node in nodes.values():
+                is_overflow = last_node and hasattr(last_node, 'layer') and hasattr(cur_node, 'layer') and \
+                last_node.layer >= cur_node.layer
+                if is_overflow:
+                    cur_node.layer = last_node.layer + 1
                 conn_info = cur_node.find_connected_nodes()
                 if not conn_info.get('ranks'):
                     conn_info['ranks'] = self._rank_comm_nodes_dict.keys()
+                last_node = cur_node
                 if not self._find_connection(conn_info, cur_node, searched_ranks, seen_nodes):
                     logger.debug(f'Cannot find connected communication node for "{cur_node.node_id}".')

msprobe/core/compare/find_first/graph.py CHANGED Viewed

@@ -52,19 +52,25 @@ class DataNode:
         metrics = {}
         for cmp_data in self.op_data:
             name = cmp_data.get(CompareConst.NPU_NAME)
+            # 构建度量指标字典
+            metrics = {}
             if CompareConst.NPU_MAX in cmp_data:
                 metrics = {CompareConst.NPU_MAX: cmp_data.get(CompareConst.NPU_MAX),
                         CompareConst.NPU_MIN: cmp_data.get(CompareConst.NPU_MIN),
                         CompareConst.NPU_MEAN: cmp_data.get(CompareConst.NPU_MEAN),
                         CompareConst.NPU_NORM: cmp_data.get(CompareConst.NPU_NORM)}
             elif CompareConst.NPU_MD5 in cmp_data:
-                metrics = {CompareConst.NPU_MD5: cmp_data.get(CompareConst.NPU_MD5)}
+                metrics[CompareConst.NPU_MD5] = cmp_data.get(CompareConst.NPU_MD5)
+            if CompareConst.NPU_P2POP_PEER in cmp_data:
+                metrics[CompareConst.NPU_P2POP_PEER] = cmp_data.get(CompareConst.NPU_P2POP_PEER)
             if cmp_data.get(CompareConst.STACK) != CompareConst.N_A and not self.stack:
                 self.stack = cmp_data.get(CompareConst.STACK)
-            if Const.INPUT in name:
+            if cmp_data.get('state') == "input":
                 self.inputs[name] = metrics
-            elif Const.OUTPUT in name:
+            elif cmp_data.get('state') == "output":
                 self.outputs[name] = metrics
     def gen_node_info(self, path: RankPath):
@@ -161,6 +167,8 @@ class CommunicationNode:
                 if val and val.startswith('[') and val.endswith(']'):
                     val = [int(part) for part in val.strip('[]').split(',')]
                     ranks.update(val)
+            elif v.get(CompareConst.NPU_P2POP_PEER) != "None":
+                ranks.add(v.get(CompareConst.NPU_P2POP_PEER))
         return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
                 'type': DiffAnalyseConst.OPPOSITE_DIR.get(self.type, DiffAnalyseConst.LINK)}

msprobe/core/compare/find_first/utils.py CHANGED Viewed

@@ -120,7 +120,8 @@ def is_communication_op(op_name):
 def is_ignore_op(op_name):
     ignore_keywords = [
         'Torch.empty',
-        'Torch.fill'
+        'Torch.fill',
+        'Tensor.__setitem__'
     ]
     return any(keyword in op_name for keyword in ignore_keywords)

msprobe/core/compare/highlight.py CHANGED Viewed

@@ -26,7 +26,7 @@ from tqdm import tqdm
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.core.common.file_utils import save_workbook
 from msprobe.core.common.log import logger
-from msprobe.core.common.utils import get_header_index
+from msprobe.core.common.utils import get_header_index, CompareException
 from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches
 from msprobe.core.compare.config import ModeConfig
@@ -359,18 +359,25 @@ class HighLight:
         def err_call(args):
             logger.error("Multiprocessing malicious value check failed! Reason: {}".format(args))
-            try:
-                pool.close()
-            except OSError:
-                logger.error("Pool terminate failed")
         result_df_columns = result_df.columns.tolist()
         for column in result_df_columns:
             self.value_check(column)
+        async_results = []
         for df_chunk in chunks:
-            pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            result = pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            async_results.append(result)
         pool.close()
+        for ar in async_results:
+            try:
+                ar.get(timeout=3600)
+            except Exception as e:
+                logger.error(f"Task failed with exception: {e}")
+                pool.terminate()
+                raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
         pool.join()
     def df_malicious_value_check(self, result_df):

msprobe/core/compare/multiprocessing_compute.py CHANGED Viewed

@@ -52,16 +52,20 @@ def _ms_graph_handle_multi_process(func, result_df, mode):
     def err_call(args):
         logger.error('multiprocess compare failed! Reason: {}'.format(args))
-        try:
-            pool.close()
-        except OSError as e:
-            logger.error(f'pool terminate failed: {str(e)}')
     for df_chunk in df_chunks:
         result = pool.apply_async(func, args=(df_chunk, mode), error_callback=err_call)
         results.append(result)
-    final_results = [r.get() for r in results]
     pool.close()
+    try:
+        final_results = [r.get(timeout=3600) for r in results]
+    except Exception as e:
+        logger.error(f"Task failed with exception: {e}")
+        pool.terminate()
+        raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
     pool.join()
     return pd.concat(final_results, ignore_index=True)
@@ -277,10 +281,6 @@ class CompareRealData:
         def err_call(args):
             logger.error('multiprocess compare failed! Reason: {}'.format(args))
-            try:
-                pool.close()
-            except OSError:
-                logger.error("pool terminate failed")
         progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100)
@@ -298,7 +298,14 @@ class CompareRealData:
                                       )
             results.append(result)
-        final_results = [r.get() for r in results]
         pool.close()
+        try:
+            final_results = [r.get(timeout=3600) for r in results]
+        except Exception as e:
+            logger.error(f"Task failed with exception: {e}")
+            pool.terminate()
+            raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
         pool.join()
         return pd.concat(final_results, ignore_index=True)

msprobe/core/compare/utils.py CHANGED Viewed

@@ -695,10 +695,6 @@ def get_sorted_ranks(npu_dump_dir, bench_dump_dir):
 def multi_statistics_compare(func, func_args):
     def err_call(args):
         logger.error(f'Multiprocess statistics compare failed! Reason: {args}')
-        try:
-            pool.close()
-        except OSError:
-            logger.error("Pool terminate failed")
     compare_func, input_param_nr_list, output_path, kwargs = func_args
@@ -715,9 +711,22 @@ def multi_statistics_compare(func, func_args):
             chunks[i].append(input_param_nr_list[param_num - remainder + i])
     pool = multiprocessing.Pool(process_num)
+    async_results = []
     for chunk in chunks:
-        pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+        result = pool.apply_async(func, args=(compare_func, chunk, output_path, kwargs), error_callback=err_call)
+        async_results.append(result)
     pool.close()
+    for ar in async_results:
+        try:
+            ar.get(timeout=3600)
+        except Exception as e:
+            logger.error(f"Task failed with exception: {e}")
+            pool.terminate()
+            raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
     pool.join()

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl

mindstudio-probe 8.2.0__py3-none-any.whl → 8.3.0__py3-none-any.whl