PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/core/compare/find_first/utils.py ADDED Viewed

@@ -0,0 +1,189 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import OrderedDict
+from dataclasses import dataclass
+import sys
+import time
+import psutil
+from msprobe.core.common.file_utils import check_file_or_directory_path, load_json
+from msprobe.core.common.const import Const
+@dataclass
+class RankPath:
+    rank: int
+    dump_path: str
+    def __init__(self, rank, dump_path):
+        self.rank = rank
+        check_file_or_directory_path(dump_path)
+        self.dump_path = dump_path
+class FileCache:
+    """
+    lazy load file
+    """
+    _instance = None
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super().__new__(cls, *args, **kwargs)
+        return cls._instance
+    def __init__(self):
+        self._max_memory_usage = psutil.virtual_memory().available / 4  # 最大占用当前可用内存空间的1/4
+        self._cache = OrderedDict()
+        self._access_cnt = {}
+        self._access_time = {}
+        self._size = {}
+    @staticmethod
+    def _sizeof(obj):
+        seen = set()
+        objs = [obj]
+        size = 0
+        while objs:
+            obj = objs.pop()
+            obj_id = id(obj)
+            if obj_id in seen:
+                continue
+            seen.add(obj_id)
+            size += sys.getsizeof(obj)
+            if isinstance(obj, dict):
+                objs.extend(obj.keys())
+                objs.extend(obj.values())
+            elif isinstance(obj, (list, tuple, set, frozenset)):
+                objs.extend(obj)
+        return size
+    def load_json(self, json_path):
+        if json_path in self._cache:
+            self._access_cnt[json_path] += 1
+            self._access_time[json_path] = time.monotonic()
+            self._cache.move_to_end(json_path)
+            return self._cache[json_path]
+        self._cleanup()
+        return self._load(json_path)
+    def _load(self, json_path):
+        data = load_json(json_path)
+        self._add_to_cache(json_path, data)
+        return data
+    def _add_to_cache(self, key, data):
+        if key in self._cache:
+            self._cache.move_to_end(key)
+        else:
+            self._cache[key] = data
+            self._access_cnt[key] = 0
+            self._access_time[key] = time.monotonic()
+            self._size[key] = self._sizeof(data)
+    def _calc_cache_size(self):
+        return sys.getsizeof(self._cache) + sum(self._size.values())
+    def _cleanup(self):
+        while self._calc_cache_size() > self._max_memory_usage and self._cache:
+            least_frequent_key = min(self._access_cnt.keys(), key=lambda k: self._access_cnt[k])
+            least_recent_key = min(self._access_time.keys(), key=lambda k: self._access_time[k])
+            largest_key = max(self._cache.keys(), key=lambda k: self._size[k])
+            key_to_rm = min([least_frequent_key, least_recent_key, largest_key],
+                            key=lambda k: (self._access_cnt[k], self._access_time[k], -self._size[k]))
+            del self._cache[key_to_rm]
+            del self._access_cnt[key_to_rm]
+            del self._access_time[key_to_rm]
+            del self._size[key_to_rm]
+def is_communication_op(op_name):
+    # 定义通信算子的关键字，覆盖各种通信操作，如all_reduce, send, broadcast等
+    # 从wrap文件中读取，先硬编码在文件中
+    return (op_name.startswith((Const.DISTRIBUTED, Const.MINT_DIST_API_TYPE_PREFIX, Const.MS_API_TYPE_COM)) and
+            any(keyword in op_name for keyword in DiffAnalyseConst.COMMUNICATION_KEYWORDS))
+def is_ignore_op(op_name):
+    ignore_keywords = [
+        'Torch.empty',
+        'Torch.fill',
+        'Tensor.__setitem__'
+    ]
+    return any(keyword in op_name for keyword in ignore_keywords)
+class DiffAnalyseConst:
+    COMMUNICATION_KEYWORDS = {
+        'send',  # send 算子
+        'recv',  # recv 算子
+        'broadcast',  # broadcast 算子
+        'all_reduce',  # all_reduce 算子
+        'reduce',  # reduce 算子
+        'all_gather',  # all_gather 算子
+        'gather',  # gather 算子
+        'isend',  # isend 算子
+        'irecv',  # irecv 算子
+        'scatter',  # scatter 算子
+        'reduce_scatter',  # reduce_scatter 算子
+        '_reduce_scatter_base',  # _reduce_scatter_base 算子
+        '_all_gather_base',  # _all_gather_base 算子
+        'all_to_all_single',  # all_to_all_single 算子
+        'all_to_all',  # all_to_all 算子
+        'all_gather_into_tensor',  # all_gather_into_tensor 算子
+        'reduce_scatter_tensor',  # reduce_scatter_tensor 算子
+        'send_object_list',  # send_object_list 算子
+        'recv_object_list'  # recv_object_list 算子
+    }
+    P2P_API_MAPPING = {'send': 'recv', 'recv': 'send', 'isend': 'irecv', 'irecv': 'isend',
+                       'send_object_list': 'recv_object_list', 'recv_object_list': 'send_object_list'}
+    SRC = 'src'
+    DST = 'dst'
+    SRC_GROUP = 'group_src'
+    DST_GROUP = 'group_dst'
+    LINK = 'link'
+    DIRECTED_API = {'send': DST, 'recv': SRC, 'isend': DST, 'irecv': SRC, 'broadcast': SRC, 'scatter': SRC,
+                    'gather': DST, 'send_object_list': DST, 'recv_object_list': SRC}
+    OPPOSITE_DIR = {SRC: DST, DST: SRC}
+    DUMP_FILE = "dump.json"
+    CONSTRUCT_FILE = "construct.json"
+    STACK_FILE = "stack.json"
+def analyze_diff_in_group(nodes_group):
+    diff_nodes = []
+    def get_compute_ops_from_comm_nodes(comm_nodes):
+        for comm_node in comm_nodes:
+            for op_node in comm_node.compute_ops:
+                op_node.layer = comm_node.layer
+                diff_nodes.append(op_node)
+    def get_comm_ops(comm_nodes):
+        for node in comm_nodes:
+            node.data.layer = node.layer
+            diff_nodes.append(node.data)
+    # 先看src或link中input是否有异常
+    src_list = list(filter(lambda node: node.type in [DiffAnalyseConst.SRC, DiffAnalyseConst.LINK], nodes_group))
+    input_diff_nodes = list(filter(lambda node: node.is_diff, src_list))
+    # 如果有异常回溯计算节点找到异常来源
+    # 使用cpu模拟节点进行计算，查看结果是否有问题。需要对所有计算节点录入/映射，暂不实现。
+    get_compute_ops_from_comm_nodes(nodes_group)
+    # 筛选入参没问题但出参有问题的通信节点
+    output_diff_nodes = list(filter(lambda node: node.data.is_diff, nodes_group))
+    get_comm_ops(output_diff_nodes)
+    return diff_nodes

msprobe/core/compare/highlight.py CHANGED Viewed

@@ -16,10 +16,8 @@
 import abc
 import math
 import multiprocessing
-import re
 from collections import namedtuple
-import numpy as np
 import openpyxl
 from openpyxl.styles import PatternFill
 from openpyxl.utils.dataframe import dataframe_to_rows
@@ -28,8 +26,8 @@ from tqdm import tqdm
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.core.common.file_utils import save_workbook
 from msprobe.core.common.log import logger
-from msprobe.core.common.utils import get_header_index, safe_get_value
-from msprobe.core.compare.utils import table_value_is_valid, get_name_and_state, CompareException
+from msprobe.core.common.utils import get_header_index, CompareException
+from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches
 from msprobe.core.compare.config import ModeConfig
@@ -54,10 +52,12 @@ class CheckOrderMagnitude(HighlightCheck):
         api_in, api_out, num = info
         max_diff_index = get_header_index(CompareConst.MAX_DIFF if dump_mode == Const.SUMMARY
                                           else CompareConst.MAX_ABS_ERR, dump_mode)
-        if abs(api_in[max_diff_index]) > abs(api_out[max_diff_index]):
+        max_diff_in = abs(api_in[max_diff_index])
+        max_diff_out = abs(api_out[max_diff_index])
+        if max_diff_in > max_diff_out or (max_diff_in <= 1 or max_diff_out <= 1):
             return
-        in_order = 0 if abs(api_in[max_diff_index]) < 1 else math.log10(abs(api_in[max_diff_index]))
-        out_order = 0 if abs(api_out[max_diff_index]) < 1 else math.log10(abs(api_out[max_diff_index]))
+        in_order = 0 if max_diff_in < 1 else math.log10(max_diff_in)
+        out_order = 0 if max_diff_out < 1 else math.log10(max_diff_out)
         if out_order - in_order >= CompareConst.ORDER_MAGNITUDE_DIFF_YELLOW:
             add_highlight_row_info(color_columns.yellow, num,
                                    "maximum absolute error of both input/parameters and output exceed 1, "
@@ -102,20 +102,28 @@ class CheckMaxRelativeDiff(HighlightCheck):
     """检查最大相对差异"""
     def apply(self, info, color_columns, dump_mode):
+        def get_number(data):
+            """统计量相对值如果为正常百分数据，str格式并以%结尾"""
+            if isinstance(data, str) and data.endswith("%"):
+                return float(data[:-1]) / 100
+            return data
         api_in, api_out, num = info
-        max_diff_index = get_header_index(CompareConst.MAX_DIFF, dump_mode)
-        bench_max_index = get_header_index(CompareConst.BENCH_MAX, dump_mode)
-        input_max_relative_diff = np.abs(
-            np.divide(api_in[max_diff_index], max(Const.FLOAT_EPSILON, api_in[bench_max_index])))
-        output_max_relative_diff = np.abs(
-            np.divide(api_out[max_diff_index], max(Const.FLOAT_EPSILON, api_out[bench_max_index])))
-        if not isinstance(input_max_relative_diff, (float, int)) or not isinstance(output_max_relative_diff,
-                                                                                   (float, int)):
+        max_rel_diff = get_header_index(CompareConst.MAX_RELATIVE_ERR, dump_mode)
+        input_max_relative_diff = api_in[max_rel_diff]  # 内部数据，长度总是和表头一致，不会越界
+        output_max_relative_diff = api_out[max_rel_diff]
+        input_max_relative_diff = get_number(input_max_relative_diff)
+        output_max_relative_diff = get_number(output_max_relative_diff)
+        if not isinstance(output_max_relative_diff, (float, int)):
             return
         if output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_RED:
             add_highlight_row_info(color_columns.red, num, "maximum relative error exceeds 0.5")
-        elif (output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_YELLOW and
-              input_max_relative_diff < CompareConst.MAX_RELATIVE_IN_YELLOW):
+        if not isinstance(input_max_relative_diff, (float, int)):
+            return
+        if (output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_YELLOW and
+                input_max_relative_diff < CompareConst.MAX_RELATIVE_IN_YELLOW):
             add_highlight_row_info(color_columns.yellow, num,
                                    "The output's maximum relative error exceeds 0.1, "
                                    "while the input/parameter's is below 0.01")
@@ -139,12 +147,25 @@ class CheckOverflow(HighlightCheck):
             add_highlight_row_info(color_columns.red, num, "maximum absolute error exceeds 1e+10")
+class CheckReqGradConsist(HighlightCheck):
+    """检查requires_grad是否一致"""
+    def apply(self, info, color_columns, dump_mode):
+        line, num = info
+        req_grad_consist_index = get_header_index(CompareConst.REQ_GRAD_CONSIST, dump_mode)
+        if not line[req_grad_consist_index]:
+            add_highlight_row_info(color_columns.yellow, num, "requires_grad is inconsistent")
 class HighlightRules:
     """高亮规则集合，用于检查API的误差"""
     # 适用于每行的规则
     basic_rules = {
         "check_overflow": CheckOverflow()
     }
+    consist_rules = {
+        "check_req_grad_consist": CheckReqGradConsist()
+    }
     # 用于比较输入和输出的规则
     # 真实数据检查规则
@@ -160,64 +181,10 @@ class HighlightRules:
     }
-class ApiBatch:
-    def __init__(self, api_name: str, start: int):
-        self.api_name = api_name
-        self.start = start
-        self.input_len = 1  # input的数量
-        self.params_end_index = start + 1  # params的结束index
-        self.output_end_index = start + 1  # output的结束index
-        self.params_grad_end_index = start + 1  # params_grad的结束index
-        # 内部state的标志("input", "output", "parameters", "parameters_grad"),
-        # 用于控制计算input_len, output_end_index, params_end_index, self.params_grad_end_index
-        self._state = Const.INPUT  # api_batch初始化为input
-    def set_state(self, state: str):
-        """设置当前状态"""
-        if state in {Const.INPUT, Const.OUTPUT, Const.KWARGS, Const.PARAMS, Const.PARAMS_GRAD}:
-            self._state = state
-        else:
-            raise ValueError(f"Invalid state: {state}")
-    def increment(self, state: str):
-        self.set_state(state)
-        if self._state == Const.INPUT or self._state == Const.KWARGS:
-            self.input_len += 1
-            self.params_end_index += 1
-            self.output_end_index += 1
-        if self._state == Const.PARAMS:
-            self.params_end_index += 1
-            self.output_end_index += 1
-        if self._state == Const.OUTPUT:
-            self.output_end_index += 1
-        self.params_grad_end_index += 1
 class HighLight:
-    def __init__(self, mode_config: ModeConfig):
+    def __init__(self, mode_config: ModeConfig, rank):
         self.mode_config = mode_config
-    @staticmethod
-    def api_batches_update(api_batches, api_name, state, index):
-        """
-        当一个api的所有item更新完后，input, output的索引范围：
-        input: [start: start+input_len]
-        output: [start+input_len: output_end_index]
-        params: [output_end_index: params_end_index]
-        """
-        if not api_batches:
-            api_batches.append(ApiBatch(api_name, index))
-        else:
-            api_batch = api_batches[-1]
-            if api_batch.api_name == api_name or (
-                    not re.search(Const.REGEX_FORWARD_BACKWARD, api_name) and api_name in api_batch.api_name):
-                try:
-                    api_batch.increment(state)
-                except ValueError as e:
-                    logger.error(f"api_batch: {api_batch} with invalid state, please check! {e}")
-                    raise CompareException(CompareException.INVALID_STATE_ERROR) from e
-            else:
-                api_batches.append(ApiBatch(api_name, index))
+        self.rank = rank
     @staticmethod
     def check_indices_numeric(api_items, indices: list):
@@ -232,7 +199,7 @@ class HighLight:
         if CompareConst.NPU_MD5 in result_df.columns:
             return
-        err_msg = result_df.get(CompareConst.ERROR_MESSAGE)
+        err_msg = result_df.get(CompareConst.ERROR_MESSAGE).copy()
         red_lines_num_set = highlight_dict.get('red_rows')
         for color in ['red', 'yellow']:
@@ -273,12 +240,11 @@ class HighLight:
     def find_compare_result_error_rows(self, result_df, highlight_dict):
         """将dataframe根据API分组，并找到有误差的算子用于高亮"""
         result = result_df.values
-        api_batches = []
-        for i, res_i in enumerate(result):
-            api_full_name = safe_get_value(res_i, 0, "res_i")
-            api_name, state = get_name_and_state(api_full_name)
-            self.api_batches_update(api_batches, api_name, state, i)
-        with tqdm(total=len(api_batches), desc="API/Module Analyse Progress", unit="item", ncols=100) as progress_bar:
+        header = result_df.columns.tolist()
+        api_batches = gen_api_batches(result, header)
+        default_bar_desc = 'API/Module Analyse Progress'
+        bar_desc_add_rank = f'[{self.rank}]' + default_bar_desc if self.rank else default_bar_desc
+        with tqdm(total=len(api_batches), desc=bar_desc_add_rank, unit="item", ncols=100) as progress_bar:
             for api_batch in api_batches:
                 self.find_error_rows(result[api_batch.start: api_batch.params_grad_end_index], api_batch,
                                      highlight_dict)
@@ -328,6 +294,13 @@ class HighLight:
                 api_info = ApiInfo(api_input=api_in, api_output=api_out, num_pointer=index)
                 self.apply_comparison_rules(api_info, color_columns)
+        # 对单行API的输入或输出进行requires_grad是否一致判断
+        for i, line in enumerate(result):
+            index = api_batch_start + i
+            line_info = LineInfo(line_data=line, num_pointer=index)
+            for rule in HighlightRules.consist_rules.values():
+                rule.apply(line_info, color_columns, self.mode_config.dump_mode)
         red_lines_num_set = {x[0] for x in red_lines}
         yellow_lines_num_set = {x[0] for x in yellow_lines}
         highlight_dict.get('red_rows', set()).update(red_lines_num_set)
@@ -349,28 +322,19 @@ class HighLight:
         self.update_highlight_err_msg(result_df, highlight_dict)  # add highlight err_msg
+        self.df_malicious_value_check(result_df)
         wb = openpyxl.Workbook()
         ws = wb.active
-        # write header
-        logger.info('Initializing Excel file.')
-        self.handle_multi_process_malicious_value_check(self.df_malicious_value_check, result_df)
         result_df_convert = result_df.applymap(self.compare_result_df_convert)
         for row in dataframe_to_rows(result_df_convert, index=False, header=True):
             ws.append(row)
         # 对可疑数据标色
         logger.info('Coloring Excel in progress.')
+        red_fill = PatternFill(start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid")
+        yellow_fill = PatternFill(start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid")
         col_len = len(result_df.columns)
-        red_fill = PatternFill(
-            start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid"
-        )
-        yellow_fill = PatternFill(
-            start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid",
-        )
         for i in highlight_dict.get("red_rows", []):
             for j in range(1, col_len + 1):
                 ws.cell(row=i + 2, column=j).fill = red_fill  # 2因为ws.cell中的row或column需要>=1,数据从第2行开始
@@ -378,7 +342,6 @@ class HighLight:
             for j in range(1, col_len + 1):
                 ws.cell(row=i + 2, column=j).fill = yellow_fill
-        logger.info('Saving Excel file to disk: %s' % file_path)
         save_workbook(wb, file_path)
     def handle_multi_process_malicious_value_check(self, func, result_df):
@@ -396,22 +359,32 @@ class HighLight:
         def err_call(args):
             logger.error("Multiprocessing malicious value check failed! Reason: {}".format(args))
-            try:
-                pool.close()
-            except OSError:
-                logger.error("Pool terminate failed")
         result_df_columns = result_df.columns.tolist()
         for column in result_df_columns:
             self.value_check(column)
+        async_results = []
         for df_chunk in chunks:
-            pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            result = pool.apply_async(func, args=(df_chunk, result_df_columns,), error_callback=err_call)
+            async_results.append(result)
         pool.close()
+        for ar in async_results:
+            try:
+                ar.get(timeout=3600)
+            except Exception as e:
+                logger.error(f"Task failed with exception: {e}")
+                pool.terminate()
+                raise CompareException(CompareException.MULTIPROCESS_ERROR) from e
         pool.join()
-    def df_malicious_value_check(self, df_chunk, result_df_columns):
-        for row in df_chunk.itertuples(index=False):
+    def df_malicious_value_check(self, result_df):
+        result_df_columns = result_df.columns.tolist()
+        for column in result_df_columns:
+            self.value_check(column)
+        for row in result_df.itertuples(index=False):
             api_name = row[0]
             for i, value in enumerate(row):
                 self.value_check(value, api_name, i, result_df_columns)

msprobe/core/compare/layer_mapping/layer_mapping.py CHANGED Viewed

@@ -18,12 +18,12 @@ from collections import defaultdict
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.core.common.file_utils import load_json, load_yaml, save_yaml
-from msprobe.core.common.utils import (add_time_with_yaml,
-                                       detect_framework_by_dump_json,
-                                       get_stack_construct_by_dump_json_path)
+from msprobe.core.common.utils import add_time_with_yaml, detect_framework_by_dump_json, \
+    get_stack_construct_by_dump_json_path, CompareException
 from msprobe.core.compare.layer_mapping.data_scope_parser import get_dump_data_items
 from msprobe.core.compare.utils import read_op, reorder_op_name_list
 from msprobe.core.common.decorator import recursion_depth_decorator
+from msprobe.core.common.log import logger
 class LayerTrie:
@@ -63,7 +63,11 @@ class LayerTrie:
             node = node.children[name]
         if index >= len(node.data_items[state]):
             return default_value
-        return node.data_items[state][index]
+        if node.data_items[state]:
+            return node.data_items[state][index]
+        else:
+            logger.error(f"node.data_items of state:{state} is empty, please check.")
+            raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR)
     def save_to_yaml(self, output_path):
         result = {f"{self.type_name} @ {self}": self.convert_to_dict(self)}
@@ -208,7 +212,8 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa
     def read_full_op_names(data, op_name):
         op_parsed_list = read_op(data.get(op_name, {}), op_name)
         full_op_names = [op_parsed.get('full_op_name') for op_parsed in op_parsed_list]
-        return full_op_names
+        states = [op_parsed.get(Const.STATE) for op_parsed in op_parsed_list]
+        return full_op_names, states
     def generate_op_data_mapping(npu_op_name, npu_full_op_names, bench_op_name, bench_full_op_names):
         suffix_to_full_op_name = {}
@@ -228,10 +233,10 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa
     for npu_op_name, bench_op_name in api_mapping.items():
         if not npu_op_name:
             continue
-        npu_full_op_names = read_full_op_names(npu_data, npu_op_name)
-        bench_full_op_names = read_full_op_names(bench_data, bench_op_name)
-        npu_full_op_names_reorder = reorder_op_name_list(npu_full_op_names)
-        bench_full_op_names_reorder = reorder_op_name_list(bench_full_op_names)
+        npu_full_op_names, npu_states = read_full_op_names(npu_data, npu_op_name)
+        bench_full_op_names, bench_states = read_full_op_names(bench_data, bench_op_name)
+        npu_full_op_names_reorder, _ = reorder_op_name_list(npu_full_op_names, npu_states)
+        bench_full_op_names_reorder, _ = reorder_op_name_list(bench_full_op_names, bench_states)
         mapping = generate_op_data_mapping(npu_op_name, npu_full_op_names_reorder,
                                            bench_op_name, bench_full_op_names_reorder)
         data_mapping.update(mapping)

msprobe/core/compare/merge_result/merge_result.py CHANGED Viewed

@@ -109,8 +109,8 @@ def check_index_dump_mode_consistent(dump_mode, rank_num):
         return []
     dump_mode_compare_index_map = {
-        Const.ALL: CompareConst.ALL_COMPARE_INDEX,
-        Const.SUMMARY: CompareConst.SUMMARY_COMPARE_INDEX
+        Const.ALL: CompareConst.ALL_COMPARE_INDEX + [CompareConst.REQ_GRAD_CONSIST],
+        Const.SUMMARY: CompareConst.SUMMARY_COMPARE_INDEX + [CompareConst.REQ_GRAD_CONSIST]
     }
     valid_compare_index = dump_mode_compare_index_map.get(dump_mode)

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl