PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe: 8.1.2 (py3-none-any.whl) → 8.2.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181) hide show

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/core/compare/diff_analyze/first_diff_analyze.py ADDED Viewed

@@ -0,0 +1,135 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from tqdm import tqdm
+from msprobe.core.common.const import Const, CompareConst
+from msprobe.core.common.utils import logger, CompareException
+from msprobe.core.common.file_utils import load_yaml
+from msprobe.core.compare.config import ModeConfig
+from msprobe.core.compare.utils import gen_api_batches
+cur_dir = os.path.dirname(os.path.realpath(__file__))
+diff_threshold_yaml_path = os.path.join(cur_dir, 'diff_analyze_threshold.yaml')
+ignore_op_list_yaml_path = os.path.join(cur_dir, 'ignore_op_list.yaml')
+ignore_list = load_yaml(ignore_op_list_yaml_path)
+thresholds = load_yaml(diff_threshold_yaml_path)
+cmp_metrics = thresholds.get('compare_metrics')
+class FirstDiffAnalyze:
+    def __init__(self, mode_config: ModeConfig, rank):
+        self.mode_config = mode_config
+        self.rank = rank
+    @staticmethod
+    def single_metric_diff_check(cmp_metric, metric_value):
+        threshold = thresholds.get(cmp_metric, None)
+        if threshold is None:
+            logger.error(f"Check diff or {cmp_metric} need to configure the threshold. "
+                         f"Please configure it in 'diff_analyze_threshold.yaml'.")
+            raise CompareException(CompareException.MISSING_THRESHOLD_ERROR)
+        if not isinstance(threshold, list) or len(threshold) != 1:
+            logger.error(f"{cmp_metric} threshold configure wrong. Please check.")
+            raise CompareException(CompareException.WRONG_THRESHOLD_ERROR)
+        if isinstance(metric_value, str) and metric_value.endswith('%'):
+            metric_value_float = float(metric_value[:-1]) / 100
+            if metric_value_float > threshold[0]:
+                return True
+        return False
+    def single_api_check(self, result_slice, header, api_name=None):
+        """
+        单个api差异检查
+        :param result_slice: 数据切片
+        :param header: 列名列表
+        :return: {'is_same': bool, 'op_items': list[dict]}
+        """
+        single_check_result = {
+            'is_same': True,
+            'op_items': []
+        }
+        column_indices = {name: idx for idx, name in enumerate(header)}
+        output_idx = -1
+        for line in result_slice:
+            op_item = {
+                column_name: line[column_indices[column_name]]
+                for column_name in header
+            }
+            single_check_result['op_items'].append(op_item)
+            if op_item['state'] != 'output':
+                continue
+            output_idx += 1
+            if output_idx in ignore_list.get(api_name, []):
+                continue
+            # set is_same
+            if self.mode_config.dump_mode == Const.MD5:
+                if line[column_indices[CompareConst.RESULT]] == CompareConst.DIFF:
+                    single_check_result['is_same'] = False
+            else:
+                for cmp_metric in cmp_metrics:
+                    metric_value = line[column_indices[cmp_metric]]
+                    if self.single_metric_diff_check(cmp_metric, metric_value):
+                        single_check_result['is_same'] = False
+                        break
+        return single_check_result
+    def check(self, result_df):
+        """
+        比对后循环遍历api检查差异
+        example：
+        {
+            'Functional.conv2d.0.forward': {
+                'is_same': true,
+                'op_items': [
+                    {
+                        'NPU name': 'Functional.conv2d.0.forward.input.0',
+                        'Bench name': 'Functional.conv2d.0.forward.input.0',
+                        'xxx': 1,
+                        'NormRelativeErr': 2,
+                        'yyy': 3,
+                        ...
+                    }
+                ]
+            }
+        }
+        """
+        result = result_df.values
+        header = result_df.columns.tolist()
+        api_batches = gen_api_batches(result, header)
+        check_result = {}
+        default_bar_desc = 'API/Module diff check Progress'
+        bar_desc_add_rank = f'[{self.rank}]' + default_bar_desc if self.rank else default_bar_desc
+        with tqdm(total=len(api_batches), desc=bar_desc_add_rank, unit="api/module", ncols=100) as progress_bar:
+            for api_batch in api_batches:
+                result_slice = result[api_batch.start: api_batch.params_grad_end_index]
+                api_compo = api_batch.api_name.split('.')
+                # suppose name is Tensor.MatMul.0.forward
+                if len(api_compo) < 4:
+                    continue
+                # get MatMul as api_name
+                api_name = api_compo[-3]
+                check_result[api_batch.api_name] = self.single_api_check(result_slice, header, api_name)
+                progress_bar.update(1)
+        return check_result

msprobe/core/compare/diff_analyze/ignore_op_list.yaml ADDED Viewed

@@ -0,0 +1,3 @@
+npu_fusion_attention:
+  - 4
+  - 5

msprobe/core/compare/find_first/__init__.py ADDED Viewed

File without changes

msprobe/core/compare/find_first/analyzer.py ADDED Viewed

@@ -0,0 +1,282 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+from collections import defaultdict
+import os
+from itertools import dropwhile, chain
+from msprobe.core.common import const
+from msprobe.core.common.file_utils import check_file_or_directory_path, save_json, make_dir
+from msprobe.core.common.log import logger
+from msprobe.core.common.const import Const
+from msprobe.core.compare.find_first.data_processor import DataProcessor
+from msprobe.core.compare.find_first.utils import (RankPath, FileCache, is_communication_op, is_ignore_op,
+                                                   DiffAnalyseConst, analyze_diff_in_group)
+from msprobe.core.compare.find_first.graph import DataNode, CommunicationNode
+class DiffAnalyzer:
+    def __init__(self, npu_path, bench_path, output_path, data_frame=Const.PT_FRAMEWORK):
+        self._bench_path = bench_path
+        self._npu_path = npu_path
+        self._output_path = output_path
+        self.pre_processor = DataProcessor(data_frame)
+        self._paths = {}
+        self._diff_nodes = []  # 记录所有异常节点
+        self._cache = FileCache()
+        self._first_comm_nodes = {}  # 记录各rank下首个通信节点的node_id
+        self._after_comm_diffs = {}  # 记录各rank下发生在通信节点之后的异常计算节点
+        self._rank_comm_nodes_dict = {}  # 记录各rank的通信节点
+    def analyze(self):
+        self._pre_process()
+        for analyze_func in [self._pre_analyze, self._analyze, self._post_analyze]:
+            analyze_func()
+            if self._diff_nodes:
+                self._gen_analyze_info()
+                return
+        logger.info('Cannot find any diff node, no need to generate analyze file.')
+    def _pre_process(self):
+        self.pre_processor.process(self._npu_path, self._bench_path, self._output_path)
+        self._resolve_input_path(self._output_path)
+        logger.info("Pre Process completed.")
+    """
+    这里需要生成stack，但是直接用dict中自带就行，在op_items.NPU_Stack_Info中
+    """
+    def _resolve_input_path(self, result_input_path):
+        contents = os.listdir(result_input_path)
+        rank_paths = {}
+        for path in contents:
+            # 检查文件名是否符合compare_result_rank{rank_id}_{timestamp}.json格式
+            if not path.startswith('compare_result_rank'):
+                continue
+            if not path.endswith('.json'):
+                continue
+            # 从文件名中提取rank_id
+            try:
+                path_ele_list = path.split('_')
+                if len(path_ele_list) <= 2:
+                    continue
+                rank_part = path_ele_list[2]
+                if not rank_part.startswith('rank'):
+                    continue
+                rank_str = rank_part.strip('rank')  # 去掉'rank'前缀
+                rank = int(rank_str) if rank_str else 0
+            except (IndexError, ValueError):
+                continue
+            # 构建完整的json文件路径
+            dump_path = os.path.join(result_input_path, path)
+            rank_paths[rank] = RankPath(rank, dump_path)
+        # 按照rank id排序后添加到self._paths中
+        for rank in sorted(rank_paths.keys()):
+            self._paths[rank] = rank_paths[rank]
+    def _pre_analyze(self):
+        logger.info('Start searching diff node before communication.')
+        for path in self._paths.values():
+            dump_data = self._cache.load_json(path.dump_path)
+            if not dump_data:
+                logger.warning(f'Rank {path.rank} has no dump data!')
+                continue
+            for op_name, op_data in dump_data.items():
+                if is_ignore_op(op_name):
+                    continue
+                if is_communication_op(op_name):
+                    self._first_comm_nodes[path.rank] = op_name
+                    break
+                data_node = DataNode(op_name, path.rank, op_data)
+                if data_node.is_diff:
+                    self._diff_nodes.append(data_node)
+                    break
+    def _analyze(self):
+        logger.info('Start searching diff node during communication.')
+        self._rank_comm_nodes_dict = {rank: self._analyze_comm_nodes(rank) for rank in self._paths}
+        self._connect_comm_nodes()
+        self._pruning()
+        self._search_first_diff()
+    def _post_analyze(self):
+        logger.info('Start searching diff node after communication.')
+        for nodes in self._after_comm_diffs.values():
+            if nodes:
+                self._diff_nodes.append(nodes[0])
+    def _connect_comm_nodes(self):
+        searched_ranks = set()
+        for rank, nodes in list(self._rank_comm_nodes_dict.items())[:-1]:
+            searched_ranks.add(rank)
+            seen_nodes = set()
+            last_node = None
+            for cur_node in nodes.values():
+                is_overflow = last_node and hasattr(last_node, 'layer') and hasattr(cur_node, 'layer') and \
+                last_node.layer >= cur_node.layer
+                if is_overflow:
+                    cur_node.layer = last_node.layer + 1
+                conn_info = cur_node.find_connected_nodes()
+                if not conn_info.get('ranks'):
+                    conn_info['ranks'] = self._rank_comm_nodes_dict.keys()
+                last_node = cur_node
+                if not self._find_connection(conn_info, cur_node, searched_ranks, seen_nodes):
+                    logger.debug(f'Cannot find connected communication node for "{cur_node.node_id}".')
+    def _find_connection(self, conn_info, cur_node, searched_ranks, seen_nodes):
+        def connect(search_node):
+            seen_nodes.add(search_node.node_id)
+            if search_node.type == DiffAnalyseConst.DST:
+                cur_node.add_dst(search_node)
+            elif search_node.type == DiffAnalyseConst.SRC:
+                search_node.layer = cur_node.layer
+                search_node.add_dst(cur_node)
+            else:
+                cur_node.add_link(search_node)
+        found = cur_node.connected
+        for connected_rank in conn_info['ranks']:
+            if connected_rank in searched_ranks:
+                continue
+            tar_id_prefix = f'{connected_rank}.{conn_info["api"]}'
+            for search_id, search_node in self._rank_comm_nodes_dict[connected_rank].items():
+                if search_id in seen_nodes:
+                    continue
+                if not (search_id.startswith(tar_id_prefix) and search_node.type == conn_info.get('type')):
+                    continue
+                search_conn_ranks = search_node.find_connected_nodes().get('ranks')
+                if ((not search_conn_ranks and search_node.api not in DiffAnalyseConst.DIRECTED_API) or
+                    cur_node.rank in search_conn_ranks):  # 有些无向通信算子没有填ProcessGroup，默认连接所有rank
+                    connect(search_node)
+                    found = True
+                    break
+        return found
+    def _analyze_comm_nodes(self, rank):
+        path = self._paths[rank]
+        data = self._cache.load_json(path.dump_path)
+        communication_nodes = {}
+        if rank not in self._first_comm_nodes:  # 此rank没有通信节点
+            return communication_nodes
+        last_node_id = None  # 记录上一个通信节点的node_id
+        compute_ops = []  # 记录两个通信节点之间的计算节点
+        sub_layer = 0  # 记录两个通信算子之间异常计算节点的调用序数
+        for op_name in dropwhile(lambda k: k != self._first_comm_nodes[rank], data):
+            node_id = f'{rank}.{op_name}'
+            op_data = data[op_name]
+            if is_communication_op(op_name):
+                comm_node = CommunicationNode(node_id, rank, DataNode(op_name, rank, op_data, sub_layer=sub_layer),
+                                              compute_ops=compute_ops)
+                if last_node_id:
+                    communication_nodes.get(last_node_id).add_next(comm_node)
+                communication_nodes[node_id] = comm_node
+                last_node_id = node_id
+                compute_ops = []
+                sub_layer = 0
+            elif not is_ignore_op(op_name):
+                data_node = DataNode(op_name, rank, op_data, sub_layer=sub_layer)
+                if data_node.is_diff:
+                    compute_ops.append(data_node)
+                sub_layer += 1
+        if compute_ops:
+            self._after_comm_diffs[rank] = compute_ops
+        return communication_nodes
+    def _pruning(self):
+        deleted_node_id = []
+        for nodes in self._rank_comm_nodes_dict.values():
+            for node_id in list(nodes.keys()):
+                node = nodes[node_id]
+                if node.is_diff or node.compute_ops:
+                    continue
+                deleted_node_id.append(node_id)
+                node.delete()
+                del nodes[node_id]
+        logger.debug(f'After pruning, following nodes are removed: [{", ".join(deleted_node_id)}]')
+    def _search_first_diff(self):
+        nodes_queues = []
+        for comm_nodes in self._rank_comm_nodes_dict.values():
+            nodes_queues.append(sorted(list(comm_nodes.values()), key=lambda x: x.layer))
+        seen_nodes = set()
+        def get_next_node(node_list):
+            while node_list:
+                next_node = node_list.pop(0)
+                if next_node.node_id not in seen_nodes:
+                    return next_node
+            return None
+        def find_all_members(ori_node):
+            ids = get_relative_ids(ori_node)
+            id_queue = list(chain(*[get_relative_ids(self._get_node_by_id(n_id)).difference(ids) for n_id in ids]))
+            while id_queue:
+                new_id = id_queue.pop(0)
+                ids.add(new_id)
+                id_queue.extend(get_relative_ids(self._get_node_by_id(new_id)).difference(ids))
+            return ids
+        def get_relative_ids(ori_node):
+            if not ori_node:
+                return set()
+            return ({ori_node.node_id} | ori_node.link_nodes.keys() | ori_node.src_nodes.keys() |
+                    ori_node.dst_nodes.keys())
+        while any(nodes_queues):
+            groups = []
+            all_ids_in_groups = set()
+            for nodes in nodes_queues:
+                node = get_next_node(nodes)
+                if not node:
+                    continue
+                if not groups or node.node_id not in all_ids_in_groups:
+                    new_group = find_all_members(node)
+                    groups.append(new_group)
+                    all_ids_in_groups.update(new_group)
+            for group in groups:
+                seen_nodes.update(group)
+                self._diff_nodes.extend(analyze_diff_in_group([self._get_node_by_id(n_id) for n_id in group]))
+            if self._diff_nodes:
+                # 找出所有layer和sub_layer最小的节点
+                min_layer_sublayer = min((x.layer, x.sub_layer) for x in self._diff_nodes)
+                self._diff_nodes = [
+                                        node
+                                        for node in self._diff_nodes
+                                        if (node.layer, node.sub_layer) == min_layer_sublayer
+                                   ]
+                return
+    def _get_node_by_id(self, node_id):
+        splits = node_id.split(Const.SEP, 1)
+        if len(splits) < 2 or not splits[0].isdigit():
+            logger.error(f'invalid node_id {node_id}')
+            raise RuntimeError(f'invalid node_id {node_id}')
+        rank = int(splits[0])
+        return self._rank_comm_nodes_dict.get(rank, {}).get(node_id)
+    def _gen_analyze_info(self):
+        if not os.path.exists(self._output_path):
+            make_dir(self._output_path)
+        file_name = f'diff_analyze_{time.time_ns()}.json'
+        result_file = os.path.join(self._output_path, file_name)
+        result_content = defaultdict(list)
+        for node in self._diff_nodes:
+            result_content[f'rank_{node.rank}'].append(node.gen_node_info(self._paths[node.rank]))
+        save_json(result_file, result_content, 2)
+        logger.info(f"The analyze result is saved in: {result_file}")

msprobe/core/compare/find_first/data_processor.py ADDED Viewed

@@ -0,0 +1,35 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from msprobe.core.common.const import Const
+from msprobe.core.common.log import logger
+class DataProcessor:
+    def __init__(self, data_frame):
+        self.data_frame = data_frame
+        if self.data_frame == Const.PT_FRAMEWORK:
+            from msprobe.pytorch.compare.distributed_compare import compare_distributed
+            self.process_func = compare_distributed
+        elif self.data_frame == Const.MS_FRAMEWORK:
+            from msprobe.mindspore.compare.distributed_compare import ms_compare_distributed
+            self.process_func = ms_compare_distributed
+        else:
+            raise ValueError(f"Unsupported data_frame: {self.data_frame}")
+    def process(self, npu_path, bench_path, output_path):
+        logger.info("Start comparing data ......")
+        return self.process_func(npu_path, bench_path, output_path, first_diff_analyze=True)

msprobe/core/compare/find_first/graph.py ADDED Viewed

@@ -0,0 +1,188 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from msprobe.core.common.const import Const
+from msprobe.core.common.log import logger
+from msprobe.core.common.const import CompareConst
+from msprobe.core.compare.find_first.utils import RankPath, DiffAnalyseConst
+@dataclass
+class DataNode:
+    op_name: str
+    rank: int
+    inputs: dict
+    outputs: dict
+    op_data: list
+    layer: int = 0  # 和communication_node的layer保持一致
+    sub_layer: int = 0  # 调用顺序，越小表示越先调用
+    def __init__(self, op_name, rank, op_data, **kwargs):
+        self.op_name = op_name
+        self.rank = rank
+        self.stack = None
+        self.inputs = {}
+        self.outputs = {}
+        self.is_diff = False
+        self.parse_data(op_data)
+        self.sub_layer = kwargs.get('sub_layer', 0)
+    def find_stack(self):
+        for item in self.stack:
+            if len(item) >= 2 and self.op_name in item[0]:
+                return item[1]
+        return {}
+    def parse_data(self, op_data):
+        self.is_diff = not op_data.get("is_same", True)
+        self.op_data = op_data.get("op_items") # 这里拿到的是比对column，是一个list，有若干行
+        metrics = {}
+        for cmp_data in self.op_data:
+            name = cmp_data.get(CompareConst.NPU_NAME)
+            # 构建度量指标字典
+            metrics = {}
+            if CompareConst.NPU_MAX in cmp_data:
+                metrics = {CompareConst.NPU_MAX: cmp_data.get(CompareConst.NPU_MAX),
+                        CompareConst.NPU_MIN: cmp_data.get(CompareConst.NPU_MIN),
+                        CompareConst.NPU_MEAN: cmp_data.get(CompareConst.NPU_MEAN),
+                        CompareConst.NPU_NORM: cmp_data.get(CompareConst.NPU_NORM)}
+            elif CompareConst.NPU_MD5 in cmp_data:
+                metrics[CompareConst.NPU_MD5] = cmp_data.get(CompareConst.NPU_MD5)
+            if CompareConst.NPU_P2POP_PEER in cmp_data:
+                metrics[CompareConst.NPU_P2POP_PEER] = cmp_data.get(CompareConst.NPU_P2POP_PEER)
+            if cmp_data.get(CompareConst.STACK) != CompareConst.N_A and not self.stack:
+                self.stack = cmp_data.get(CompareConst.STACK)
+            if cmp_data.get('state') == "input":
+                self.inputs[name] = metrics
+            elif cmp_data.get('state') == "output":
+                self.outputs[name] = metrics
+    def gen_node_info(self, path: RankPath):
+        data_info_list = {Const.INPUT: self.inputs, Const.OUTPUT: self.outputs}
+        return {'op_name': self.op_name,
+                'data_info': data_info_list,
+                'stack_info': self.stack}
+class CommunicationNode:
+    def __init__(self, node_id, rank, data: DataNode, layer=0, **kwargs):
+        self.node_id = node_id
+        self.rank = rank
+        self.data = data
+        self.is_diff = data.is_diff
+        self.layer = layer
+        op_name_split = self.data.op_name.split(Const.SEP)
+        if len(op_name_split) < 4:
+            logger.error(f'invalid op_name: {self.data.op_name}')
+            raise RuntimeError(f'invalid op_name: {self.data.op_name}')
+        self.api = op_name_split[1]
+        self.call_cnt = op_name_split[2]
+        self.pre_node = kwargs.get('pre_node')
+        self.link_nodes = kwargs.get('link_nodes', {})
+        self.dst_nodes = kwargs.get('dst_nodes', {})
+        self.src_nodes = kwargs.get('src_nodes', {})
+        self.next_nodes = kwargs.get('next_nodes', {})
+        self.compute_ops = kwargs.get('compute_ops', [])
+        self.type = self._resolve_type()
+        self.connected = False
+    def add_next(self, node):
+        self.next_nodes[node.node_id] = node
+        node.pre_node = self
+        node.layer = self.layer + 1
+        node.data.layer = node.layer
+    def add_link(self, node):
+        self.link_nodes[node.node_id] = node
+        node.link_nodes[self.node_id] = self
+        node.layer = self.layer
+        node.data.layer = node.layer
+        self.connected = True
+        node.connected = True
+    def add_dst(self, node):
+        self.dst_nodes[node.node_id] = node
+        node.src_nodes[self.node_id] = self
+        node.layer = self.layer
+        node.data.layer = node.layer
+        self.connected = True
+        node.connected = True
+    def delete(self):
+        for node in self.next_nodes.values():
+            node.pre_node = None
+        for node in self.dst_nodes.values():
+            if node.src_nodes:
+                node.src_nodes.pop(self.node_id)
+        for node in self.src_nodes.values():
+            if node.dst_nodes:
+                node.dst_nodes.pop(self.node_id)
+        for node in self.link_nodes.values():
+            if node.link_nodes:
+                node.link_nodes.pop(self.node_id)
+        if self.pre_node:
+            if self.pre_node.next_nodes:
+                self.pre_node.next_nodes.pop(self.node_id)
+    def find_connected_nodes(self):
+        """
+        根据 api/类型/入参/调用次数 确定相连接的node的op_name
+        """
+        tar_api = DiffAnalyseConst.P2P_API_MAPPING.get(self.api, self.api)
+        ranks = set()
+        # 遍历DST和SRC相关的input，获取对应的rank值
+        # 遍历inputs获取所有rank值
+        for input_name, v in self.data.inputs.items():
+            # 检查key是否包含DST/SRC相关标识
+            target_types = [DiffAnalyseConst.DST, DiffAnalyseConst.DST_GROUP,
+                          DiffAnalyseConst.SRC, DiffAnalyseConst.SRC_GROUP]
+            if any(keyword in input_name for keyword in target_types):
+                # 优先使用MD5值，如果没有则使用NPU_MAX值
+                rank_val = 0
+                if CompareConst.NPU_MD5 in v:
+                    rank_val = int(v.get(CompareConst.NPU_MD5, 0))
+                else:
+                    rank_val = int(v.get(CompareConst.NPU_MAX, 0))
+                if rank_val:
+                    ranks.add(rank_val)
+            elif input_name.endswith('.group'):
+                # 优先使用MD5值，如果没有则使用NPU_MAX值
+                val = v.get(CompareConst.NPU_MD5) if CompareConst.NPU_MD5 in v else v.get(CompareConst.NPU_MAX)
+                if val and val.startswith('[') and val.endswith(']'):
+                    val = [int(part) for part in val.strip('[]').split(',')]
+                    ranks.update(val)
+            elif v.get(CompareConst.NPU_P2POP_PEER) != "None":
+                ranks.add(v.get(CompareConst.NPU_P2POP_PEER))
+        return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
+                'type': DiffAnalyseConst.OPPOSITE_DIR.get(self.type, DiffAnalyseConst.LINK)}
+    def _resolve_type(self):
+        # 遍历SRC和DST相关的输入，根据rank值判断节点类型
+        for prefix, node_type in [(DiffAnalyseConst.SRC, DiffAnalyseConst.SRC),
+                                (DiffAnalyseConst.DST, DiffAnalyseConst.DST)]:
+            for k, v in self.data.inputs.items():
+                if prefix in k or f"{prefix}_GROUP" in k:
+                    # 优先使用MD5值，如果没有则使用NPU_MAX值
+                    compare_val = v.get(CompareConst.NPU_MD5) if CompareConst.NPU_MD5 in v \
+                                  else v.get(CompareConst.NPU_MAX)
+                    return node_type if compare_val == self.rank \
+                           else DiffAnalyseConst.OPPOSITE_DIR[node_type]
+        return DiffAnalyseConst.LINK

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2py3-none-any.whl → 8.2.1py3-none-any.whl