PyPI - mindstudio-probe - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177)

{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/METADATA +3 -3
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/RECORD +168 -150
msprobe/README.md +27 -22
msprobe/core/common/const.py +129 -60
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +25 -2
msprobe/core/common/inplace_ops.yaml +1 -0
msprobe/core/common/utils.py +43 -33
msprobe/core/compare/acc_compare.py +43 -74
msprobe/core/compare/check.py +2 -6
msprobe/core/compare/highlight.py +2 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -1
msprobe/core/compare/merge_result/merge_result.py +16 -9
msprobe/core/compare/merge_result/utils.py +81 -0
msprobe/core/compare/multiprocessing_compute.py +19 -12
msprobe/core/compare/npy_compare.py +30 -12
msprobe/core/compare/utils.py +30 -10
msprobe/core/data_dump/api_registry.py +176 -0
msprobe/core/data_dump/data_collector.py +58 -13
msprobe/core/data_dump/data_processor/base.py +94 -10
msprobe/core/data_dump/data_processor/factory.py +3 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +33 -33
msprobe/core/data_dump/data_processor/pytorch_processor.py +99 -18
msprobe/core/data_dump/json_writer.py +61 -40
msprobe/core/grad_probe/constant.py +1 -0
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/docs/01.installation.md +27 -1
msprobe/docs/02.config_introduction.md +27 -23
msprobe/docs/03.config_examples.md +24 -0
msprobe/docs/05.data_dump_PyTorch.md +103 -16
msprobe/docs/06.data_dump_MindSpore.md +76 -32
msprobe/docs/07.accuracy_checker_PyTorch.md +11 -1
msprobe/docs/08.accuracy_checker_online_PyTorch.md +3 -1
msprobe/docs/09.accuracy_checker_MindSpore.md +5 -3
msprobe/docs/10.accuracy_compare_PyTorch.md +59 -33
msprobe/docs/11.accuracy_compare_MindSpore.md +40 -16
msprobe/docs/12.overflow_check_PyTorch.md +3 -1
msprobe/docs/13.overflow_check_MindSpore.md +4 -2
msprobe/docs/14.data_parse_PyTorch.md +1 -7
msprobe/docs/18.online_dispatch.md +1 -1
msprobe/docs/19.monitor.md +332 -273
msprobe/docs/21.visualization_PyTorch.md +42 -13
msprobe/docs/22.visualization_MindSpore.md +43 -13
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/27.dump_json_instruction.md +301 -27
msprobe/docs/28.debugger_save_instruction.md +94 -0
msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/step_count_per_record.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +4 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +32 -7
msprobe/mindspore/api_accuracy_checker/api_runner.py +70 -22
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +602 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -1
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +2 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +130 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
msprobe/mindspore/common/const.py +61 -0
msprobe/mindspore/common/utils.py +48 -18
msprobe/mindspore/compare/ms_compare.py +27 -19
msprobe/mindspore/compare/ms_graph_compare.py +6 -5
msprobe/mindspore/debugger/debugger_config.py +31 -6
msprobe/mindspore/debugger/precision_debugger.py +45 -14
msprobe/mindspore/dump/dump_tool_factory.py +5 -3
msprobe/mindspore/dump/hook_cell/api_register.py +142 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +9 -10
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +24 -26
msprobe/mindspore/dump/jit_dump.py +21 -15
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +22 -56
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -1
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +10 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +2 -0
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/monitor/anomaly_detect.py +404 -0
msprobe/mindspore/monitor/distributed/__init__.py +0 -0
msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
msprobe/mindspore/monitor/features.py +63 -0
msprobe/mindspore/monitor/module_hook.py +873 -0
msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
msprobe/mindspore/monitor/utils.py +309 -0
msprobe/mindspore/ms_config.py +8 -2
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/service.py +114 -34
msprobe/pytorch/__init__.py +0 -1
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +12 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +4 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +5 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +25 -6
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -19
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/bench_functions/apply_adam.py +215 -0
msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
msprobe/pytorch/{parse.py → bench_functions/mish.py} +6 -4
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +50 -0
msprobe/pytorch/bench_functions/sort_v2.py +21 -0
msprobe/pytorch/common/utils.py +97 -4
msprobe/pytorch/debugger/debugger_config.py +19 -9
msprobe/pytorch/debugger/precision_debugger.py +24 -1
msprobe/pytorch/dump/module_dump/module_dump.py +4 -3
msprobe/pytorch/dump/module_dump/module_processer.py +21 -35
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +8 -2
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/hook_module/api_register.py +131 -0
msprobe/pytorch/hook_module/hook_module.py +19 -14
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +173 -75
msprobe/pytorch/monitor/anomaly_detect.py +14 -29
msprobe/pytorch/monitor/csv2tb.py +18 -14
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +238 -193
msprobe/pytorch/monitor/module_metric.py +9 -6
msprobe/pytorch/monitor/optimizer_collect.py +100 -67
msprobe/pytorch/monitor/unittest/test_monitor.py +1 -1
msprobe/pytorch/monitor/utils.py +76 -44
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +9 -0
msprobe/pytorch/online_dispatch/dump_compare.py +3 -0
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +2 -1
msprobe/pytorch/pt_config.py +30 -29
msprobe/pytorch/service.py +114 -32
msprobe/visualization/builder/graph_builder.py +75 -10
msprobe/visualization/builder/msprobe_adapter.py +7 -6
msprobe/visualization/compare/graph_comparator.py +42 -38
msprobe/visualization/compare/mode_adapter.py +0 -19
msprobe/visualization/graph/base_node.py +11 -3
msprobe/visualization/graph/distributed_analyzer.py +71 -3
msprobe/visualization/graph/graph.py +0 -11
msprobe/visualization/graph/node_op.py +4 -3
msprobe/visualization/graph_service.py +4 -5
msprobe/visualization/utils.py +12 -35
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -205
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -75
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.3.0.dist-info}/top_level.txt +0 -0

msprobe/visualization/compare/graph_comparator.py CHANGED Viewed

@@ -17,12 +17,14 @@ import re
 from msprobe.visualization.builder.msprobe_adapter import compare_node, get_compare_mode, run_real_data
 from msprobe.visualization.utils import GraphConst, load_json_file, load_data_json_file, get_csv_df
 from msprobe.visualization.graph.graph import Graph, NodeOp
-from msprobe.visualization.graph.node_colors import NodeColors
 from msprobe.visualization.compare.mode_adapter import ModeAdapter
 from msprobe.core.common.const import Const
+from msprobe.core.common.decorator import recursion_depth_decorator
 class GraphComparator:
+    MAX_DEPTH = 1000
     def __init__(self, graphs, dump_path_param, args, mapping_dict=None):
         self.graph_n = graphs[0]
         self.graph_b = graphs[1]
@@ -41,7 +43,7 @@ class GraphComparator:
         else:
             self._compare_nodes(self.graph_n.root)
         self._postcompare()
     def add_compare_result_to_node(self, node, compare_result_list):
         """
         将比对结果添加到节点的输入输出数据中
@@ -66,43 +68,8 @@ class GraphComparator:
             self.ma.parse_result(node, [compare_in_dict, compare_out_dict]))
         node.data[GraphConst.JSON_INDEX_KEY] = precision_index
         node.data.update(other_dict)
-    def _parse_param(self, dump_path_param, output_path):
-        self.dump_path_param = dump_path_param
-        self.output_path = output_path
-        compare_mode = get_compare_mode(self.dump_path_param)
-        self.ma = ModeAdapter(compare_mode)
-        self.data_n_dict = load_data_json_file(dump_path_param.get('npu_json_path'))
-        self.data_b_dict = load_data_json_file(dump_path_param.get('bench_json_path'))
-        self.stack_json_data = load_json_file(dump_path_param.get('stack_json_path'))
-    def _postcompare(self):
-        self._handle_api_collection_index()
-        if not self.ma.compare_mode == GraphConst.REAL_DATA_COMPARE:
-            return
-        df = get_csv_df(True, self.ma.csv_data, self.ma.compare_mode)
-        df = run_real_data(self.dump_path_param, df, self.framework, True if self.mapping_dict else False)
-        compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
-        for node in self.ma.compare_nodes:
-            precision_index, _ = self.ma.parse_result(node, [compare_data_dict])
-            node.data[GraphConst.JSON_INDEX_KEY] = precision_index
-    def _handle_api_collection_index(self):
-        """
-        api集合的指标, md5模式使用集合中所有api最小的指标，statistics和tensor模式使用集合中所有api最大的指标
-        md5模式下指标为0代表最差，statistics和tensor模式下指标为1代表最差
-        """
-        for node in self.graph_n.root.subnodes:
-            if node.op == NodeOp.api_collection:
-                precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                    else GraphConst.MIN_INDEX_KEY
-                for api in node.subnodes:
-                    precision_index = min(precision_index,
-                                          api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
-                        if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                        else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
-                node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+    @recursion_depth_decorator('GraphComparator._compare_nodes', max_depth=MAX_DEPTH)
     def _compare_nodes(self, node_n):
         """
         递归遍历NPU树中的节点，如果在Bench中找到具有相同名称的节点，检查他们的祖先和参数信息，检查一致则及逆行精度数据对比
@@ -126,6 +93,7 @@ class GraphComparator:
         for subnode in node_n.subnodes:
             self._compare_nodes(subnode)
+    @recursion_depth_decorator('GraphComparator._compare_nodes_fuzzy', max_depth=MAX_DEPTH)
     def _compare_nodes_fuzzy(self, node_n):
         if node_n.op != NodeOp.function_api:
             # 模块经过模糊匹配
@@ -146,6 +114,42 @@ class GraphComparator:
         for sub_node in node_n.subnodes:
             self._compare_nodes_fuzzy(sub_node)
+    def _parse_param(self, dump_path_param, output_path):
+        self.dump_path_param = dump_path_param
+        self.output_path = output_path
+        compare_mode = get_compare_mode(self.dump_path_param)
+        self.ma = ModeAdapter(compare_mode)
+        self.data_n_dict = load_data_json_file(dump_path_param.get('npu_json_path'))
+        self.data_b_dict = load_data_json_file(dump_path_param.get('bench_json_path'))
+        self.stack_json_data = load_json_file(dump_path_param.get('stack_json_path'))
+    def _postcompare(self):
+        self._handle_api_collection_index()
+        if not self.ma.compare_mode == GraphConst.REAL_DATA_COMPARE:
+            return
+        df = get_csv_df(True, self.ma.csv_data, self.ma.compare_mode)
+        df = run_real_data(self.dump_path_param, df, self.framework, True if self.mapping_dict else False)
+        compare_data_dict = {row[0]: row.tolist() for _, row in df.iterrows()}
+        for node in self.ma.compare_nodes:
+            precision_index, _ = self.ma.parse_result(node, [compare_data_dict])
+            node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+    def _handle_api_collection_index(self):
+        """
+        api集合的指标, md5模式使用集合中所有api最小的指标，statistics和tensor模式使用集合中所有api最大的指标
+        md5模式下指标为0代表最差，statistics和tensor模式下指标为1代表最差
+        """
+        for node in self.graph_n.root.subnodes:
+            if node.op == NodeOp.api_collection:
+                precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                    else GraphConst.MIN_INDEX_KEY
+                for api in node.subnodes:
+                    precision_index = min(precision_index,
+                                          api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
+                        if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                        else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
+                node.data[GraphConst.JSON_INDEX_KEY] = precision_index
     def _get_and_add_result(self, node_n, node_b):
         compare_result_list = compare_node([node_n.id, node_b.id],
                                            [self.data_n_dict, self.data_b_dict],

msprobe/visualization/compare/mode_adapter.py CHANGED Viewed

@@ -14,7 +14,6 @@
 # limitations under the License.
 import json
-import math
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.visualization.utils import ToolTip, GraphConst, str2float
@@ -157,24 +156,6 @@ class ModeAdapter:
             return
         self.csv_data.extend(compare_result_list)
-    def add_error_key(self, node_data):
-        """
-        根据不同的模式进行提供不同错误信息
-        """
-        for key, value in node_data.items():
-            if not isinstance(value, dict):
-                continue
-            if self.compare_mode == GraphConst.SUMMARY_COMPARE:
-                message = [CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR,
-                           CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR]
-            elif self.compare_mode == GraphConst.REAL_DATA_COMPARE:
-                message = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
-            else:
-                # 输出件优化
-                message = []
-            value[GraphConst.ERROR_KEY] = message
-            node_data[key] = value
     def get_tool_tip(self):
         """
         用于前端展示字段的具体含义

msprobe/visualization/graph/base_node.py CHANGED Viewed

@@ -12,10 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from msprobe.core.overflow_check.level import OverflowLevel
-from msprobe.visualization.graph.node_op import NodeOp
 from msprobe.visualization.utils import GraphConst
 from msprobe.visualization.builder.msprobe_adapter import format_node_data, compare_data, compare_data_fuzzy
+from msprobe.core.common.log import logger
 class BaseNode:
@@ -34,6 +35,7 @@ class BaseNode:
         self.micro_step_id = None
         self.overflow_level = None
         self.matched_distributed = {}
+        self.batch_p2p_info = []
     def __str__(self):
         info = f'id:\t{self.id}'
@@ -92,8 +94,8 @@ class BaseNode:
         result = {
             'id': self.id,
             'node_type': self.op.value,
-            'output_data': format_node_data(self.output_data),
-            'input_data': format_node_data(self.input_data),
+            'output_data': format_node_data(self.output_data, self.id),
+            'input_data': format_node_data(self.input_data, self.id),
             'upnode': self.upnode.id if self.upnode else 'None',
             'subnodes': [node.id for node in self.subnodes],
             'matched_node_link': self.matched_node_link,
@@ -113,7 +115,13 @@ class BaseNode:
         """
         ancestors = []
         current_node = self.upnode
+        seen_nodes = set()
         while current_node:
+            if current_node.id in seen_nodes:
+                logger.warning(f'Detected a cycle in the node structure and cannot get node ancestors, '
+                               f'current node is {current_node.id}.')
+                return []
+            seen_nodes.add(current_node.id)
             ancestors.append(current_node.id)
             current_node = current_node.upnode
         return list(reversed(ancestors))

msprobe/visualization/graph/distributed_analyzer.py CHANGED Viewed

@@ -115,7 +115,9 @@ class DistributedAnalyzer:
                 if not node_id.startswith(Const.DISTRIBUTED) or node.matched_distributed:
                     continue
                 api_name, distributed_type = self._get_distributed_name_and_type(node_id)
-                if distributed_type == DistributedType.P2P:
+                if api_name == GraphConst.BATCH_P2P:
+                    self._batch_p2p_match(node, rank)
+                elif distributed_type == DistributedType.P2P:
                     self._p2p_match(node, rank, api_name)
                 else:
                     self._collective_match(node, rank, api_name)
@@ -138,12 +140,16 @@ class DistributedAnalyzer:
         for rank, graph in self.graphs.items():
             group_count = {}
             group_info = {}
+            batch_p2p_count = {}
             nodes = graph.node_map
             for node_id, node in nodes.items():
                 if not node_id.startswith(Const.DISTRIBUTED):
                     continue
                 api_name, distributed_type = self._get_distributed_name_and_type(node_id)
-                if distributed_type == DistributedType.P2P:
+                if api_name == GraphConst.BATCH_P2P:
+                    self._make_batch_p2p_mapping(node, rank, batch_p2p_count)
+                    continue
+                elif distributed_type == DistributedType.P2P:
                     config_info = self.config.get(api_name)
                     target_rank = self._get_target_rank(node, rank, config_info[1])
                     if target_rank is None:
@@ -162,7 +168,32 @@ class DistributedAnalyzer:
                 unique_group_id = group_id + Const.REPLACEMENT_CHARACTER + str(group_count.get(group_id))
                 group_info[unique_group_id] = node_id
                 group_info[node_id] = unique_group_id
-            self.group_node_mapping[rank] = group_info
+            if rank not in self.group_node_mapping:
+                self.group_node_mapping[rank] = {}
+            self.group_node_mapping[rank].update(group_info)
+    def _make_batch_p2p_mapping(self, node, rank, batch_p2p_count):
+        """
+        给batch_isend_irecv接口的每个p2p内容赋予唯一标识
+        """
+        if rank not in self.group_node_mapping:
+            self.group_node_mapping[rank] = {}
+        params = []
+        for info_dict in node.batch_p2p_info:
+            op = info_dict.get(GraphConst.OP)
+            target_rank = info_dict.get(GraphConst.PEER)
+            if op is None or target_rank is None:
+                logger.warning('Cannot get param op or peer.')
+                continue
+            group_id = op + Const.REPLACEMENT_CHARACTER + Const.RANK + str(target_rank) + \
+                       Const.REPLACEMENT_CHARACTER + info_dict.get(GraphConst.GROUP_ID, '')
+            batch_p2p_count[group_id] = batch_p2p_count.get(group_id, 0) + 1
+            # 例如: isend_rank0_5a4d31ad765260ba50eb190f1f9fd163_1
+            unique_group_id = group_id + Const.REPLACEMENT_CHARACTER + str(batch_p2p_count.get(group_id))
+            params.append(unique_group_id)
+            self.group_node_mapping.get(rank)[unique_group_id] = node.id
+        if params:
+            self.group_node_mapping.get(rank)[node.id] = params
     def _get_distributed_name_and_type(self, node_id):
         if Const.SEP not in node_id:
@@ -316,3 +347,40 @@ class DistributedAnalyzer:
         if nodes_info:
             matched_distributed['nodes_info'] = nodes_info
             node.matched_distributed = matched_distributed
+    def _batch_p2p_match(self, node, rank):
+        """
+        批量点对点匹配
+        针对torch.distributed.batch_isend_irecv接口，其入参是一个包含点对点通信信息的集合，需要遍历集合对每个点对点通信信息进行匹配
+        :param node: 当前集体通信节点
+        :param rank: 当前节点所属rank
+        :return:
+        """
+        unique_group_ids = self.group_node_mapping.get(rank, {}).get(node.id)
+        if not unique_group_ids:
+            return
+        matched_distributed = [] if len(unique_group_ids) > 1 else {}
+        for unique_group_id in unique_group_ids:
+            try:
+                id_info = unique_group_id.split(Const.REPLACEMENT_CHARACTER)
+                api_name = id_info[0]
+                target_api_name = self.config.get(api_name)[0]
+                target_rank = int(id_info[1].replace(Const.RANK, ''))
+            except Exception as e:
+                logger.warning(f'Failed to parse batch p2p parameter with error info: {e}.')
+                continue
+            target_node = self._get_target_node(rank, unique_group_id, api_name, target_rank, target_api_name)
+            if not target_node:
+                continue
+            communications_type = self.config.get(api_name)[2]
+            index = target_node.data.get(GraphConst.OVERFLOW_LEVEL, CompareConst.NAN) if self.overflow_check \
+                else target_node.data.get(GraphConst.JSON_INDEX_KEY, CompareConst.NAN)
+            matched_info = {
+                'communications_type': communications_type,
+                'nodes_info': {target_rank: [str(index), target_node.id]}
+            }
+            matched_distributed.append(matched_info) if isinstance(matched_distributed, list) \
+                else matched_distributed.update(matched_info)
+        if matched_distributed:
+            node.matched_distributed = matched_distributed

msprobe/visualization/graph/graph.py CHANGED Viewed

@@ -20,9 +20,6 @@ from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const
-MAX_RECUR_LEVEL = 100
 class Graph:
     def __init__(self, model_name, data_path='', dump_data=None):
         self.node_map = {}
@@ -67,7 +64,6 @@ class Graph:
         ancestors_b = node_b.get_ancestors()
         return node_b, ancestors_n, ancestors_b
     @staticmethod
     def fuzzy_match(node_n, node_b):
         if not node_n or not node_b or not node_n.fuzzy_eq(node_b):
@@ -76,13 +72,6 @@ class Graph:
         ancestors_b = node_b.get_ancestors()
         return node_b, ancestors_n, ancestors_b
-    @staticmethod
-    def dfs(node, result):
-        info = node.to_dict()
-        result[node.id] = info
-        for subnode in node.subnodes:
-            Graph.dfs(subnode, result)
     @staticmethod
     def split_nodes_by_micro_step(nodes):
         """

msprobe/visualization/graph/node_op.py CHANGED Viewed

@@ -16,6 +16,7 @@
 from enum import Enum
 import re
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
+from msprobe.core.common.log import logger
 class NodeOp(Enum):
@@ -23,7 +24,6 @@ class NodeOp(Enum):
     function_api = 1
     api_collection = 9
     @staticmethod
     def get_node_op(node_name: str):
         """
@@ -32,8 +32,9 @@ class NodeOp(Enum):
         for op in NodeOp:
             index = op.value
             if index < 0 or index >= len(op_patterns):
-                raise Exception("NodeOp and op_patterns in MsprobeAdapter do not match")
+                continue
             pattern = op_patterns[index]
             if re.match(pattern, node_name):
                 return op
-        raise Exception(f"Cannot parse node_name {node_name} into NodeOp")
+        logger.warning(f"Cannot parse node_name {node_name} into NodeOp, default parsing as module.")
+        return NodeOp.module

msprobe/visualization/graph_service.py CHANGED Viewed

@@ -16,8 +16,8 @@
 import os
 import time
 import json
-from msprobe.core.common.file_utils import (FileOpen, check_file_type, create_directory, FileChecker,
-                                            check_file_or_directory_path)
+from msprobe.core.common.file_utils import (check_file_type, create_directory, FileChecker,
+                                            check_file_or_directory_path, load_json)
 from msprobe.core.common.const import FileCheckConst, Const
 from msprobe.core.common.utils import CompareException
 from msprobe.core.overflow_check.checker import AnomalyDetector
@@ -159,7 +159,7 @@ def _compare_graph_steps(input_param, args):
     bench_steps = sorted(check_and_return_dir_contents(dump_step_b, Const.STEP))
     if npu_steps != bench_steps:
-        logger.error('The number of steps in the two runs are different. Unable to match the steps.')
+        logger.error('The number of steps in the two runs is different. Unable to match the steps.')
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     for folder_step in npu_steps:
@@ -220,8 +220,7 @@ def _graph_service_parser(parser):
 def _graph_service_command(args):
-    with FileOpen(args.input_path, "r") as file:
-        input_param = json.load(file)
+    input_param = load_json(args.input_path)
     npu_path = input_param.get("npu_path")
     bench_path = input_param.get("bench_path")
     check_file_or_directory_path(npu_path, isdir=True)

msprobe/visualization/utils.py CHANGED Viewed

@@ -42,14 +42,6 @@ def load_data_json_file(file_path):
     return load_json_file(file_path).get(GraphConst.DATA_KEY, {})
-def save_json_file(file_path, data):
-    """
-    保存json文件
-    """
-    with FileOpen(file_path, 'w') as f:
-        f.write(json.dumps(data, indent=4))
 def get_csv_df(stack_mode, csv_data, compare_mode):
     """
     调用acc接口写入csv
@@ -73,14 +65,6 @@ def str2float(percentage_str):
         return 0
-def is_integer(s):
-    try:
-        int(s)
-        return True
-    except Exception:
-        return False
 def check_directory_content(input_path):
     """
     检查input_path内容, 是否全是step{数字}命名的文件夹(例如step0), 或者全是rank{数字}命名的文件夹(例如rank0), 或者全是文件
@@ -143,18 +127,17 @@ class ToolTip:
         '当最大相对误差越接近0表示其计算的误差越小。'
         '当dump数据中存在0或Nan时，比对结果中最大相对误差则出现inf或Nan的情况，属于正常现象'
     )
-    SMALL_VALUE_TIP = '{}, 由于{}小于{}, 建议不参考此相对误差，请参考绝对误差'
 class GraphConst:
     CONSTRUCT_FILE = 'construct.json'
     DUMP_FILE = 'dump.json'
     STACK_FILE = 'stack.json'
-    GRAPH_FILE = 'graph.vis'
     ERROR_KEY = 'error_key'
     SUMMARY_COMPARE = 0
     MD5_COMPARE = 1
     REAL_DATA_COMPARE = 2
+    STRUCTURE_COMPARE = 3
     JSON_NPU_KEY = 'NPU'
     JSON_BENCH_KEY = 'Bench'
     JSON_TIP_KEY = 'ToolTip'
@@ -163,35 +146,22 @@ class GraphConst:
     JSON_DATA_KEY = 'dump_data_dir'
     JSON_TASK_KEY = 'task'
     DATA_KEY = 'data'
-    REAL_DATA_TH = 0.1
-    MAX_RELATIVE_ERR_TH = 0.5
     ROUND_TH = 6
     JSON_INDEX_KEY = 'precision_index'
     MATCHED_DISTRIBUTED = 'matched_distributed'
     OVERFLOW_LEVEL = 'overflow_level'
     MAX_INDEX_KEY = 1
     MIN_INDEX_KEY = 0
-    SUGGEST_KEY = 'text'
-    TAG_NA = 'na'
-    OUTPUT_INDEX_TWO = -2
-    OUTPUT_INDEX_THREE = -3
-    OUTPUT_MIN_LEN = 3
     INPUT = '.input.'
     OUTPUT = '.output.'
     STR_MAX_LEN = 50
-    SMALL_VALUE = 1e-3
     MD5_INDEX_LIST = [CompareConst.RESULT]
-    REAL_DATA_INDEX_LIST = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR,
-                            CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
-    SUMMARY_INDEX_LIST = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF,
-                          CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR,
-                          CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR]
-    VALUE_INDEX_LIST = [Const.MAX, Const.MIN, Const.MEAN, Const.NORM]
+    REAL_DATA_INDEX_LIST = CompareConst.ALL_COMPARE_INDEX
+    SUMMARY_INDEX_LIST = CompareConst.SUMMARY_COMPARE_INDEX
     APIS_BETWEEN_MODULES = 'Apis_Between_Modules'
     NULL = 'null'
     NONE = 'None'
     VALUE = 'value'
-    BRACE = '{}'
     DESCRIPTION = 'description'
     COLORS = 'Colors'
     MICRO_STEPS = 'MicroSteps'
@@ -200,13 +170,15 @@ class GraphConst:
     DUMP_MODE_TO_GRAPHCOMPARE_MODE_MAPPING = {
         Const.ALL: REAL_DATA_COMPARE,
         Const.SUMMARY: SUMMARY_COMPARE,
-        Const.MD5: MD5_COMPARE
+        Const.MD5: MD5_COMPARE,
+        Const.STRUCTURE: STRUCTURE_COMPARE
     }
     GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING = {
         REAL_DATA_COMPARE: Const.ALL,
         SUMMARY_COMPARE: Const.SUMMARY,
-        MD5_COMPARE: Const.MD5
+        MD5_COMPARE: Const.MD5,
+        STRUCTURE_COMPARE: Const.STRUCTURE
     }
     RANKS = 'ranks'
@@ -215,3 +187,8 @@ class GraphConst:
     SRC = 'src'
     DST = 'dst'
+    BATCH_P2P = 'batch_isend_irecv'
+    OP = 'op'
+    PEER = 'peer'
+    GROUP_ID = 'group_id'

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

mindstudio-probe 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl