mindstudio-probe 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (85)
  1. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +1 -1
  2. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/RECORD +85 -66
  3. msprobe/README.md +2 -2
  4. msprobe/core/common/const.py +34 -9
  5. msprobe/core/common/inplace_ops.yaml +1 -0
  6. msprobe/core/common/utils.py +14 -0
  7. msprobe/core/compare/layer_mapping/data_scope_parser.py +1 -1
  8. msprobe/core/compare/merge_result/merge_result.py +8 -7
  9. msprobe/core/compare/merge_result/utils.py +81 -0
  10. msprobe/core/compare/utils.py +10 -0
  11. msprobe/core/data_dump/data_collector.py +58 -13
  12. msprobe/core/data_dump/data_processor/base.py +92 -8
  13. msprobe/core/data_dump/data_processor/factory.py +3 -0
  14. msprobe/core/data_dump/data_processor/mindspore_processor.py +17 -4
  15. msprobe/core/data_dump/data_processor/pytorch_processor.py +58 -7
  16. msprobe/core/data_dump/json_writer.py +26 -8
  17. msprobe/docs/01.installation.md +25 -0
  18. msprobe/docs/02.config_introduction.md +14 -12
  19. msprobe/docs/03.config_examples.md +24 -0
  20. msprobe/docs/05.data_dump_PyTorch.md +34 -15
  21. msprobe/docs/06.data_dump_MindSpore.md +45 -22
  22. msprobe/docs/09.accuracy_checker_MindSpore.md +4 -2
  23. msprobe/docs/19.monitor.md +257 -260
  24. msprobe/docs/21.visualization_PyTorch.md +10 -0
  25. msprobe/docs/22.visualization_MindSpore.md +11 -0
  26. msprobe/docs/27.dump_json_instruction.md +24 -20
  27. msprobe/docs/28.debugger_save_instruction.md +94 -0
  28. msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
  29. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  30. msprobe/mindspore/__init__.py +1 -0
  31. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +26 -6
  32. msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
  33. msprobe/mindspore/api_accuracy_checker/compute_element.py +47 -1
  34. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
  35. msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
  36. msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
  37. msprobe/mindspore/common/utils.py +20 -2
  38. msprobe/mindspore/debugger/debugger_config.py +25 -2
  39. msprobe/mindspore/debugger/precision_debugger.py +25 -6
  40. msprobe/mindspore/dump/hook_cell/api_registry.py +2 -0
  41. msprobe/mindspore/dump/jit_dump.py +7 -6
  42. msprobe/mindspore/monitor/anomaly_detect.py +404 -0
  43. msprobe/mindspore/monitor/distributed/__init__.py +0 -0
  44. msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
  45. msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
  46. msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
  47. msprobe/mindspore/monitor/features.py +63 -0
  48. msprobe/mindspore/monitor/module_hook.py +821 -0
  49. msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
  50. msprobe/mindspore/monitor/utils.py +267 -0
  51. msprobe/mindspore/ms_config.py +8 -2
  52. msprobe/mindspore/service.py +95 -21
  53. msprobe/pytorch/__init__.py +0 -1
  54. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +1 -1
  55. msprobe/pytorch/bench_functions/apply_adam.py +215 -0
  56. msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
  57. msprobe/pytorch/bench_functions/mish.py +21 -0
  58. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
  59. msprobe/pytorch/bench_functions/sort_v2.py +21 -0
  60. msprobe/pytorch/common/utils.py +71 -0
  61. msprobe/pytorch/debugger/debugger_config.py +19 -9
  62. msprobe/pytorch/debugger/precision_debugger.py +14 -0
  63. msprobe/pytorch/dump/module_dump/module_processer.py +10 -30
  64. msprobe/pytorch/function_factory.py +7 -1
  65. msprobe/pytorch/hook_module/support_wrap_ops.yaml +2 -1
  66. msprobe/pytorch/hook_module/wrap_distributed.py +4 -0
  67. msprobe/pytorch/monitor/anomaly_detect.py +14 -29
  68. msprobe/pytorch/monitor/csv2tb.py +10 -12
  69. msprobe/pytorch/monitor/module_hook.py +123 -104
  70. msprobe/pytorch/monitor/module_metric.py +6 -6
  71. msprobe/pytorch/monitor/optimizer_collect.py +45 -63
  72. msprobe/pytorch/monitor/utils.py +8 -43
  73. msprobe/pytorch/pt_config.py +19 -22
  74. msprobe/pytorch/service.py +103 -24
  75. msprobe/visualization/builder/graph_builder.py +31 -5
  76. msprobe/visualization/builder/msprobe_adapter.py +7 -5
  77. msprobe/visualization/graph/base_node.py +3 -2
  78. msprobe/visualization/graph/distributed_analyzer.py +80 -3
  79. msprobe/visualization/graph/node_op.py +4 -2
  80. msprobe/visualization/graph_service.py +3 -4
  81. msprobe/visualization/utils.py +10 -2
  82. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
  83. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
  84. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
  85. {mindstudio_probe-1.2.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
@@ -26,6 +26,7 @@ from msprobe.visualization.utils import save_json_file, GraphConst


 class GraphBuilder:
     backward_pattern = re.compile(r"(\.backward\.)(\d+)$")
+    forward_pattern = re.compile(r"(\.forward\.)(\d+)$")
     # Matches names that start with an uppercase letter, followed by any letters, and end with "Template("
     template_pattern = re.compile(r'\b[A-Z][a-zA-Z]*Template\(')

@@ -113,12 +114,17 @@ class GraphBuilder:
         If the parent of a backward node is null, try to find a parent via the forward node of the same name
         """
         # Match names ending with ".backward." followed by one or more digits
-        backward_pattern = r"(\.backward\.)(\d+)$"
-        forward_pattern = r"(\.forward\.)(\d+)$"
-        if re.search(backward_pattern, subnode_id) and not upnode_id:
-            forward_upnode_id = construct_dict.get(re.sub(backward_pattern, r".forward.\2", subnode_id))
+        if GraphBuilder.backward_pattern.search(subnode_id) and not upnode_id:
+            forward_upnode_id = construct_dict.get(GraphBuilder.backward_pattern.sub(r".forward.\2", subnode_id))
             if forward_upnode_id:
-                new_upnode_id = re.sub(forward_pattern, r".backward.\2", forward_upnode_id)
+                new_upnode_id = GraphBuilder.forward_pattern.sub(r".backward.\2", forward_upnode_id)
+                if new_upnode_id in construct_dict:
+                    return new_upnode_id
+        # Match nodes ending with ".backward"
+        if subnode_id.endswith(Const.SEP + Const.BACKWARD) and not upnode_id:
+            forward_upnode_id = construct_dict.get(subnode_id.replace(Const.BACKWARD, Const.FORWARD))
+            if forward_upnode_id:
+                new_upnode_id = forward_upnode_id.replace(Const.FORWARD, Const.BACKWARD)
                 if new_upnode_id in construct_dict:
                     return new_upnode_id
         return upnode_id
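The parent lookup above relies on the dump's node-naming scheme, where a backward node id differs from its forward twin only in the `.backward.N` / `.forward.N` suffix. A minimal sketch of that substitution, with an invented `construct_dict` and node ids modelled on the naming examples quoted in this diff:

```python
import re

# The same patterns as the class attributes added above.
backward_pattern = re.compile(r"(\.backward\.)(\d+)$")
forward_pattern = re.compile(r"(\.forward\.)(\d+)$")

# Hypothetical construct.json content: child node id -> parent node id.
construct_dict = {
    "Tensor.permute.1.forward.0": "Module.module.GPTModel.forward.0",
    "Module.module.GPTModel.backward.0": None,
}


def find_backward_parent(subnode_id, upnode_id):
    """If a backward node has no parent, borrow the parent of its forward twin."""
    if backward_pattern.search(subnode_id) and not upnode_id:
        forward_upnode_id = construct_dict.get(backward_pattern.sub(r".forward.\2", subnode_id))
        if forward_upnode_id:
            new_upnode_id = forward_pattern.sub(r".backward.\2", forward_upnode_id)
            if new_upnode_id in construct_dict:
                return new_upnode_id
    return upnode_id


# "Tensor.permute.1.backward.0" -> forward twin "Tensor.permute.1.forward.0",
# whose parent "Module.module.GPTModel.forward.0" is rewritten to its backward form.
print(find_backward_parent("Tensor.permute.1.backward.0", None))
# Module.module.GPTModel.backward.0
```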
@@ -148,6 +154,8 @@ class GraphBuilder:
         input_data, output_data = get_input_output(node_data, node.id)
         # Update the data
         node.set_input_output(input_data, output_data)
+        if GraphConst.BATCH_P2P in name:
+            GraphBuilder._extract_batch_p2p_info(node, node_data)
         # Backward nodes use the stack information of the corresponding forward node
         # Module naming example: Module.module.module.GPTModel.backward.0; API naming example: Tensor.permute.1.backward
         if (not node_stack_info and
@@ -164,6 +172,24 @@ class GraphBuilder:
             node.add_upnode(upnode)
         return node

+    @staticmethod
+    def _is_valid_batch_p2p_output(param_list):
+        if not isinstance(param_list, list) or not param_list:
+            return False
+        if not isinstance(param_list[0], list) or not param_list[0]:
+            return False
+        return True
+
+    @staticmethod
+    def _extract_batch_p2p_info(node, node_data):
+        param_list = node_data.get(Const.OUTPUT, [])
+        # Data format: "output": [[{param1}, {param2}, ...]]
+        if GraphBuilder._is_valid_batch_p2p_output(param_list):
+            for param in param_list[0]:
+                info = {GraphConst.OP: param.get(GraphConst.OP), GraphConst.PEER: param.get(GraphConst.PEER),
+                        GraphConst.GROUP_ID: param.get(GraphConst.GROUP_ID)}
+                node.batch_p2p_info.append(info)
+
     @staticmethod
     def _collect_apis_between_modules(graph):
         """
@@ -23,7 +23,7 @@ from msprobe.core.compare.acc_compare import ModeConfig
 # Rules used to parse a node name into the corresponding NodeOp
 op_patterns = [
     # NodeOp.module
-    r'^(Module.|Cell.)',
+    r'^(Module.|Cell.|optimizer|clip_grad)',
     # NodeOp.function_api
     r'^(Tensor.|Torch.|Functional.|NPU.|VF.|Distributed.|Aten.|Mint.|Primitive.|Jit.|MintFunctional.)'
 ]
@@ -57,8 +57,8 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
         from msprobe.pytorch.compare.pt_compare import PTComparator
         return PTComparator(mode_config).do_multi_process(dump_path_param, csv_path)
     else:
-        from msprobe.mindspore.compare.ms_compare import MSComparator
-        ms_comparator = MSComparator(mode_config)
+        from msprobe.mindspore.compare.ms_compare import MSComparator, MappingConfig
+        ms_comparator = MSComparator(mode_config, MappingConfig())
         ms_comparator.cross_frame = is_cross_frame
         return ms_comparator.do_multi_process(dump_path_param, csv_path)

@@ -120,11 +120,13 @@ def compare_data_fuzzy(data_dict_list1, data_dict_list2):
     return True


-def format_node_data(data_dict):
+def format_node_data(data_dict, node_id=None):
     """
-    Output node data in batch
+    Remove fields that should not be displayed from the node data
     """
     del_list = ['requires_grad', 'full_op_name']
+    if node_id and GraphConst.BATCH_P2P in node_id:
+        del_list.extend(['op', 'peer', 'tag', 'group_id'])
     for _, value in data_dict.items():
         if not isinstance(value, dict):
             continue
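`format_node_data` now also strips the batch p2p bookkeeping fields when the node id contains `batch_isend_irecv`. A self-contained sketch of the intended behaviour; the removal loop and the sample data are assumptions, since the rest of the function is not shown in this hunk:

```python
BATCH_P2P = 'batch_isend_irecv'  # mirrors the GraphConst constant added later in this diff


def format_node_data(data_dict, node_id=None):
    """Drop fields that should not be rendered for a node."""
    del_list = ['requires_grad', 'full_op_name']
    if node_id and BATCH_P2P in node_id:
        del_list.extend(['op', 'peer', 'tag', 'group_id'])
    for _, value in data_dict.items():
        if not isinstance(value, dict):
            continue
        for field in del_list:  # assumed removal step; not part of the shown hunk
            value.pop(field, None)
    return data_dict


data_dict = {'output.0': {'Max': 1.0, 'requires_grad': False, 'op': 'isend', 'peer': 1}}
print(format_node_data(data_dict, 'Distributed.batch_isend_irecv.0.forward'))
# {'output.0': {'Max': 1.0}}
```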
@@ -34,6 +34,7 @@ class BaseNode:
         self.micro_step_id = None
         self.overflow_level = None
         self.matched_distributed = {}
+        self.batch_p2p_info = []

     def __str__(self):
         info = f'id:\t{self.id}'
@@ -92,8 +93,8 @@ class BaseNode:
         result = {
             'id': self.id,
             'node_type': self.op.value,
-            'output_data': format_node_data(self.output_data),
-            'input_data': format_node_data(self.input_data),
+            'output_data': format_node_data(self.output_data, self.id),
+            'input_data': format_node_data(self.input_data, self.id),
             'upnode': self.upnode.id if self.upnode else 'None',
             'subnodes': [node.id for node in self.subnodes],
             'matched_node_link': self.matched_node_link,
@@ -107,6 +107,15 @@ class DistributedAnalyzer:
             return None, None
         return group_ranks, group_id

+    @staticmethod
+    def _get_batch_group_info(node, rank):
+        for data in node.input_data.values():
+            group_id = data.get('group_id')
+            if group_id is not None:
+                return group_id
+        logger.warning(f'The group_id of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
+        return None
+
     def distributed_match(self):
         for rank, graph in self.graphs.items():
             nodes = graph.node_map
@@ -115,7 +124,9 @@ class DistributedAnalyzer:
                 if not node_id.startswith(Const.DISTRIBUTED) or node.matched_distributed:
                     continue
                 api_name, distributed_type = self._get_distributed_name_and_type(node_id)
-                if distributed_type == DistributedType.P2P:
+                if api_name == GraphConst.BATCH_P2P:
+                    self._batch_p2p_match(node, rank)
+                elif distributed_type == DistributedType.P2P:
                     self._p2p_match(node, rank, api_name)
                 else:
                     self._collective_match(node, rank, api_name)
@@ -138,12 +149,16 @@ class DistributedAnalyzer:
         for rank, graph in self.graphs.items():
             group_count = {}
             group_info = {}
+            batch_p2p_count = {}
             nodes = graph.node_map
             for node_id, node in nodes.items():
                 if not node_id.startswith(Const.DISTRIBUTED):
                     continue
                 api_name, distributed_type = self._get_distributed_name_and_type(node_id)
-                if distributed_type == DistributedType.P2P:
+                if api_name == GraphConst.BATCH_P2P:
+                    self._make_batch_p2p_mapping(node, rank, batch_p2p_count)
+                    continue
+                elif distributed_type == DistributedType.P2P:
                     config_info = self.config.get(api_name)
                     target_rank = self._get_target_rank(node, rank, config_info[1])
                     if target_rank is None:
@@ -162,7 +177,32 @@ class DistributedAnalyzer:
                 unique_group_id = group_id + Const.REPLACEMENT_CHARACTER + str(group_count.get(group_id))
                 group_info[unique_group_id] = node_id
                 group_info[node_id] = unique_group_id
-            self.group_node_mapping[rank] = group_info
+            if rank not in self.group_node_mapping:
+                self.group_node_mapping[rank] = {}
+            self.group_node_mapping[rank].update(group_info)
+
+    def _make_batch_p2p_mapping(self, node, rank, batch_p2p_count):
+        """
+        Assign a unique identifier to each p2p entry of the batch_isend_irecv API
+        """
+        if rank not in self.group_node_mapping:
+            self.group_node_mapping[rank] = {}
+        params = []
+        for info_dict in node.batch_p2p_info:
+            op = info_dict.get(GraphConst.OP)
+            target_rank = info_dict.get(GraphConst.PEER)
+            if op is None or target_rank is None:
+                logger.warning('Cannot get param op or peer.')
+                continue
+            group_id = op + Const.REPLACEMENT_CHARACTER + Const.RANK + str(target_rank) + \
+                       Const.REPLACEMENT_CHARACTER + info_dict.get(GraphConst.GROUP_ID, '')
+            batch_p2p_count[group_id] = batch_p2p_count.get(group_id, 0) + 1
+            # For example: isend_rank0_5a4d31ad765260ba50eb190f1f9fd163_1
+            unique_group_id = group_id + Const.REPLACEMENT_CHARACTER + str(batch_p2p_count.get(group_id))
+            params.append(unique_group_id)
+            self.group_node_mapping.get(rank)[unique_group_id] = node.id
+        if params:
+            self.group_node_mapping.get(rank)[node.id] = params

     def _get_distributed_name_and_type(self, node_id):
         if Const.SEP not in node_id:
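`_make_batch_p2p_mapping` gives every p2p entry of a `batch_isend_irecv` call a unique id built from the op name, the peer rank, the communication group id, and a per-group counter. A small sketch of that scheme, using `'_'` and `'rank'` as stand-ins for `Const.REPLACEMENT_CHARACTER` and `Const.RANK`:

```python
SEP, RANK = '_', 'rank'


def make_unique_ids(batch_p2p_info, counts):
    ids = []
    for info in batch_p2p_info:
        group_id = info['op'] + SEP + RANK + str(info['peer']) + SEP + info.get('group_id', '')
        counts[group_id] = counts.get(group_id, 0) + 1
        # e.g. isend_rank0_5a4d31ad765260ba50eb190f1f9fd163_1
        ids.append(group_id + SEP + str(counts[group_id]))
    return ids


counts = {}
info = [{'op': 'isend', 'peer': 0, 'group_id': '5a4d31ad765260ba50eb190f1f9fd163'}]
print(make_unique_ids(info, counts))
# ['isend_rank0_5a4d31ad765260ba50eb190f1f9fd163_1']
```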
@@ -316,3 +356,40 @@ class DistributedAnalyzer:
         if nodes_info:
             matched_distributed['nodes_info'] = nodes_info
         node.matched_distributed = matched_distributed
+
+    def _batch_p2p_match(self, node, rank):
+        """
+        Batch point-to-point matching
+
+        For the torch.distributed.batch_isend_irecv API, the input is a collection of point-to-point communication entries, so the collection is traversed and each entry is matched individually
+        :param node: the current collective-communication node
+        :param rank: the rank the current node belongs to
+        :return:
+        """
+        unique_group_ids = self.group_node_mapping.get(rank, {}).get(node.id)
+        if not unique_group_ids:
+            return
+        matched_distributed = [] if len(unique_group_ids) > 1 else {}
+        for unique_group_id in unique_group_ids:
+            try:
+                id_info = unique_group_id.split(Const.REPLACEMENT_CHARACTER)
+                api_name = id_info[0]
+                target_api_name = self.config.get(api_name)[0]
+                target_rank = int(id_info[1].replace(Const.RANK, ''))
+            except Exception as e:
+                logger.warning(f'Failed to parsing batch p2p parameter with error info: {e}.')
+                continue
+            target_node = self._get_target_node(rank, unique_group_id, api_name, target_rank, target_api_name)
+            if not target_node:
+                continue
+            communications_type = self.config.get(api_name)[2]
+            index = target_node.data.get(GraphConst.OVERFLOW_LEVEL, CompareConst.NAN) if self.overflow_check \
+                else target_node.data.get(GraphConst.JSON_INDEX_KEY, CompareConst.NAN)
+            matched_info = {
+                'communications_type': communications_type,
+                'nodes_info': {target_rank: [str(index), target_node.id]}
+            }
+            matched_distributed.append(matched_info) if isinstance(matched_distributed, list) \
+                else matched_distributed.update(matched_info)
+        if matched_distributed:
+            node.matched_distributed = matched_distributed
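`_batch_p2p_match` then works in the opposite direction: it splits a unique id back into the op name and peer rank and looks up the counterpart operation on that rank. A sketch of the parsing step; the `'_'` separator, the `'rank'` prefix, and the isend/irecv counterpart mapping are assumptions for illustration:

```python
SEP, RANK = '_', 'rank'
CONFIG = {'isend': ('irecv',), 'irecv': ('isend',)}  # api_name -> (target_api_name, ...)


def parse_unique_id(unique_group_id):
    id_info = unique_group_id.split(SEP)
    api_name = id_info[0]
    target_api_name = CONFIG[api_name][0]
    target_rank = int(id_info[1].replace(RANK, ''))
    return api_name, target_api_name, target_rank


# An isend towards rank 1 is matched against an irecv on rank 1.
print(parse_unique_id('isend_rank1_5a4d31ad765260ba50eb190f1f9fd163_1'))
# ('isend', 'irecv', 1)
```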
@@ -16,6 +16,7 @@
 from enum import Enum
 import re
 from msprobe.visualization.builder.msprobe_adapter import op_patterns
+from msprobe.core.common.log import logger


 class NodeOp(Enum):
@@ -32,8 +33,9 @@ class NodeOp(Enum):
         for op in NodeOp:
             index = op.value
             if index < 0 or index >= len(op_patterns):
-                raise Exception("NodeOp and op_patterns in MsprobeAdapter do not match")
+                continue
             pattern = op_patterns[index]
             if re.match(pattern, node_name):
                 return op
-        raise Exception(f"Cannot parse node_name {node_name} into NodeOp")
+        logger.warning(f"Cannot parsing node_name {node_name} into NodeOp, default parsing as module.")
+        return NodeOp.module
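Together with the widened module pattern in msprobe_adapter.py, node-name resolution is now more forgiving: optimizer and clip_grad nodes resolve to the module op, and anything unmatched falls back to module with a warning instead of raising. A condensed sketch; the enum values mirror the `# NodeOp.module` / `# NodeOp.function_api` indices in the pattern list, and the sample names are invented:

```python
import re
from enum import Enum

op_patterns = [
    r'^(Module.|Cell.|optimizer|clip_grad)',  # NodeOp.module
    r'^(Tensor.|Torch.|Functional.|NPU.|VF.|Distributed.|Aten.|Mint.|Primitive.|Jit.|MintFunctional.)',  # NodeOp.function_api
]


class NodeOp(Enum):
    module = 0
    function_api = 1


def get_node_op(node_name):
    for op in NodeOp:
        if re.match(op_patterns[op.value], node_name):
            return op
    return NodeOp.module  # fall back instead of raising


print(get_node_op('optimizer.AdamW.step.0'))    # NodeOp.module (new pattern)
print(get_node_op('Tensor.permute.1.forward'))  # NodeOp.function_api
print(get_node_op('SomeCustomName'))            # NodeOp.module (fallback)
```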
@@ -16,8 +16,8 @@
 import os
 import time
 import json
-from msprobe.core.common.file_utils import (FileOpen, check_file_type, create_directory, FileChecker,
-                                            check_file_or_directory_path)
+from msprobe.core.common.file_utils import (check_file_type, create_directory, FileChecker,
+                                            check_file_or_directory_path, load_json)
 from msprobe.core.common.const import FileCheckConst, Const
 from msprobe.core.common.utils import CompareException
 from msprobe.core.overflow_check.checker import AnomalyDetector
@@ -220,8 +220,7 @@ def _graph_service_parser(parser):


 def _graph_service_command(args):
-    with FileOpen(args.input_path, "r") as file:
-        input_param = json.load(file)
+    input_param = load_json(args.input_path)
     npu_path = input_param.get("npu_path")
     bench_path = input_param.get("bench_path")
     check_file_or_directory_path(npu_path, isdir=True)
@@ -155,6 +155,7 @@ class GraphConst:
     SUMMARY_COMPARE = 0
     MD5_COMPARE = 1
     REAL_DATA_COMPARE = 2
+    STRUCTURE_COMPARE = 3
     JSON_NPU_KEY = 'NPU'
     JSON_BENCH_KEY = 'Bench'
     JSON_TIP_KEY = 'ToolTip'
@@ -200,13 +201,15 @@ class GraphConst:
     DUMP_MODE_TO_GRAPHCOMPARE_MODE_MAPPING = {
         Const.ALL: REAL_DATA_COMPARE,
         Const.SUMMARY: SUMMARY_COMPARE,
-        Const.MD5: MD5_COMPARE
+        Const.MD5: MD5_COMPARE,
+        Const.STRUCTURE: STRUCTURE_COMPARE
     }

     GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING = {
         REAL_DATA_COMPARE: Const.ALL,
         SUMMARY_COMPARE: Const.SUMMARY,
-        MD5_COMPARE: Const.MD5
+        MD5_COMPARE: Const.MD5,
+        STRUCTURE_COMPARE: Const.STRUCTURE
     }

     RANKS = 'ranks'
@@ -215,3 +218,8 @@ class GraphConst:

     SRC = 'src'
     DST = 'dst'
+
+    BATCH_P2P = 'batch_isend_irecv'
+    OP = 'op'
+    PEER = 'peer'
+    GROUP_ID = 'group_id'