mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
  2. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
  3. msprobe/README.md +6 -6
  4. msprobe/core/common/const.py +98 -41
  5. msprobe/core/common/db_manager.py +256 -0
  6. msprobe/core/common/file_utils.py +28 -5
  7. msprobe/core/common/log.py +7 -0
  8. msprobe/core/common/megatron_utils.py +59 -0
  9. msprobe/core/common/parallel_state.py +193 -0
  10. msprobe/core/common/utils.py +20 -13
  11. msprobe/core/common_config.py +5 -0
  12. msprobe/core/compare/acc_compare.py +140 -93
  13. msprobe/core/compare/check.py +13 -0
  14. msprobe/core/compare/compare_cli.py +64 -6
  15. msprobe/core/compare/config.py +10 -8
  16. msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
  17. msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
  18. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  19. msprobe/core/compare/find_first/__init__.py +0 -0
  20. msprobe/core/compare/find_first/analyzer.py +282 -0
  21. msprobe/core/compare/find_first/data_processor.py +35 -0
  22. msprobe/core/compare/find_first/graph.py +188 -0
  23. msprobe/core/compare/find_first/utils.py +189 -0
  24. msprobe/core/compare/highlight.py +74 -101
  25. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
  26. msprobe/core/compare/merge_result/merge_result.py +2 -2
  27. msprobe/core/compare/multiprocessing_compute.py +45 -28
  28. msprobe/core/compare/npy_compare.py +7 -10
  29. msprobe/core/compare/utils.py +338 -130
  30. msprobe/core/config_check/checkers/dataset_checker.py +2 -1
  31. msprobe/core/config_check/checkers/env_args_checker.py +5 -5
  32. msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
  33. msprobe/core/config_check/checkers/pip_checker.py +4 -3
  34. msprobe/core/config_check/checkers/random_checker.py +3 -3
  35. msprobe/core/config_check/checkers/weights_checker.py +2 -1
  36. msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
  37. msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
  38. msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
  39. msprobe/core/config_check/utils/utils.py +10 -0
  40. msprobe/core/data_dump/api_registry.py +49 -30
  41. msprobe/core/data_dump/data_collector.py +71 -29
  42. msprobe/core/data_dump/data_processor/base.py +2 -0
  43. msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
  44. msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
  45. msprobe/core/data_dump/json_writer.py +81 -7
  46. msprobe/core/data_dump/scope.py +4 -6
  47. msprobe/core/hook_manager.py +129 -70
  48. msprobe/core/monitor/csv2db.py +361 -0
  49. msprobe/core/monitor/db_utils.py +278 -0
  50. msprobe/core/monitor/utils.py +35 -1
  51. msprobe/core/service.py +31 -39
  52. msprobe/core/single_save/single_comparator.py +16 -3
  53. msprobe/docs/01.installation.md +51 -19
  54. msprobe/docs/02.config_introduction.md +16 -20
  55. msprobe/docs/03.config_examples.md +26 -0
  56. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  57. msprobe/docs/05.data_dump_PyTorch.md +6 -2
  58. msprobe/docs/06.data_dump_MindSpore.md +44 -7
  59. msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
  60. msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
  61. msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
  62. msprobe/docs/14.data_parse_PyTorch.md +1 -1
  63. msprobe/docs/19.monitor.md +94 -7
  64. msprobe/docs/21.visualization_PyTorch.md +71 -101
  65. msprobe/docs/22.visualization_MindSpore.md +69 -119
  66. msprobe/docs/23.generate_operator_PyTorch.md +1 -1
  67. msprobe/docs/25.tool_function_introduction.md +0 -1
  68. msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
  69. msprobe/docs/28.debugger_save_instruction.md +184 -81
  70. msprobe/docs/29.data_dump_MSAdapter.md +6 -0
  71. msprobe/docs/31.config_check.md +4 -2
  72. msprobe/docs/36.calculation_result_change.md +75 -0
  73. msprobe/docs/FAQ.md +22 -1
  74. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
  75. msprobe/docs/img/compare_result.png +0 -0
  76. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  77. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  78. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  79. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  80. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  81. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  82. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  83. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  84. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  85. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  86. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  87. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  88. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  89. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  90. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
  91. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  92. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  93. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
  94. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  95. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  96. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
  97. msprobe/mindspore/__init__.py +1 -1
  98. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
  99. msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
  100. msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
  101. msprobe/mindspore/cell_processor.py +64 -25
  102. msprobe/mindspore/common/utils.py +51 -7
  103. msprobe/mindspore/compare/common_dir_compare.py +45 -37
  104. msprobe/mindspore/compare/ms_compare.py +10 -2
  105. msprobe/mindspore/compare/ms_graph_compare.py +47 -52
  106. msprobe/mindspore/debugger/debugger_config.py +18 -7
  107. msprobe/mindspore/debugger/precision_debugger.py +16 -12
  108. msprobe/mindspore/dump/cell_dump_process.py +130 -68
  109. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
  110. msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
  111. msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
  112. msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
  113. msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
  114. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
  115. msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
  116. msprobe/mindspore/exception_dump/__init__.py +0 -0
  117. msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
  118. msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
  119. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
  120. msprobe/mindspore/mindspore_service.py +2 -2
  121. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
  122. msprobe/mindspore/monitor/features.py +82 -0
  123. msprobe/mindspore/monitor/module_hook.py +168 -10
  124. msprobe/mindspore/monitor/utils.py +27 -1
  125. msprobe/mindspore/ms_config.py +12 -4
  126. msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
  127. msprobe/mindspore/task_handler_factory.py +3 -1
  128. msprobe/nan_analyze/graph.py +1 -1
  129. msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
  130. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
  131. msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
  132. msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
  133. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
  134. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
  135. msprobe/pytorch/common/utils.py +1 -21
  136. msprobe/pytorch/compare/pt_compare.py +10 -2
  137. msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
  138. msprobe/pytorch/compare/utils.py +2 -1
  139. msprobe/pytorch/debugger/debugger_config.py +18 -23
  140. msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
  141. msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
  142. msprobe/pytorch/free_benchmark/main.py +7 -4
  143. msprobe/pytorch/hook_module/api_register.py +62 -24
  144. msprobe/pytorch/hook_module/hook_module.py +9 -29
  145. msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
  146. msprobe/pytorch/hook_module/script_wrapper.py +140 -0
  147. msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
  148. msprobe/pytorch/monitor/csv2tb.py +1 -1
  149. msprobe/pytorch/monitor/features.py +94 -0
  150. msprobe/pytorch/monitor/module_hook.py +221 -81
  151. msprobe/pytorch/monitor/module_metric.py +27 -1
  152. msprobe/pytorch/monitor/optimizer_collect.py +109 -4
  153. msprobe/pytorch/online_dispatch/dispatch.py +42 -24
  154. msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
  155. msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
  156. msprobe/pytorch/pt_config.py +2 -51
  157. msprobe/pytorch/pytorch_service.py +7 -14
  158. msprobe/visualization/builder/graph_builder.py +192 -63
  159. msprobe/visualization/builder/graph_merger.py +986 -0
  160. msprobe/visualization/builder/msprobe_adapter.py +17 -15
  161. msprobe/visualization/compare/graph_comparator.py +26 -16
  162. msprobe/visualization/db_utils.py +252 -0
  163. msprobe/visualization/graph/base_node.py +2 -22
  164. msprobe/visualization/graph/distributed_analyzer.py +12 -12
  165. msprobe/visualization/graph/graph.py +44 -16
  166. msprobe/visualization/graph_service.py +143 -59
  167. msprobe/visualization/utils.py +103 -4
  168. msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
  169. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
  170. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
  171. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
  172. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
  173. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
  174. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
  175. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
  176. msprobe/pytorch/attl_manager.py +0 -65
  177. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
  178. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
  179. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
  180. {mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
  181. /msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0
msprobe/visualization/builder/msprobe_adapter.py

@@ -28,7 +28,7 @@ op_patterns = [
     # NodeOp.module
     r'^(Module.|Cell.|optimizer|clip_grad)',
     # NodeOp.function_api
-    r'^(Tensor.|Torch.|Functional.|NPU.|VF.|Distributed.|Aten.|Mint.|Primitive.|Jit.|MintFunctional.)'
+    r'^(Tensor.|Torch.|Functional.|NPU.|VF.|Distributed.|Aten.|Mint.|Primitive.|Jit.|MintFunctional.|MindSpeed.)'
 ]
 
 
@@ -54,7 +54,13 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
         framework: the framework type, pytorch or mindspore
         is_cross_frame: whether to run a cross-framework comparison; only mindspore vs. pytorch is supported, with pytorch as the baseline
     """
-    mode_config = ModeConfig(stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.ALL)
+    config_dict = {
+        'stack_mode': False,
+        'auto_analyze': True,
+        'fuzzy_match': False,
+        'dump_mode': Const.ALL
+    }
+    mode_config = ModeConfig(**config_dict)
 
     if framework == Const.PT_FRAMEWORK:
         from msprobe.pytorch.compare.pt_compare import read_real_data
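Note: the change above only swaps keyword arguments for a dict unpacked into ModeConfig. A tiny sketch of the ** unpacking semantics, using a stand-in function since ModeConfig's full signature is not shown in this diff:

    # Stand-in for ModeConfig; ** unpacks the dict entries as keyword arguments.
    def mode_config_stub(stack_mode, auto_analyze, fuzzy_match, dump_mode):
        return (stack_mode, auto_analyze, fuzzy_match, dump_mode)

    config_dict = {'stack_mode': False, 'auto_analyze': True, 'fuzzy_match': False, 'dump_mode': 'all'}
    assert mode_config_stub(**config_dict) == mode_config_stub(False, True, False, 'all')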
@@ -125,7 +131,7 @@ def format_node_data(data_dict, node_id=None, compare_mode=None):
     """
     Remove fields from the node data that do not need to be displayed
     """
-    del_list = ['requires_grad', 'full_op_name']
+    del_list = ['state', 'full_op_name']
     if GraphConst.MD5_COMPARE != compare_mode:
         del_list.append(Const.MD5)
     if node_id and GraphConst.BATCH_P2P in node_id:
@@ -140,31 +146,27 @@ def format_node_data(data_dict, node_id=None, compare_mode=None):
     return data_dict
 
 
-def compare_node(node_ids, data_dicts, stack_json_data, compare_mode):
+def compare_node(node_n, node_b, compare_mode):
     """
     Call get_accuracy in acc_compare.py to obtain the accuracy comparison metrics.
     Real-data comparison mode cannot obtain the metrics this way and needs to call the multiprocessing comparison interface.
     Returns: a list containing the parameter info and the comparison metrics (except in real-data comparison mode)
     """
-    merge_n = _parse_node(node_ids[0], data_dicts[0], stack_json_data, compare_mode)
-    merge_b = _parse_node(node_ids[1], data_dicts[1], stack_json_data, compare_mode)
-    result = []
     dump_mode = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(compare_mode)
+    merge_n = _parse_node(node_n, dump_mode)
+    merge_b = _parse_node(node_b, dump_mode)
+    result = []
     get_accuracy(result, merge_n, merge_b, dump_mode)
     return result
 
 
-def _parse_node(node_id, data_dict, stack_json_data, compare_mode):
+def _parse_node(node, dump_mode):
     """
     Convert a node so that it can be used as an input to get_accuracy in acc_compare.py
     """
-    dump_mode = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(compare_mode)
-    op_parsed_list = read_op(data_dict.get(node_id, {}), node_id)
-    if node_id in stack_json_data:
-        op_parsed_list.append(
-            {'full_op_name': node_id, 'full_info': stack_json_data[node_id]})
-    else:
-        op_parsed_list.append({'full_op_name': node_id, 'full_info': None})
+    op_parsed_list = []
+    op_parsed_list.extend(node.input_data.values())
+    op_parsed_list.extend(node.output_data.values())
     result = merge_tensor(op_parsed_list, dump_mode)
     if not result:
         result['op_name'] = []
msprobe/visualization/compare/graph_comparator.py

@@ -35,13 +35,15 @@ class GraphComparator:
         self.fuzzy_match = args.fuzzy_match
         self.pattern = re.compile(r'\.\d+\.')
         self.is_cross_framework = is_cross_framework
+        self.parallel_merge = args.parallel_merge if hasattr(args, 'parallel_merge') else False
+        self.rank_pattern = re.compile(r"_rank\d+")
 
     def compare(self):
         """
        Comparison function, called on its own after initialization. Comparison results are written into graph_n.
         """
         if self.fuzzy_match:
-            self._compare_nodes_fuzzy(self.graph_n.root)
+            self._compare_nodes_fuzzy(self.graph_n.root, False if self.parallel_merge else True)
         else:
             self._compare_nodes(self.graph_n.root)
         self._postcompare()
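Side note: the hasattr/ternary above is the long form of getattr with a default, which tolerates older argument namespaces that lack the parallel_merge flag. A minimal illustration (the Namespace here is a hypothetical stand-in for the real args object):

    from argparse import Namespace

    args = Namespace()  # hypothetical args without the parallel_merge flag
    # Equivalent to: args.parallel_merge if hasattr(args, 'parallel_merge') else False
    parallel_merge = getattr(args, 'parallel_merge', False)
    print(parallel_merge)  # False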
@@ -98,11 +100,12 @@ class GraphComparator:
         while node_list:
             compare_single_node(node_list.pop(0))
 
-    def _compare_nodes_fuzzy(self, node_root):
+    def _compare_nodes_fuzzy(self, node_root, check_shape=True):
         def compare_single_nodes_fuzzy(node_n):
             if node_n.op != NodeOp.function_api:
                 # modules are matched via fuzzy matching
-                node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(node_n, self.graph_b.node_map.get(node_n.id))
+                node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(node_n, self.graph_b.node_map.get(node_n.id),
+                                                                     check_shape)
                 if node_b:
                     self._process_matched_nodes(node_n, node_b, ancestors_n, ancestors_b)
                 # all apis inside the two matched modules are matched by identical name plus call order within the module, ignoring the dump call count
@@ -113,7 +116,7 @@ class GraphComparator:
                 if not api_node_n:
                     continue
                 api_node_b, ancestors_n, ancestors_b = Graph.fuzzy_match(
-                    api_node_n, self.graph_b.node_map.get(recount_result_b.get(recount_node_id)))
+                    api_node_n, self.graph_b.node_map.get(recount_result_b.get(recount_node_id)), check_shape)
                 if api_node_b:
                     self._process_matched_nodes(api_node_n, api_node_b, ancestors_n, ancestors_b)
             node_list.extend(node_n.subnodes)
@@ -147,21 +150,26 @@ class GraphComparator:
        The metric of an api collection: md5 mode uses the smallest metric among all apis in the collection; statistics and tensor modes use the largest.
        In md5 mode a metric of 0 is the worst; in statistics and tensor modes a metric of 1 is the worst.
         """
+        def handle_api_collection_index(api_collection_node):
+            precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                else GraphConst.MIN_INDEX_KEY
+            for api in api_collection_node.subnodes:
+                precision_index = min(precision_index,
+                                      api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
+                    if self.ma.compare_mode == GraphConst.MD5_COMPARE \
+                    else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
+            api_collection_node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+
         for node in self.graph_n.root.subnodes:
-            if node.op == NodeOp.api_collection:
-                precision_index = GraphConst.MAX_INDEX_KEY if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                    else GraphConst.MIN_INDEX_KEY
-                for api in node.subnodes:
-                    precision_index = min(precision_index,
-                                          api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MAX_INDEX_KEY)) \
-                        if self.ma.compare_mode == GraphConst.MD5_COMPARE \
-                        else max(precision_index, api.data.get(GraphConst.JSON_INDEX_KEY, GraphConst.MIN_INDEX_KEY))
-                node.data[GraphConst.JSON_INDEX_KEY] = precision_index
+            if node.op == NodeOp.api_collection and node.id.startswith(GraphConst.APIS_BETWEEN_MODULES_ALL_RANKS):
+                for sub_node in node.subnodes:
+                    handle_api_collection_index(sub_node)
+                handle_api_collection_index(node)
+            elif node.op == NodeOp.api_collection:
+                handle_api_collection_index(node)
 
     def _get_and_add_result(self, node_n, node_b):
-        compare_result_list = compare_node([node_n.id, node_b.id],
-                                           [self.data_n_dict, self.data_b_dict],
-                                           self.stack_json_data, self.ma.compare_mode)
+        compare_result_list = compare_node(node_n, node_b, self.ma.compare_mode)
         if compare_result_list:
             self.ma.add_csv_data(compare_result_list)
             self.add_compare_result_to_node(node_n, compare_result_list)
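For reference, a standalone sketch of the aggregation rule that handle_api_collection_index applies, with plain floats standing in for the BaseNode.data entries and 1.0/0.0 standing in for GraphConst.MAX_INDEX_KEY/MIN_INDEX_KEY (both stand-ins are assumptions for illustration):

    def aggregate_precision_index(api_indexes, md5_mode):
        # md5 mode: start from the best value (1.0) and keep the minimum, so any mismatch drags it down;
        # statistics/tensor modes: start from 0.0 and keep the maximum, so the worst metric wins.
        worst = 1.0 if md5_mode else 0.0
        for idx in api_indexes:
            worst = min(worst, idx) if md5_mode else max(worst, idx)
        return worst

    assert aggregate_precision_index([1.0, 0.0, 1.0], md5_mode=True) == 0.0
    assert aggregate_precision_index([0.2, 0.9, 0.5], md5_mode=False) == 0.9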
@@ -178,6 +186,8 @@ class GraphComparator:
             if sub_node.op == NodeOp.function_api:
                 # ignore the dump call count
                 count_removed_id = self.pattern.sub(Const.SEP, sub_node.id)
+                if self.rank_pattern.search(count_removed_id):
+                    count_removed_id = self.rank_pattern.sub('', count_removed_id)
                 node_count[count_removed_id] = node_count.get(count_removed_id, 0) + 1
                 # assign the call order within the module
                 recount_node_id = count_removed_id + str(node_count.get(count_removed_id))
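A quick illustration of what the new rank_pattern step adds to the recount: after the existing pattern strips the dump call count, the rank suffix is also removed, so the same api dumped on different ranks recounts under one id (the node id below is hypothetical):

    import re

    pattern = re.compile(r'\.\d+\.')        # same as self.pattern
    rank_pattern = re.compile(r"_rank\d+")  # same as self.rank_pattern

    node_id = 'Torch.matmul_rank3.2.forward'      # hypothetical api node id
    count_removed_id = pattern.sub('.', node_id)  # Const.SEP assumed to be '.' -> 'Torch.matmul_rank3.forward'
    count_removed_id = rank_pattern.sub('', count_removed_id)
    print(count_removed_id)                       # 'Torch.matmul.forward'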
msprobe/visualization/db_utils.py (new file)

@@ -0,0 +1,252 @@
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sqlite3
+import json
+import re
+from msprobe.core.common.log import logger
+from msprobe.core.common.file_utils import change_mode, check_path_before_create, FileChecker
+from msprobe.core.common.const import FileCheckConst
+from msprobe.visualization.utils import GraphConst
+from msprobe.visualization.builder.msprobe_adapter import format_node_data
+
+TEXT_PRIMARY_KEY = 'TEXT PRIMARY KEY'
+TEXT_NOT_NULL = 'TEXT NOT NULL'
+INTEGER_NOT_NULL = 'INTEGER NOT NULL'
+TEXT = 'TEXT'
+INTEGER = 'INTEGER'
+
+node_columns = {
+    'id': TEXT_PRIMARY_KEY,
+    'graph_id': TEXT_NOT_NULL,
+    'node_order': INTEGER_NOT_NULL,
+    'node_name': TEXT_NOT_NULL,
+    'node_type': TEXT_NOT_NULL,
+    'up_node': TEXT,
+    'sub_nodes': TEXT,
+    'precision_index': INTEGER,
+    'overflow_level': TEXT,
+    'micro_step_id': INTEGER_NOT_NULL,
+    'matched_node_link': TEXT,
+    'stack_id': TEXT,
+    'parallel_merge_info': TEXT,
+    'matched_distributed': TEXT,
+    'modified': INTEGER_NOT_NULL,
+    'input_data': TEXT,
+    'output_data': TEXT,
+    'data_source': TEXT,
+    'dump_data_dir': TEXT,
+    'step': INTEGER_NOT_NULL,
+    'rank': INTEGER_NOT_NULL
+}
+
+config_columns = {
+    'id': TEXT_PRIMARY_KEY,
+    'graph_type': TEXT_NOT_NULL,
+    'task': TEXT,
+    'tool_tip': TEXT,
+    'micro_steps': INTEGER,
+    'overflow_check': INTEGER,
+    'node_colors': TEXT_NOT_NULL,
+    'rank_list': TEXT_NOT_NULL,
+    'step_list': TEXT_NOT_NULL
+}
+
+stack_columns = {
+    'id': TEXT_PRIMARY_KEY,
+    'stack_info': TEXT
+}
+
+indexes = {
+    "index1": ["step", "rank", "data_source", "up_node", "node_order"],
+    "index2": ["step", "rank", "data_source", "node_name"],
+    "index3": ["step", "rank", "data_source", "node_order"],
+    "index4": ["step", "rank", "node_order"],
+    "index5": ["step", "rank", "micro_step_id", "node_order"],
+    "index6": ["step", "rank", "modified", "matched_node_link"]
+}
+
+SAFE_NAME_PATTERN = re.compile(r'^[a-zA-Z0-9_]+$')
+
+
+def is_safe_identifier(name):
+    """Check whether an identifier is safe (guards against SQL injection)"""
+    return isinstance(name, str) and SAFE_NAME_PATTERN.match(name) is not None
+
+
+def create_table_sql_from_dict(table_name, columns_dict):
+    """
+    Generate a CREATE TABLE SQL statement from the given table name and column-definition dict.
+    """
+    if not is_safe_identifier(table_name):
+        raise ValueError(f"Invalid table name: {table_name} - potential SQL injection risk!")
+
+    sql = f"CREATE TABLE IF NOT EXISTS {table_name} (\n"
+
+    column_definitions = []
+    for column_name, column_type in columns_dict.items():
+        if not is_safe_identifier(column_name):
+            raise ValueError(f"Invalid column name: {column_name} - potential SQL injection risk!")
+
+        column_definitions.append(f"    {column_name} {column_type}")
+
+    sql += ",\n".join(column_definitions)
+    sql += "\n);"
+
+    return sql
+
+
+def create_insert_sql_from_dict(table_name, columns_dict, ignore_insert=False):
+    """
+    Generate an INSERT INTO SQL statement from the given table name and data dict.
+    """
+    if not is_safe_identifier(table_name):
+        raise ValueError(f"Invalid table name: {table_name} - potential SQL injection risk!")
+
+    columns = list(columns_dict.keys())
+
+    for column_name in columns:
+        if not is_safe_identifier(column_name):
+            raise ValueError(f"Invalid column name: {column_name} - potential SQL injection risk!")
+
+    placeholders = ["?"] * len(columns)
+
+    columns_string = ", ".join(columns)
+    placeholders_string = ", ".join(placeholders)
+
+    sql_prefix = "INSERT OR IGNORE INTO" if ignore_insert else "INSERT INTO"
+    sql = f"{sql_prefix} {table_name} ({columns_string}) VALUES ({placeholders_string})"
+    return sql
+
+
+def to_db(db_path, create_table_sql, insert_sql, data, db_insert_size=1000):
+    if not os.path.exists(db_path):
+        check_path_before_create(db_path)
+    else:
+        FileChecker(db_path, FileCheckConst.FILE, FileCheckConst.READ_WRITE_ABLE,
+                    FileCheckConst.DB_SUFFIX).common_check()
+    try:
+        conn = sqlite3.connect(db_path)
+    except sqlite3.Error as e:
+        logger.error(f"Unable to create database connection: {e}")
+        raise RuntimeError("Unable to create database connection") from e
+
+    try:
+        cursor = conn.cursor()
+        cursor.execute(create_table_sql)
+        if len(data) == 1:
+            cursor.execute(insert_sql, data[0])
+            conn.commit()
+        else:
+            for i in range(0, len(data), db_insert_size):
+                batch = data[i:i + db_insert_size]
+                cursor.executemany(insert_sql, batch)
+                conn.commit()
+    except sqlite3.Error as e:
+        logger.error(f"An sqlite3 error occurred: {e}")
+        raise RuntimeError("An sqlite3 error occurred") from e
+    finally:
+        conn.close()
+
+
+def add_table_index(db_path):
+    FileChecker(db_path, FileCheckConst.FILE, FileCheckConst.READ_WRITE_ABLE, FileCheckConst.DB_SUFFIX).common_check()
+    try:
+        conn = sqlite3.connect(db_path)
+    except sqlite3.Error as e:
+        logger.error(f"Unable to create database connection: {e}")
+        raise RuntimeError("Unable to create database connection") from e
+
+    try:
+        cursor = conn.cursor()
+        for index_name, columns in indexes.items():
+            if not is_safe_identifier(index_name):
+                raise ValueError(f"Invalid index name: {index_name} - potential SQL injection risk!")
+
+            for column in columns:
+                if not is_safe_identifier(column):
+                    raise ValueError(f"Invalid column name in index: {column} - potential SQL injection risk!")
+
+            columns_str = ', '.join(columns)
+            index_sql = f'''
+                CREATE INDEX IF NOT EXISTS {index_name} ON tb_nodes ({columns_str});
+            '''
+            cursor.execute(index_sql)
+        conn.commit()
+    except sqlite3.Error as e:
+        logger.error(f"Failed to add table index: {e}")
+        raise RuntimeError("Failed to add table index") from e
+    finally:
+        conn.close()
+
+
+def post_process_db(db_path):
+    add_table_index(db_path)
+    change_mode(db_path, FileCheckConst.DATA_FILE_AUTHORITY)
+
+
+def node_to_db(graph, db_name):
+    create_table_sql = create_table_sql_from_dict('tb_nodes', node_columns)
+    insert_sql = create_insert_sql_from_dict('tb_nodes', node_columns)
+    data = []
+    stack_dict = {}
+    for i, node in enumerate(graph.get_sorted_nodes()):
+        stack_info_text = json.dumps(node.stack_info)
+        if stack_info_text not in stack_dict:
+            stack_dict[stack_info_text] = get_stack_unique_id(graph, stack_dict)
+        data.append((get_node_unique_id(graph, node), get_graph_unique_id(graph), i, node.id, node.op.value,
+                     node.upnode.id if node.upnode else '',
+                     json.dumps([node.id for node in node.subnodes]) if node.subnodes else '',
+                     node.data.get(GraphConst.JSON_INDEX_KEY), node.data.get(GraphConst.OVERFLOW_LEVEL),
+                     node.micro_step_id if node.micro_step_id is not None else 0, json.dumps(node.matched_node_link),
+                     stack_dict.get(stack_info_text),
+                     json.dumps(node.parallel_merge_info) if node.parallel_merge_info else '',
+                     json.dumps(node.matched_distributed), 0,
+                     json.dumps(format_node_data(node.input_data, node.id, graph.compare_mode)),
+                     json.dumps(format_node_data(node.output_data, node.id, graph.compare_mode)),
+                     graph.data_source, graph.data_path, graph.step, graph.rank))
+    to_db(db_name, create_table_sql, insert_sql, data)
+    stack_to_db(stack_dict, db_name)
+
+
+def config_to_db(config, db_name):
+    create_table_sql = create_table_sql_from_dict('tb_config', config_columns)
+    insert_sql = create_insert_sql_from_dict('tb_config', config_columns, ignore_insert=True)
+    data = [("1", "compare" if config.graph_b else "build", config.task, config.tool_tip, config.micro_steps,
+             config.overflow_check, json.dumps(config.node_colors), json.dumps(config.rank_list),
+             json.dumps(config.step_list))]
+    to_db(db_name, create_table_sql, insert_sql, data)
+
+
+def stack_to_db(stack_dict, db_name):
+    create_table_sql = create_table_sql_from_dict('tb_stack', stack_columns)
+    insert_sql = create_insert_sql_from_dict('tb_stack', stack_columns)
+    data = []
+    for stack_info_text, unique_id in stack_dict.items():
+        data.append((unique_id, stack_info_text))
+    to_db(db_name, create_table_sql, insert_sql, data)
+
+
+def get_graph_unique_id(graph):
+    return f'{graph.data_source}_{graph.step}_{graph.rank}'
+
+
+def get_node_unique_id(graph, node):
+    return f'{get_graph_unique_id(graph)}_{node.id}'
+
+
+def get_stack_unique_id(graph, stack_dict):
+    return f'{get_graph_unique_id(graph)}_{len(stack_dict)}'
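A minimal usage sketch of the helpers in this new file, wiring create_table_sql_from_dict, create_insert_sql_from_dict, and to_db together for the tb_stack table; the database path and row contents are illustrative only, and the module is assumed importable as msprobe.visualization.db_utils:

    import sqlite3
    from msprobe.visualization.db_utils import (create_table_sql_from_dict,
                                                create_insert_sql_from_dict, to_db, stack_columns)

    # Build DDL/DML from the column dicts defined above.
    create_sql = create_table_sql_from_dict('tb_stack', stack_columns)
    insert_sql = create_insert_sql_from_dict('tb_stack', stack_columns)

    # to_db creates the table if needed and inserts rows in batches of db_insert_size.
    rows = [
        ('NPU_0_0_0', '["File train.py, line 10"]'),  # illustrative stack rows
        ('NPU_0_0_1', '["File model.py, line 42"]'),
    ]
    to_db('./compare.db', create_sql, insert_sql, rows)

    with sqlite3.connect('./compare.db') as conn:
        print(conn.execute('SELECT COUNT(*) FROM tb_stack').fetchone())  # (2,)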
msprobe/visualization/graph/base_node.py

@@ -36,6 +36,8 @@ class BaseNode:
         self.overflow_level = None
         self.matched_distributed = {}
         self.batch_p2p_info = []
+        self.rank = 0
+        self.parallel_merge_info = []
 
     def __str__(self):
         info = f'id:\t{self.id}'
@@ -87,28 +89,6 @@ class BaseNode:
         self.matched_node_link = ancestors
         node.matched_node_link = ancestors
 
-    def to_dict(self, compare_mode=None):
-        """
-        Output data
-        """
-        result = {
-            'id': self.id,
-            'node_type': self.op.value,
-            'output_data': format_node_data(self.output_data, self.id, compare_mode),
-            'input_data': format_node_data(self.input_data, self.id, compare_mode),
-            'upnode': self.upnode.id if self.upnode else 'None',
-            'subnodes': [node.id for node in self.subnodes],
-            'matched_node_link': self.matched_node_link,
-            'suggestions': self.suggestions,
-            'stack_info': self.stack_info
-        }
-        if self.micro_step_id is not None:
-            result['micro_step_id'] = self.micro_step_id
-        result['data'] = self.data
-        if self.matched_distributed:
-            result[GraphConst.MATCHED_DISTRIBUTED] = self.matched_distributed
-        return result
-
     def get_ancestors(self):
         """
        Get the list of all ancestors of the node
msprobe/visualization/graph/distributed_analyzer.py

@@ -82,7 +82,7 @@ class DistributedAnalyzer:
         """
         target_rank = node.input_data.get(f'{node.id}{GraphConst.INPUT}{parameter}', {}).get('value')
         if target_rank is None:
-            logger.warning(f'The parameter {parameter} of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
+            logger.debug(f'The parameter {parameter} of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
         return target_rank
 
     @staticmethod
@@ -95,15 +95,15 @@ class DistributedAnalyzer:
         """
         group = node.input_data.get(f'{node.id}{GraphConst.INPUT}group', {})
         if not group:
-            logger.warning(f'The kwarg group of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
+            logger.debug(f'The kwarg group of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
             return None, None
         group_ranks = group.get('group_ranks')
         if not group_ranks:
-            logger.warning(f'The group_ranks of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
+            logger.debug(f'The group_ranks of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
             return None, None
         group_id = group.get('group_id')
         if not group_id:
-            logger.warning(f'The group_id of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
+            logger.debug(f'The group_id of node {node.id} does not exist, {CANNOT_MATCH}{rank}')
             return None, None
         return group_ranks, group_id
 
@@ -183,7 +183,7 @@ class DistributedAnalyzer:
             op = info_dict.get(GraphConst.OP)
             target_rank = info_dict.get(GraphConst.PEER)
             if op is None or target_rank is None:
-                logger.warning('Cannot get param op or peer.')
+                logger.debug('Cannot get param op or peer.')
                 continue
             group_id = op + Const.REPLACEMENT_CHARACTER + Const.RANK + str(target_rank) + \
                        Const.REPLACEMENT_CHARACTER + info_dict.get(GraphConst.GROUP_ID, '')
@@ -215,7 +215,7 @@ class DistributedAnalyzer:
         """
         target_graph = self.graphs.get(target_rank)
         if not target_graph:
-            logger.warning(f'Graph data does not exist, {CANNOT_MATCH}{target_rank}')
+            logger.debug(f'Graph data does not exist, {CANNOT_MATCH}{target_rank}')
             return None
         target_group_mapping = self.group_node_mapping.get(target_rank)
         # p2p communication: to obtain the target node, the rank and api name in unique_group_id need to be replaced,
@@ -226,7 +226,7 @@ class DistributedAnalyzer:
         target_node_id = target_group_mapping.get(target_unique_group_id, '')
         target_node = target_graph.node_map.get(target_node_id)
         if not target_node:
-            logger.warning(f'Node {target_node_id} does not exist, {CANNOT_MATCH}{target_rank}')
+            logger.debug(f'Node {target_node_id} does not exist, {CANNOT_MATCH}{target_rank}')
             return None
         return target_node
 
@@ -276,13 +276,13 @@ class DistributedAnalyzer:
         source_rank = (target_node.input_data.get(f'{target_node.id}{GraphConst.INPUT}{target_config_info[1]}', {})
                        .get('value'))
         if source_rank is None:
-            logger.warning(
+            logger.debug(
                 f'The kwarg {target_config_info[1]} of node {target_node.id} does not exist, '
                 f'{CANNOT_MATCH}{source_rank}')
             return
         if source_rank != rank:
             # point-to-point communication: the rank info carried by the target node to be matched must equal the current rank
-            logger.warning(
+            logger.debug(
                 f'{node.id} of rank{rank} is expected to communicate with {target_node.id} of rank{target_rank}, '
                 f'but the data shows that {target_node.id} communicates with rank{source_rank}.'
                 f'The rank is inconsistent, cannot match distributed node')
@@ -291,7 +291,7 @@ class DistributedAnalyzer:
         # point-to-point communication: the output data of the two matched nodes must be consistent
         if not DistributedAnalyzer._node_output_all_equal(node.output_data.get(node.id + '.output.0'),
                                                           target_node.output_data.get(target_node.id + '.output.0')):
-            logger.warning(f'{node.id} output of rank{rank} is different from the {target_node.id} '
+            logger.debug(f'{node.id} output of rank{rank} is different from the {target_node.id} '
                            f'output of rank{target_rank}, cannot match distributed node')
             return
 
@@ -332,7 +332,7 @@ class DistributedAnalyzer:
             if not target_group_id:
                 continue
             if group_id != target_group_id:
-                logger.warning(
+                logger.debug(
                     f'{node.id} of rank{rank} is expected to communicate with {target_node.id} of rank{target_rank}'
                     f', but the data shows that the group id of the two nodes are different, '
                     f'cannot match distributed node')
@@ -368,7 +368,7 @@ class DistributedAnalyzer:
                 target_api_name = self.config.get(api_name)[0]
                 target_rank = int(id_info[1].replace(Const.RANK, ''))
             except Exception as e:
-                logger.warning(f'Failed to parse batch p2p parameter with error info: {e}.')
+                logger.debug(f'Failed to parse batch p2p parameter with error info: {e}.')
                 continue
             target_node = self._get_target_node(rank, unique_group_id, api_name, target_rank, target_api_name)
             if not target_node:
msprobe/visualization/graph/graph.py

@@ -18,16 +18,22 @@ from msprobe.visualization.graph.node_op import NodeOp
 from msprobe.visualization.utils import GraphConst
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import Const
+from msprobe.core.common.decorator import recursion_depth_decorator
 
 
 class Graph:
-    def __init__(self, model_name, data_path='', dump_data=None):
+    def __init__(self, model_name, data_path='', dump_data=None, micro_step_num=None):
         self.node_map = {}
         self.node_id_map = {}
         self.add_node(NodeOp.module, model_name)
         self.root = self.get_node(model_name)
         self.data_path = data_path
         self.dump_data = dump_data
+        self.data_source = GraphConst.JSON_NPU_KEY
+        self.step = 0
+        self.rank = 0
+        self.compare_mode = GraphConst.SUMMARY_COMPARE
+        self.micro_step_num = micro_step_num
 
     def __str__(self):
         infos = [f'{str(self.node_map.get(node_id))}' for node_id in self.node_map]
@@ -65,8 +71,10 @@ class Graph:
         return node_b, ancestors_n, ancestors_b
 
     @staticmethod
-    def fuzzy_match(node_n, node_b):
-        if not node_n or not node_b or not node_n.fuzzy_eq(node_b):
+    def fuzzy_match(node_n, node_b, check_shape=True):
+        if not node_n or not node_b:
+            return None, [], []
+        if check_shape and not node_n.fuzzy_eq(node_b):
             return None, [], []
         ancestors_n = node_n.get_ancestors()
         ancestors_b = node_b.get_ancestors()
@@ -116,6 +124,25 @@ class Graph:
                 result[micro_step].append(node)
         return result
 
+    def get_sorted_nodes(self):
+        """
+        Obtain a sorted list of nodes via a depth-first traversal of the graph
+        """
+        visited = set()
+        order = []
+
+        @recursion_depth_decorator('msprobe.visualization.graph.graph.Graph.get_nodes_order.visit', max_depth=500)
+        def visit(node):
+            if node.id in visited:
+                return
+            visited.add(node.id)
+            for sub_node in node.subnodes:
+                visit(sub_node)
+            order.append(node)
+
+        visit(self.root)
+        return order
+
     def add_node(self, node_op, node_id, up_node=None, id_accumulation=False):
         """
        Add a node to the graph
@@ -146,19 +173,6 @@ class Graph:
         """
         return self.node_map.get(node_id, None)
 
-    def to_dict(self, compare_mode=None):
-        """
-        Used for data output
-        """
-        result = {}
-        result[GraphConst.JSON_ROOT_KEY] = self.root.id if self.root else 'None'
-        result[GraphConst.JSON_DATA_KEY] = self.data_path
-        result[GraphConst.JSON_NODE_KEY] = {}
-        for node_id in self.node_map:
-            info = self.node_map.get(node_id).to_dict(compare_mode)
-            result[GraphConst.JSON_NODE_KEY][node_id] = info
-        return result
-
     def paging_by_micro_step(self, graph_other=None):
         """
        Add micro step tags to the graph's top-level nodes for paginated display in the frontend, which helps optimize and manage large-scale graph data
@@ -168,6 +182,18 @@ class Graph:
        graph_other: optional parameter, another graph
        Returns: the number of batches
         """
+
+        @recursion_depth_decorator(
+            'msprobe.visualization.graph.graph.Graph.paging_by_micro_step.propagate_micro_step_id', max_depth=500)
+        def propagate_micro_step_id(node):
+            if node.upnode is not None and node.micro_step_id is None:
+                node.micro_step_id = node.upnode.micro_step_id
+            for sub_node in node.subnodes:
+                propagate_micro_step_id(sub_node)
+
+        if self.micro_step_num is not None:
+            return self.micro_step_num + 1
+
         batches_n = Graph.split_nodes_by_micro_step(self.root.subnodes)
         for batch_number, nodes in batches_n.items():
             for node in nodes:
@@ -177,6 +203,7 @@ class Graph:
                 node_other = graph_other.get_node(node.matched_node_link[-1])
                 if node_other:
                     node_other.micro_step_id = batch_number
+        propagate_micro_step_id(self.root)
         # walk all child nodes under graph_other's root to ensure unmatched nodes also get a micro_step_id
         if graph_other:
             for node in graph_other.root.subnodes:
@@ -186,6 +213,7 @@ class Graph:
                 except ValueError:
                     micro_step_id = 0
                 node.micro_step_id = micro_step_id
+        propagate_micro_step_id(graph_other.root)
         return len(batches_n)
 
     def overflow_check(self):
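The visit helper in get_sorted_nodes appends a node only after all of its subnodes, so the returned list is in post-order (children before parents), and the visited set guards against revisiting shared nodes; node_to_db then uses this ordering for the node_order column. A minimal sketch with a hypothetical stand-in node class:

    class FakeNode:  # stand-in exposing just the fields visit() touches
        def __init__(self, node_id, subnodes=()):
            self.id = node_id
            self.subnodes = list(subnodes)

    root = FakeNode('root', [FakeNode('a'), FakeNode('b')])
    visited, order = set(), []

    def visit(node):  # same traversal as Graph.get_sorted_nodes, minus the depth guard
        if node.id in visited:
            return
        visited.add(node.id)
        for sub_node in node.subnodes:
            visit(sub_node)
        order.append(node)

    visit(root)
    print([n.id for n in order])  # ['a', 'b', 'root']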