PyPI - mindstudio-probe - Versions diffs - 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.3.0-py3-none-any.whl → 8.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213) hide show

{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
msprobe/README.md +32 -1
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +120 -21
msprobe/core/common/exceptions.py +2 -2
msprobe/core/common/file_utils.py +279 -50
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +136 -45
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +646 -428
msprobe/core/compare/check.py +36 -103
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +215 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
msprobe/core/compare/merge_result/merge_result.py +4 -4
msprobe/core/compare/multiprocessing_compute.py +223 -110
msprobe/core/compare/npy_compare.py +2 -4
msprobe/core/compare/utils.py +214 -244
msprobe/core/config_check/__init__.py +17 -0
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +67 -4
msprobe/core/data_dump/data_collector.py +170 -89
msprobe/core/data_dump/data_processor/base.py +72 -51
msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
msprobe/core/data_dump/json_writer.py +143 -27
msprobe/core/debugger/precision_debugger.py +144 -0
msprobe/core/grad_probe/constant.py +1 -1
msprobe/core/grad_probe/grad_compare.py +1 -1
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/service.py +357 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +146 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +79 -22
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +118 -49
msprobe/docs/06.data_dump_MindSpore.md +167 -20
msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +2 -2
msprobe/docs/14.data_parse_PyTorch.md +3 -3
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +2 -2
msprobe/docs/19.monitor.md +90 -44
msprobe/docs/21.visualization_PyTorch.md +68 -15
msprobe/docs/22.visualization_MindSpore.md +71 -18
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +1 -1
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/29.data_dump_MSAdapter.md +2 -2
msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +181 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/mindspore/__init__.py +1 -2
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +17 -7
msprobe/mindspore/common/utils.py +128 -11
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +17 -405
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +53 -3
msprobe/mindspore/debugger/precision_debugger.py +72 -91
msprobe/mindspore/dump/cell_dump_process.py +877 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
msprobe/mindspore/dump/dump_tool_factory.py +13 -5
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
msprobe/mindspore/dump/jit_dump.py +21 -18
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/grad_probe/global_context.py +7 -2
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/mindspore_service.py +114 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/features.py +20 -7
msprobe/mindspore/monitor/module_hook.py +281 -209
msprobe/mindspore/monitor/optimizer_collect.py +334 -0
msprobe/mindspore/monitor/utils.py +25 -5
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +26 -14
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +66 -118
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +29 -5
msprobe/pytorch/hook_module/hook_module.py +9 -18
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +6 -2
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/module_hook.py +227 -158
msprobe/pytorch/monitor/module_metric.py +14 -0
msprobe/pytorch/monitor/optimizer_collect.py +242 -270
msprobe/pytorch/monitor/utils.py +16 -3
msprobe/pytorch/online_dispatch/dispatch.py +4 -2
msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
msprobe/pytorch/parse_tool/lib/utils.py +3 -3
msprobe/pytorch/pt_config.py +8 -7
msprobe/pytorch/pytorch_service.py +73 -0
msprobe/visualization/builder/graph_builder.py +33 -13
msprobe/visualization/builder/msprobe_adapter.py +24 -11
msprobe/visualization/compare/graph_comparator.py +53 -45
msprobe/visualization/compare/mode_adapter.py +31 -1
msprobe/visualization/graph/base_node.py +3 -3
msprobe/visualization/graph/graph.py +2 -2
msprobe/visualization/graph_service.py +250 -103
msprobe/visualization/utils.py +27 -11
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -549
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -473
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/visualization/graph/graph.py CHANGED Viewed

@@ -146,7 +146,7 @@ class Graph:
         """
         return self.node_map.get(node_id, None)
-    def to_dict(self):
+    def to_dict(self, compare_mode=None):
         """
         用于数据输出
         """
@@ -155,7 +155,7 @@ class Graph:
         result[GraphConst.JSON_DATA_KEY] = self.data_path
         result[GraphConst.JSON_NODE_KEY] = {}
         for node_id in self.node_map:
-            info = self.node_map.get(node_id).to_dict()
+            info = self.node_map.get(node_id).to_dict(compare_mode)
             result[GraphConst.JSON_NODE_KEY][node_id] = info
         return result

msprobe/visualization/graph_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -15,83 +15,93 @@
 import os
 import time
-import json
+from copy import deepcopy
+from multiprocessing import cpu_count, Pool
 from msprobe.core.common.file_utils import (check_file_type, create_directory, FileChecker,
                                             check_file_or_directory_path, load_json)
 from msprobe.core.common.const import FileCheckConst, Const
-from msprobe.core.common.utils import CompareException
-from msprobe.core.overflow_check.checker import AnomalyDetector
+from msprobe.core.common.utils import CompareException, get_dump_mode
 from msprobe.visualization.compare.graph_comparator import GraphComparator
-from msprobe.visualization.utils import GraphConst, check_directory_content
-from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig
+from msprobe.visualization.utils import GraphConst, check_directory_content, SerializableArgs
+from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig, GraphInfo, BuildGraphTaskInfo
 from msprobe.core.common.log import logger
 from msprobe.visualization.graph.node_colors import NodeColors
 from msprobe.core.compare.layer_mapping import generate_api_mapping_by_layer_mapping
 from msprobe.core.compare.utils import check_and_return_dir_contents
+from msprobe.core.common.utils import detect_framework_by_dump_json
 from msprobe.visualization.graph.distributed_analyzer import DistributedAnalyzer
 current_time = time.strftime("%Y%m%d%H%M%S")
-def _compare_graph(input_param, args):
-    logger.info('Start building model graphs...')
-    # 对两个数据进行构图
-    dump_path_n = input_param.get('npu_path')
-    dump_path_b = input_param.get('bench_path')
-    construct_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE),
-                                   FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
-    construct_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE),
-                                   FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
-    data_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.DUMP_FILE), FileCheckConst.FILE,
-                              FileCheckConst.READ_ABLE).common_check()
-    data_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.DUMP_FILE), FileCheckConst.FILE,
-                              FileCheckConst.READ_ABLE).common_check()
-    stack_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.STACK_FILE), FileCheckConst.FILE,
-                               FileCheckConst.READ_ABLE).common_check()
-    stack_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.STACK_FILE), FileCheckConst.FILE,
-                               FileCheckConst.READ_ABLE).common_check()
-    graph_n = GraphBuilder.build(construct_path_n, data_path_n, stack_path_n, complete_stack=args.complete_stack)
-    graph_b = GraphBuilder.build(construct_path_b, data_path_b, stack_path_b, complete_stack=args.complete_stack)
-    logger.info('Model graphs built successfully, start Comparing graphs...')
-    # 基于graph、stack和data进行比较
+def _compare_graph(graph_n: GraphInfo, graph_b: GraphInfo, input_param, args):
     dump_path_param = {
-        'npu_json_path': data_path_n,
-        'bench_json_path': data_path_b,
-        'stack_json_path': stack_path_n,
+        'npu_json_path': graph_n.data_path,
+        'bench_json_path': graph_b.data_path,
+        'stack_json_path': graph_n.stack_path,
         'is_print_compare_log': input_param.get("is_print_compare_log", True)
     }
-    mapping_dict = None
+    mapping_dict = {}
     if args.layer_mapping:
-        yaml_path = FileChecker(args.layer_mapping, FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
         try:
-            mapping_dict = generate_api_mapping_by_layer_mapping(data_path_n, data_path_b, yaml_path)
+            mapping_dict = generate_api_mapping_by_layer_mapping(graph_n.data_path, graph_b.data_path,
+                                                                 args.layer_mapping)
         except Exception:
             logger.warning('The layer mapping file parsing failed, please check file format, mapping is not effective.')
-    graph_comparator = GraphComparator([graph_n, graph_b], dump_path_param, args, mapping_dict=mapping_dict)
+    is_cross_framework = detect_framework_by_dump_json(graph_n.data_path) != \
+                         detect_framework_by_dump_json(graph_b.data_path)
+    if is_cross_framework and not args.layer_mapping:
+        logger.error('The cross_frame graph comparison failed. '
+                     'Please specify -lm or --layer_mapping when performing cross_frame graph comparison.')
+        raise CompareException(CompareException.CROSS_FRAME_ERROR)
+    graph_comparator = GraphComparator([graph_n.graph, graph_b.graph], dump_path_param, args, is_cross_framework,
+                                       mapping_dict=mapping_dict)
     graph_comparator.compare()
-    micro_steps = graph_n.paging_by_micro_step(graph_b)
+    return graph_comparator
+def _compare_graph_result(input_param, args):
+    logger.info('Start building model graphs...')
+    # 对两个数据进行构图
+    graph_n = _build_graph_info(input_param.get('npu_path'), args)
+    graph_b = _build_graph_info(input_param.get('bench_path'), args)
+    logger.info('Model graphs built successfully, start Comparing graphs...')
+    # 基于graph、stack和data进行比较
+    graph_comparator = _compare_graph(graph_n, graph_b, input_param, args)
+    # 增加micro step标记
+    micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph)
     # 开启溢出检测
     if args.overflow_check:
-        graph_n.overflow_check()
-        graph_b.overflow_check()
+        graph_n.graph.overflow_check()
+        graph_b.graph.overflow_check()
-    return CompareGraphResult(graph_n, graph_b, graph_comparator, micro_steps)
+    return CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps)
-def _export_compare_graph_result(args, graphs, graph_comparator, micro_steps,
-                                 output_file_name=f'compare_{current_time}.vis'):
-    create_directory(args.output_path)
+def _export_compare_graph_result(args, result):
+    graphs = [result.graph_n, result.graph_b]
+    graph_comparator = result.graph_comparator
+    micro_steps = result.micro_steps
+    output_file_name = result.output_file_name
+    if not output_file_name:
+        output_file_name = f'compare_{current_time}.vis'
+    logger.info(f'Start exporting compare graph result, file name: {output_file_name}...')
     output_path = os.path.join(args.output_path, output_file_name)
     task = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(graph_comparator.ma.compare_mode)
     export_config = GraphExportConfig(graphs[0], graphs[1], graph_comparator.ma.get_tool_tip(),
                                       NodeColors.get_node_colors(graph_comparator.ma.compare_mode), micro_steps, task,
-                                      args.overflow_check)
-    GraphBuilder.to_json(output_path, export_config)
-    logger.info(f'Model graphs compared successfully, the result file is saved in {output_path}')
+                                      args.overflow_check, graph_comparator.ma.compare_mode)
+    try:
+        GraphBuilder.to_json(output_path, export_config)
+        logger.info(f'Exporting compare graph result successfully, the result file is saved in {output_path}')
+        return ''
+    except RuntimeError as e:
+        logger.error(f'Failed to export compare graph result, file: {output_file_name}, error: {e}')
+        return output_file_name
-def _build_graph(dump_path, args):
-    logger.info('Start building model graph...')
+def _build_graph_info(dump_path, args):
     construct_path = FileChecker(os.path.join(dump_path, GraphConst.CONSTRUCT_FILE), FileCheckConst.FILE,
                                  FileCheckConst.READ_ABLE).common_check()
     data_path = FileChecker(os.path.join(dump_path, GraphConst.DUMP_FILE), FileCheckConst.FILE,
@@ -99,6 +109,13 @@ def _build_graph(dump_path, args):
     stack_path = FileChecker(os.path.join(dump_path, GraphConst.STACK_FILE), FileCheckConst.FILE,
                              FileCheckConst.READ_ABLE).common_check()
     graph = GraphBuilder.build(construct_path, data_path, stack_path, complete_stack=args.complete_stack)
+    return GraphInfo(graph, construct_path, data_path, stack_path)
+def _build_graph_result(dump_path, args):
+    logger.info('Start building model graphs...')
+    graph = _build_graph_info(dump_path, args).graph
+    # 增加micro step标记
     micro_steps = graph.paging_by_micro_step()
     # 开启溢出检测
     if args.overflow_check:
@@ -106,15 +123,128 @@ def _build_graph(dump_path, args):
     return BuildGraphResult(graph, micro_steps)
-def _export_build_graph_result(out_path, graph, micro_steps, overflow_check,
-                               output_file_name=f'build_{current_time}.vis'):
-    create_directory(out_path)
+def _run_build_graph_compare(input_param, args, nr, br):
+    logger.info(f'Start building graph for {nr}...')
+    graph_n = _build_graph_info(input_param.get('npu_path'), args)
+    graph_b = _build_graph_info(input_param.get('bench_path'), args)
+    logger.info(f'Building graph for {nr} finished.')
+    return BuildGraphTaskInfo(graph_n, graph_b, nr, br, current_time)
+def _run_build_graph_single(dump_ranks_path, rank, step, args):
+    logger.info(f'Start building graph for {rank}...')
+    dump_path = os.path.join(dump_ranks_path, rank)
+    output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis'
+    result = _build_graph_result(dump_path, args)
+    result.output_file_name = output_file_name
+    if rank != Const.RANK:
+        try:
+            result.rank = int(rank.replace(Const.RANK, ""))
+        except Exception as e:
+            logger.error('The folder name format is incorrect, expected rank+number.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR) from e
+    logger.info(f'Building graph for step: {step}, rank: {rank} finished.')
+    return result
+def _run_graph_compare(graph_task_info, input_param, args, output_file_name):
+    logger.info(f'Start comparing data for {graph_task_info.npu_rank}...')
+    graph_n = graph_task_info.graph_info_n
+    graph_b = graph_task_info.graph_info_b
+    nr = graph_task_info.npu_rank
+    graph_comparator = _compare_graph(graph_n, graph_b, input_param, args)
+    micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph)
+    # 开启溢出检测
+    if args.overflow_check:
+        graph_n.graph.overflow_check()
+        graph_b.graph.overflow_check()
+    graph_result = CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps)
+    graph_result.output_file_name = output_file_name
+    if nr != Const.RANK:
+        try:
+            graph_result.rank = int(nr.replace(Const.RANK, ""))
+        except Exception as e:
+            logger.error('The folder name format is incorrect, expected rank+number.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR) from e
+    logger.info(f'Comparing data for {graph_task_info.npu_rank} finished.')
+    return graph_result
+def _export_build_graph_result(args, result):
+    out_path = args.output_path
+    graph = result.graph
+    micro_steps = result.micro_steps
+    overflow_check = args.overflow_check
+    output_file_name = result.output_file_name
+    if not output_file_name:
+        output_file_name = f'build_{current_time}.vis'
+    logger.info(f'Start exporting graph for {output_file_name}...')
     output_path = os.path.join(out_path, output_file_name)
-    GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, overflow_check=overflow_check))
-    logger.info(f'Model graph built successfully, the result file is saved in {output_path}')
+    try:
+        GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps,
+                                                            overflow_check=overflow_check))
+        logger.info(f'Model graph exported successfully, the result file is saved in {output_path}')
+        return None
+    except RuntimeError as e:
+        logger.error(f'Failed to export model graph, file: {output_file_name}, error: {e}')
+        return output_file_name
+def is_real_data_compare(input_param, npu_ranks, bench_ranks):
+    dump_rank_n = input_param.get('npu_path')
+    dump_rank_b = input_param.get('bench_path')
+    has_real_data = False
+    for nr, br in zip(npu_ranks, bench_ranks):
+        dump_path_param = {
+            'npu_json_path': FileChecker(os.path.join(dump_rank_n, nr, GraphConst.DUMP_FILE), FileCheckConst.FILE,
+                                         FileCheckConst.READ_ABLE).common_check(),
+            'bench_json_path': FileChecker(os.path.join(dump_rank_b, br, GraphConst.DUMP_FILE), FileCheckConst.FILE,
+                                           FileCheckConst.READ_ABLE).common_check()
+        }
+        has_real_data |= get_dump_mode(dump_path_param) == Const.ALL
+    return has_real_data
+def _mp_compare(input_param, serializable_args, output_file_name, nr, br):
+    graph_task_info = _run_build_graph_compare(input_param, serializable_args, nr, br)
+    return _run_graph_compare(graph_task_info, input_param, serializable_args, output_file_name)
 def _compare_graph_ranks(input_param, args, step=None):
+    with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool:
+        def err_call(err):
+            logger.error(f'Error occurred while comparing graph ranks: {err}')
+            try:
+                pool.close()
+            except OSError as e:
+                logger.error(f'Error occurred while terminating the pool: {e}')
+        serializable_args = SerializableArgs(args)
+        # 暂存所有rank的graph，用于匹配rank间的分布式节点
+        compare_graph_results = _get_compare_graph_results(input_param, serializable_args, step, pool, err_call)
+        # 匹配rank间的分布式节点
+        if len(compare_graph_results) > 1:
+            DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results},
+                                args.overflow_check).distributed_match()
+            DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results},
+                                args.overflow_check).distributed_match()
+        export_res_task_list = []
+        create_directory(args.output_path)
+        for result in compare_graph_results:
+            export_res_task_list.append(pool.apply_async(_export_compare_graph_result,
+                                                         args=(serializable_args, result),
+                                                         error_callback=err_call))
+        export_res_list = [res.get() for res in export_res_task_list]
+        if any(export_res_list):
+            failed_names = list(filter(lambda x: x, export_res_list))
+            logger.error(f'Unable to export compare graph results: {", ".join(failed_names)}.')
+        else:
+            logger.info('Successfully exported compare graph results.')
+def _get_compare_graph_results(input_param, serializable_args, step, pool, err_call):
     dump_rank_n = input_param.get('npu_path')
     dump_rank_b = input_param.get('bench_path')
     npu_ranks = sorted(check_and_return_dir_contents(dump_rank_n, Const.RANK))
@@ -123,32 +253,33 @@ def _compare_graph_ranks(input_param, args, step=None):
         logger.error('The number of ranks in the two runs are different. Unable to match the ranks.')
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     compare_graph_results = []
-    for nr, br in zip(npu_ranks, bench_ranks):
-        logger.info(f'Start processing data for {nr}...')
-        input_param['npu_path'] = os.path.join(dump_rank_n, nr)
-        input_param['bench_path'] = os.path.join(dump_rank_b, br)
-        output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
-        result = _compare_graph(input_param, args)
-        result.output_file_name = output_file_name
-        if nr != Const.RANK:
-            try:
-                result.rank = int(nr.replace(Const.RANK, ""))
-            except Exception as e:
-                logger.error('The folder name format is incorrect, expected rank+number.')
-                raise CompareException(CompareException.INVALID_PATH_ERROR) from e
-        # 暂存所有rank的graph，用于匹配rank间的分布式节点
-        compare_graph_results.append(result)
-    # 匹配rank间的分布式节点
-    if len(compare_graph_results) > 1:
-        DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results},
-                            args.overflow_check).distributed_match()
-        DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results},
-                            args.overflow_check).distributed_match()
-    for result in compare_graph_results:
-        _export_compare_graph_result(args, [result.graph_n, result.graph_b], result.graph_comparator,
-                                     result.micro_steps, output_file_name=result.output_file_name)
+    if is_real_data_compare(input_param, npu_ranks, bench_ranks):
+        mp_task_dict = {}
+        for nr, br in zip(npu_ranks, bench_ranks):
+            input_param['npu_path'] = os.path.join(dump_rank_n, nr)
+            input_param['bench_path'] = os.path.join(dump_rank_b, br)
+            output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
+            input_param_copy = deepcopy(input_param)
+            mp_task_dict[output_file_name] = pool.apply_async(_run_build_graph_compare,
+                                                              args=(input_param_copy, serializable_args, nr, br),
+                                                              error_callback=err_call)
+        mp_res_dict = {k: v.get() for k, v in mp_task_dict.items()}
+        for output_file_name, mp_res in mp_res_dict.items():
+            compare_graph_results.append(_run_graph_compare(mp_res, input_param, serializable_args, output_file_name))
+    else:
+        compare_graph_tasks = []
+        for nr, br in zip(npu_ranks, bench_ranks):
+            input_param['npu_path'] = os.path.join(dump_rank_n, nr)
+            input_param['bench_path'] = os.path.join(dump_rank_b, br)
+            output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
+            input_param_copy = deepcopy(input_param)
+            compare_graph_tasks.append(pool.apply_async(_mp_compare,
+                                                        args=(input_param_copy, serializable_args, output_file_name, nr,
+                                                              br),
+                                                        error_callback=err_call))
+        compare_graph_results = [task.get() for task in compare_graph_tasks]
+    return compare_graph_results
 def _compare_graph_steps(input_param, args):
@@ -172,28 +303,39 @@ def _compare_graph_steps(input_param, args):
 def _build_graph_ranks(dump_ranks_path, args, step=None):
     ranks = sorted(check_and_return_dir_contents(dump_ranks_path, Const.RANK))
-    build_graph_results = []
-    for rank in ranks:
-        logger.info(f'Start processing data for {rank}...')
-        dump_path = os.path.join(dump_ranks_path, rank)
-        output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis'
-        result = _build_graph(dump_path, args)
-        result.output_file_name = output_file_name
-        if rank != Const.RANK:
+    serializable_args = SerializableArgs(args)
+    with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool:
+        def err_call(err):
+            logger.error(f'Error occurred while comparing graph ranks: {err}')
             try:
-                result.rank = int(rank.replace(Const.RANK, ""))
-            except Exception as e:
-                logger.error('The folder name format is incorrect, expected rank+number.')
-                raise CompareException(CompareException.INVALID_PATH_ERROR) from e
-        build_graph_results.append(result)
-    if len(build_graph_results) > 1:
-        DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results},
-                            args.overflow_check).distributed_match()
+                pool.close()
+            except OSError as e:
+                logger.error(f'Error occurred while terminating the pool: {e}')
+        build_graph_tasks = []
+        for rank in ranks:
+            build_graph_tasks.append(pool.apply_async(_run_build_graph_single,
+                                                      args=(dump_ranks_path, rank, step, serializable_args),
+                                                      error_callback=err_call))
+        build_graph_results = [task.get() for task in build_graph_tasks]
+        if len(build_graph_results) > 1:
+            DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results},
+                                args.overflow_check).distributed_match()
+        create_directory(args.output_path)
+        export_build_graph_tasks = []
+        for result in build_graph_results:
+            export_build_graph_tasks.append(pool.apply_async(_export_build_graph_result,
+                                                             args=(serializable_args, result),
+                                                             error_callback=err_call))
+        export_build_graph_result = [task.get() for task in export_build_graph_tasks]
+        if any(export_build_graph_result):
+            failed_names = list(filter(lambda x: x, export_build_graph_result))
+            logger.error(f'Unable to export build graph results: {failed_names}.')
+        else:
+            logger.info(f'Successfully exported build graph results.')
-    for result in build_graph_results:
-        _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check,
-                                   result.output_file_name)
 def _build_graph_steps(dump_steps_path, args):
@@ -209,7 +351,7 @@ def _graph_service_parser(parser):
                         help="<Required> The compare input path, a dict json.", required=True)
     parser.add_argument("-o", "--output_path", dest="output_path", type=str,
                         help="<Required> The compare task result out path.", required=True)
-    parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str,
+    parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str, nargs='?', const=True,
                         help="<Optional> The layer mapping file path.", required=False)
     parser.add_argument("-oc", "--overflow_check", dest="overflow_check", action="store_true",
                         help="<Optional> whether open overflow_check for graph.", required=False)
@@ -233,8 +375,11 @@ def _graph_service_command(args):
         elif content == GraphConst.STEPS:
             _build_graph_steps(npu_path, args)
         else:
-            result = _build_graph(npu_path, args)
-            _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check)
+            result = _build_graph_result(npu_path, args)
+            create_directory(args.output_path)
+            file_name = _export_build_graph_result(args, result)
+            if file_name:
+                logger.error('Failed to export model build graph.')
     elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR:
         content_n = check_directory_content(npu_path)
         content_b = check_directory_content(bench_path)
@@ -245,9 +390,11 @@ def _graph_service_command(args):
         elif content_n == GraphConst.STEPS:
             _compare_graph_steps(input_param, args)
         else:
-            result = _compare_graph(input_param, args)
-            _export_compare_graph_result(args, [result.graph_n, result.graph_b],
-                                         result.graph_comparator, result.micro_steps)
+            result = _compare_graph_result(input_param, args)
+            create_directory(args.output_path)
+            file_name = _export_compare_graph_result(args, result)
+            if file_name:
+                logger.error('Failed to export model compare graph.')
     else:
         logger.error("The npu_path or bench_path should be a folder.")
         raise CompareException(CompareException.INVALID_COMPARE_MODE)

msprobe/visualization/utils.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -16,9 +16,10 @@
 import os
 import re
 import json
+import pickle
 from msprobe.core.common.file_utils import FileOpen
 from msprobe.core.common.const import CompareConst, Const
-from msprobe.core.compare.acc_compare import Comparator, ModeConfig
+from msprobe.core.common.log import logger
 def load_json_file(file_path):
@@ -42,15 +43,6 @@ def load_data_json_file(file_path):
     return load_json_file(file_path).get(GraphConst.DATA_KEY, {})
-def get_csv_df(stack_mode, csv_data, compare_mode):
-    """
-    调用acc接口写入csv
-    """
-    dump_mode = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(compare_mode)
-    mode_config = ModeConfig(stack_mode=stack_mode, dump_mode=dump_mode)
-    return Comparator(mode_config).make_result_table(csv_data)
 def str2float(percentage_str):
     """
     百分比字符串转换转换为浮点型
@@ -192,3 +184,27 @@ class GraphConst:
     OP = 'op'
     PEER = 'peer'
     GROUP_ID = 'group_id'
+    IGNORE_PRECISION_INDEX = {'empty', 'empty_like', 'empty_with_format', 'new_empty_strided', 'new_empty',
+                              'empty_strided'}
+def is_serializable(obj):
+    """
+    Check if an object is serializable
+    """
+    try:
+        pickle.dumps(obj)
+        return True
+    except (pickle.PicklingError, AttributeError, TypeError):
+        return False
+    except Exception as e:
+        logger.error('Unexpected error occurred while pickling obj.')
+        raise RuntimeError('Unexpected error occurred while pickling obj.') from e
+class SerializableArgs:
+    def __init__(self, args):
+        for k, v in vars(args).items():
+            if is_serializable(v):
+                setattr(self, k, v)

msprobe/mindspore/dym_loader/hook_dynamic_loader.cc DELETED Viewed

@@ -1,106 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "hook_dynamic_loader.h"
-#include <sys/stat.h>
-#include <cstdlib>
-#include <cstring>
-#include <pybind11/embed.h>
-#include "utils/log_adapter.h"
-namespace py = pybind11;
-HookDynamicLoader &HookDynamicLoader::GetInstance() {
-  static HookDynamicLoader instance;
-  return instance;
-}
-bool HookDynamicLoader::loadFunction(void *handle, const std::string &functionName) {
-  void *func = dlsym(handle, functionName.c_str());
-  if (!func) {
-    MS_LOG(WARNING) << "Could not load function: " << functionName << ", error: " << dlerror();
-    return false;
-  }
-  funcMap_[functionName] = func;
-  return true;
-}
-bool HookDynamicLoader::LoadLibrary() {
-  std::string msprobePath = "";
-  // 获取gil锁
-  py::gil_scoped_acquire acquire;
-  try {
-    py::module msprobeMod = py::module::import("msprobe.lib._msprobe_c");
-		if (!py::hasattr(msprobeMod, "__file__")) {
-			MS_LOG(WARNING) << "Adump mod not found";
-			return false;
-		}
-		msprobePath = msprobeMod.attr("__file__").cast<std::string>();
-  } catch (const std::exception& e) {
-		MS_LOG(WARNING) << "Adump mod path unable to get: " << e.what();
-		return false;
-	}
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (handle_) {
-    MS_LOG(WARNING) << "Hook library already loaded!";
-    return false;
-  }
-	if (msprobePath == "") {
-		MS_LOG(WARNING) << "Adump path not loaded";
-		return false;
-	}
-  handle_ = dlopen(msprobePath.c_str(), RTLD_LAZY | RTLD_LOCAL);
-  if (!handle_) {
-    MS_LOG(WARNING) << "Failed to load Hook library: " << dlerror();
-    return false;
-  }
-  for (const auto &functionName : functionList_) {
-    if (!loadFunction(handle_, functionName)) {
-      MS_LOG(WARNING) << "Failed to load adump function";
-      dlclose(handle_);
-      handle_ = nullptr;
-      return false;
-    }
-  }
-  MS_LOG(INFO) << "Hook library loaded successfully.";
-  return true;
-}
-bool HookDynamicLoader::UnloadLibrary() {
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (!handle_) {
-    MS_LOG(WARNING) << "Hook library hasn't been loaded.";
-    return false;
-  }
-  dlclose(handle_);
-  handle_ = nullptr;
-  funcMap_.clear();
-  MS_LOG(INFO) << "Library unloaded successfully.";
-  return true;
-}
-void *HookDynamicLoader::GetHooker(const std::string &funcName) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  auto iter = funcMap_.find(funcName);
-  if (iter == funcMap_.end()) {
-    MS_LOG(WARNING) << "Function not found: " << funcName;
-    return nullptr;
-  }
-  return iter->second;
-}

mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl