PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2 (py3-none-any.whl) → 8.1.0 (py3-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (261) hide show

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
msprobe/README.md +57 -21
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +224 -82
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +5 -3
msprobe/core/common/file_utils.py +274 -40
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +148 -72
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +640 -462
msprobe/core/compare/check.py +36 -107
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +217 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
msprobe/core/compare/merge_result/merge_result.py +12 -6
msprobe/core/compare/multiprocessing_compute.py +227 -107
msprobe/core/compare/npy_compare.py +32 -16
msprobe/core/compare/utils.py +218 -244
msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +239 -0
msprobe/core/data_dump/data_collector.py +36 -9
msprobe/core/data_dump/data_processor/base.py +74 -53
msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
msprobe/core/data_dump/json_writer.py +146 -57
msprobe/core/debugger/precision_debugger.py +143 -0
msprobe/core/grad_probe/constant.py +2 -1
msprobe/core/grad_probe/grad_compare.py +2 -2
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/core/service.py +356 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +157 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +89 -30
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +184 -50
msprobe/docs/06.data_dump_MindSpore.md +193 -28
msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
msprobe/docs/12.overflow_check_PyTorch.md +5 -3
msprobe/docs/13.overflow_check_MindSpore.md +6 -4
msprobe/docs/14.data_parse_PyTorch.md +4 -10
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +3 -3
msprobe/docs/19.monitor.md +211 -103
msprobe/docs/21.visualization_PyTorch.md +100 -28
msprobe/docs/22.visualization_MindSpore.md +103 -31
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +190 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +3 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +73 -2
msprobe/mindspore/common/utils.py +157 -29
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +18 -398
msprobe/mindspore/compare/ms_graph_compare.py +20 -10
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +59 -7
msprobe/mindspore/debugger/precision_debugger.py +83 -90
msprobe/mindspore/dump/cell_dump_process.py +902 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
msprobe/mindspore/dump/dump_tool_factory.py +18 -8
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
msprobe/mindspore/dump/jit_dump.py +35 -27
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +9 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/mindspore_service.py +111 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/features.py +13 -1
msprobe/mindspore/monitor/module_hook.py +568 -444
msprobe/mindspore/monitor/optimizer_collect.py +331 -0
msprobe/mindspore/monitor/utils.py +71 -9
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +53 -19
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +50 -96
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +155 -0
msprobe/pytorch/hook_module/hook_module.py +18 -22
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +14 -4
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +336 -241
msprobe/pytorch/monitor/module_metric.py +17 -0
msprobe/pytorch/monitor/optimizer_collect.py +244 -224
msprobe/pytorch/monitor/utils.py +84 -4
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +13 -2
msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +5 -4
msprobe/pytorch/pt_config.py +16 -11
msprobe/pytorch/pytorch_service.py +70 -0
msprobe/visualization/builder/graph_builder.py +69 -10
msprobe/visualization/builder/msprobe_adapter.py +24 -12
msprobe/visualization/compare/graph_comparator.py +63 -51
msprobe/visualization/compare/mode_adapter.py +22 -20
msprobe/visualization/graph/base_node.py +11 -4
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +2 -13
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +251 -104
msprobe/visualization/utils.py +26 -44
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -543
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -470
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/visualization/graph_service.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -15,83 +15,93 @@
 import os
 import time
-import json
+from copy import deepcopy
+from multiprocessing import cpu_count, Pool
 from msprobe.core.common.file_utils import (check_file_type, create_directory, FileChecker,
                                             check_file_or_directory_path, load_json)
 from msprobe.core.common.const import FileCheckConst, Const
-from msprobe.core.common.utils import CompareException
-from msprobe.core.overflow_check.checker import AnomalyDetector
+from msprobe.core.common.utils import CompareException, get_dump_mode
 from msprobe.visualization.compare.graph_comparator import GraphComparator
-from msprobe.visualization.utils import GraphConst, check_directory_content
-from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig
+from msprobe.visualization.utils import GraphConst, check_directory_content, SerializableArgs
+from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig, GraphInfo, BuildGraphTaskInfo
 from msprobe.core.common.log import logger
 from msprobe.visualization.graph.node_colors import NodeColors
 from msprobe.core.compare.layer_mapping import generate_api_mapping_by_layer_mapping
 from msprobe.core.compare.utils import check_and_return_dir_contents
+from msprobe.core.common.utils import detect_framework_by_dump_json
 from msprobe.visualization.graph.distributed_analyzer import DistributedAnalyzer
 current_time = time.strftime("%Y%m%d%H%M%S")
-def _compare_graph(input_param, args):
-    logger.info('Start building model graphs...')
-    # 对两个数据进行构图
-    dump_path_n = input_param.get('npu_path')
-    dump_path_b = input_param.get('bench_path')
-    construct_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE),
-                                   FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
-    construct_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE),
-                                   FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
-    data_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.DUMP_FILE), FileCheckConst.FILE,
-                              FileCheckConst.READ_ABLE).common_check()
-    data_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.DUMP_FILE), FileCheckConst.FILE,
-                              FileCheckConst.READ_ABLE).common_check()
-    stack_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.STACK_FILE), FileCheckConst.FILE,
-                               FileCheckConst.READ_ABLE).common_check()
-    stack_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.STACK_FILE), FileCheckConst.FILE,
-                               FileCheckConst.READ_ABLE).common_check()
-    graph_n = GraphBuilder.build(construct_path_n, data_path_n, stack_path_n, complete_stack=args.complete_stack)
-    graph_b = GraphBuilder.build(construct_path_b, data_path_b, stack_path_b, complete_stack=args.complete_stack)
-    logger.info('Model graphs built successfully, start Comparing graphs...')
-    # 基于graph、stack和data进行比较
+def _compare_graph(graph_n: GraphInfo, graph_b: GraphInfo, input_param, args):
     dump_path_param = {
-        'npu_json_path': data_path_n,
-        'bench_json_path': data_path_b,
-        'stack_json_path': stack_path_n,
+        'npu_json_path': graph_n.data_path,
+        'bench_json_path': graph_b.data_path,
+        'stack_json_path': graph_n.stack_path,
         'is_print_compare_log': input_param.get("is_print_compare_log", True)
     }
-    mapping_dict = None
+    mapping_dict = {}
     if args.layer_mapping:
-        yaml_path = FileChecker(args.layer_mapping, FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check()
         try:
-            mapping_dict = generate_api_mapping_by_layer_mapping(data_path_n, data_path_b, yaml_path)
+            mapping_dict = generate_api_mapping_by_layer_mapping(graph_n.data_path, graph_b.data_path,
+                                                                 args.layer_mapping)
         except Exception:
             logger.warning('The layer mapping file parsing failed, please check file format, mapping is not effective.')
-    graph_comparator = GraphComparator([graph_n, graph_b], dump_path_param, args, mapping_dict=mapping_dict)
+    is_cross_framework = detect_framework_by_dump_json(graph_n.data_path) != \
+                         detect_framework_by_dump_json(graph_b.data_path)
+    if is_cross_framework and not args.layer_mapping:
+        logger.error('The cross_frame graph comparison failed. '
+                     'Please specify -lm or --layer_mapping when performing cross_frame graph comparison.')
+        raise CompareException(CompareException.CROSS_FRAME_ERROR)
+    graph_comparator = GraphComparator([graph_n.graph, graph_b.graph], dump_path_param, args, is_cross_framework,
+                                       mapping_dict=mapping_dict)
     graph_comparator.compare()
-    micro_steps = graph_n.paging_by_micro_step(graph_b)
+    return graph_comparator
+def _compare_graph_result(input_param, args):
+    logger.info('Start building model graphs...')
+    # 对两个数据进行构图
+    graph_n = _build_graph_info(input_param.get('npu_path'), args)
+    graph_b = _build_graph_info(input_param.get('bench_path'), args)
+    logger.info('Model graphs built successfully, start Comparing graphs...')
+    # 基于graph、stack和data进行比较
+    graph_comparator = _compare_graph(graph_n, graph_b, input_param, args)
+    # 增加micro step标记
+    micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph)
     # 开启溢出检测
     if args.overflow_check:
-        graph_n.overflow_check()
-        graph_b.overflow_check()
+        graph_n.graph.overflow_check()
+        graph_b.graph.overflow_check()
-    return CompareGraphResult(graph_n, graph_b, graph_comparator, micro_steps)
+    return CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps)
-def _export_compare_graph_result(args, graphs, graph_comparator, micro_steps,
-                                 output_file_name=f'compare_{current_time}.vis'):
-    create_directory(args.output_path)
+def _export_compare_graph_result(args, result):
+    graphs = [result.graph_n, result.graph_b]
+    graph_comparator = result.graph_comparator
+    micro_steps = result.micro_steps
+    output_file_name = result.output_file_name
+    if not output_file_name:
+        output_file_name = f'compare_{current_time}.vis'
+    logger.info(f'Start exporting compare graph result, file name: {output_file_name}...')
     output_path = os.path.join(args.output_path, output_file_name)
     task = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(graph_comparator.ma.compare_mode)
     export_config = GraphExportConfig(graphs[0], graphs[1], graph_comparator.ma.get_tool_tip(),
                                       NodeColors.get_node_colors(graph_comparator.ma.compare_mode), micro_steps, task,
-                                      args.overflow_check)
-    GraphBuilder.to_json(output_path, export_config)
-    logger.info(f'Model graphs compared successfully, the result file is saved in {output_path}')
+                                      args.overflow_check, graph_comparator.ma.compare_mode)
+    try:
+        GraphBuilder.to_json(output_path, export_config)
+        logger.info(f'Exporting compare graph result successfully, the result file is saved in {output_path}')
+        return ''
+    except RuntimeError as e:
+        logger.error(f'Failed to export compare graph result, file: {output_file_name}, error: {e}')
+        return output_file_name
-def _build_graph(dump_path, args):
-    logger.info('Start building model graph...')
+def _build_graph_info(dump_path, args):
     construct_path = FileChecker(os.path.join(dump_path, GraphConst.CONSTRUCT_FILE), FileCheckConst.FILE,
                                  FileCheckConst.READ_ABLE).common_check()
     data_path = FileChecker(os.path.join(dump_path, GraphConst.DUMP_FILE), FileCheckConst.FILE,
@@ -99,6 +109,13 @@ def _build_graph(dump_path, args):
     stack_path = FileChecker(os.path.join(dump_path, GraphConst.STACK_FILE), FileCheckConst.FILE,
                              FileCheckConst.READ_ABLE).common_check()
     graph = GraphBuilder.build(construct_path, data_path, stack_path, complete_stack=args.complete_stack)
+    return GraphInfo(graph, construct_path, data_path, stack_path)
+def _build_graph_result(dump_path, args):
+    logger.info('Start building model graphs...')
+    graph = _build_graph_info(dump_path, args).graph
+    # 增加micro step标记
     micro_steps = graph.paging_by_micro_step()
     # 开启溢出检测
     if args.overflow_check:
@@ -106,15 +123,128 @@ def _build_graph(dump_path, args):
     return BuildGraphResult(graph, micro_steps)
-def _export_build_graph_result(out_path, graph, micro_steps, overflow_check,
-                               output_file_name=f'build_{current_time}.vis'):
-    create_directory(out_path)
+def _run_build_graph_compare(input_param, args, nr, br):
+    logger.info(f'Start building graph for {nr}...')
+    graph_n = _build_graph_info(input_param.get('npu_path'), args)
+    graph_b = _build_graph_info(input_param.get('bench_path'), args)
+    logger.info(f'Building graph for {nr} finished.')
+    return BuildGraphTaskInfo(graph_n, graph_b, nr, br, current_time)
+def _run_build_graph_single(dump_ranks_path, rank, step, args):
+    logger.info(f'Start building graph for {rank}...')
+    dump_path = os.path.join(dump_ranks_path, rank)
+    output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis'
+    result = _build_graph_result(dump_path, args)
+    result.output_file_name = output_file_name
+    if rank != Const.RANK:
+        try:
+            result.rank = int(rank.replace(Const.RANK, ""))
+        except Exception as e:
+            logger.error('The folder name format is incorrect, expected rank+number.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR) from e
+    logger.info(f'Building graph for step: {step}, rank: {rank} finished.')
+    return result
+def _run_graph_compare(graph_task_info, input_param, args, output_file_name):
+    logger.info(f'Start comparing data for {graph_task_info.npu_rank}...')
+    graph_n = graph_task_info.graph_info_n
+    graph_b = graph_task_info.graph_info_b
+    nr = graph_task_info.npu_rank
+    graph_comparator = _compare_graph(graph_n, graph_b, input_param, args)
+    micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph)
+    # 开启溢出检测
+    if args.overflow_check:
+        graph_n.graph.overflow_check()
+        graph_b.graph.overflow_check()
+    graph_result = CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps)
+    graph_result.output_file_name = output_file_name
+    if nr != Const.RANK:
+        try:
+            graph_result.rank = int(nr.replace(Const.RANK, ""))
+        except Exception as e:
+            logger.error('The folder name format is incorrect, expected rank+number.')
+            raise CompareException(CompareException.INVALID_PATH_ERROR) from e
+    logger.info(f'Comparing data for {graph_task_info.npu_rank} finished.')
+    return graph_result
+def _export_build_graph_result(args, result):
+    out_path = args.output_path
+    graph = result.graph
+    micro_steps = result.micro_steps
+    overflow_check = args.overflow_check
+    output_file_name = result.output_file_name
+    if not output_file_name:
+        output_file_name = f'build_{current_time}.vis'
+    logger.info(f'Start exporting graph for {output_file_name}...')
     output_path = os.path.join(out_path, output_file_name)
-    GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, overflow_check=overflow_check))
-    logger.info(f'Model graph built successfully, the result file is saved in {output_path}')
+    try:
+        GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps,
+                                                            overflow_check=overflow_check))
+        logger.info(f'Model graph exported successfully, the result file is saved in {output_path}')
+        return None
+    except RuntimeError as e:
+        logger.error(f'Failed to export model graph, file: {output_file_name}, error: {e}')
+        return output_file_name
+def is_real_data_compare(input_param, npu_ranks, bench_ranks):
+    dump_rank_n = input_param.get('npu_path')
+    dump_rank_b = input_param.get('bench_path')
+    has_real_data = False
+    for nr, br in zip(npu_ranks, bench_ranks):
+        dump_path_param = {
+            'npu_json_path': FileChecker(os.path.join(dump_rank_n, nr, GraphConst.DUMP_FILE), FileCheckConst.FILE,
+                                         FileCheckConst.READ_ABLE).common_check(),
+            'bench_json_path': FileChecker(os.path.join(dump_rank_b, br, GraphConst.DUMP_FILE), FileCheckConst.FILE,
+                                           FileCheckConst.READ_ABLE).common_check()
+        }
+        has_real_data |= get_dump_mode(dump_path_param) == Const.ALL
+    return has_real_data
+def _mp_compare(input_param, serializable_args, output_file_name, nr, br):
+    graph_task_info = _run_build_graph_compare(input_param, serializable_args, nr, br)
+    return _run_graph_compare(graph_task_info, input_param, serializable_args, output_file_name)
 def _compare_graph_ranks(input_param, args, step=None):
+    with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool:
+        def err_call(err):
+            logger.error(f'Error occurred while comparing graph ranks: {err}')
+            try:
+                pool.close()
+            except OSError as e:
+                logger.error(f'Error occurred while terminating the pool: {e}')
+        serializable_args = SerializableArgs(args)
+        # 暂存所有rank的graph，用于匹配rank间的分布式节点
+        compare_graph_results = _get_compare_graph_results(input_param, serializable_args, step, pool, err_call)
+        # 匹配rank间的分布式节点
+        if len(compare_graph_results) > 1:
+            DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results},
+                                args.overflow_check).distributed_match()
+            DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results},
+                                args.overflow_check).distributed_match()
+        export_res_task_list = []
+        create_directory(args.output_path)
+        for result in compare_graph_results:
+            export_res_task_list.append(pool.apply_async(_export_compare_graph_result,
+                                                         args=(serializable_args, result),
+                                                         error_callback=err_call))
+        export_res_list = [res.get() for res in export_res_task_list]
+        if any(export_res_list):
+            failed_names = list(filter(lambda x: x, export_res_list))
+            logger.error(f'Unable to export compare graph results: {", ".join(failed_names)}.')
+        else:
+            logger.info('Successfully exported compare graph results.')
+def _get_compare_graph_results(input_param, serializable_args, step, pool, err_call):
     dump_rank_n = input_param.get('npu_path')
     dump_rank_b = input_param.get('bench_path')
     npu_ranks = sorted(check_and_return_dir_contents(dump_rank_n, Const.RANK))
@@ -123,32 +253,33 @@ def _compare_graph_ranks(input_param, args, step=None):
         logger.error('The number of ranks in the two runs are different. Unable to match the ranks.')
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     compare_graph_results = []
-    for nr, br in zip(npu_ranks, bench_ranks):
-        logger.info(f'Start processing data for {nr}...')
-        input_param['npu_path'] = os.path.join(dump_rank_n, nr)
-        input_param['bench_path'] = os.path.join(dump_rank_b, br)
-        output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
-        result = _compare_graph(input_param, args)
-        result.output_file_name = output_file_name
-        if nr != Const.RANK:
-            try:
-                result.rank = int(nr.replace(Const.RANK, ""))
-            except Exception as e:
-                logger.error('The folder name format is incorrect, expected rank+number.')
-                raise CompareException(CompareException.INVALID_PATH_ERROR) from e
-        # 暂存所有rank的graph，用于匹配rank间的分布式节点
-        compare_graph_results.append(result)
-    # 匹配rank间的分布式节点
-    if len(compare_graph_results) > 1:
-        DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results},
-                            args.overflow_check).distributed_match()
-        DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results},
-                            args.overflow_check).distributed_match()
-    for result in compare_graph_results:
-        _export_compare_graph_result(args, [result.graph_n, result.graph_b], result.graph_comparator,
-                                     result.micro_steps, output_file_name=result.output_file_name)
+    if is_real_data_compare(input_param, npu_ranks, bench_ranks):
+        mp_task_dict = {}
+        for nr, br in zip(npu_ranks, bench_ranks):
+            input_param['npu_path'] = os.path.join(dump_rank_n, nr)
+            input_param['bench_path'] = os.path.join(dump_rank_b, br)
+            output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
+            input_param_copy = deepcopy(input_param)
+            mp_task_dict[output_file_name] = pool.apply_async(_run_build_graph_compare,
+                                                              args=(input_param_copy, serializable_args, nr, br),
+                                                              error_callback=err_call)
+        mp_res_dict = {k: v.get() for k, v in mp_task_dict.items()}
+        for output_file_name, mp_res in mp_res_dict.items():
+            compare_graph_results.append(_run_graph_compare(mp_res, input_param, serializable_args, output_file_name))
+    else:
+        compare_graph_tasks = []
+        for nr, br in zip(npu_ranks, bench_ranks):
+            input_param['npu_path'] = os.path.join(dump_rank_n, nr)
+            input_param['bench_path'] = os.path.join(dump_rank_b, br)
+            output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis'
+            input_param_copy = deepcopy(input_param)
+            compare_graph_tasks.append(pool.apply_async(_mp_compare,
+                                                        args=(input_param_copy, serializable_args, output_file_name, nr,
+                                                              br),
+                                                        error_callback=err_call))
+        compare_graph_results = [task.get() for task in compare_graph_tasks]
+    return compare_graph_results
 def _compare_graph_steps(input_param, args):
@@ -159,7 +290,7 @@ def _compare_graph_steps(input_param, args):
     bench_steps = sorted(check_and_return_dir_contents(dump_step_b, Const.STEP))
     if npu_steps != bench_steps:
-        logger.error('The number of steps in the two runs are different. Unable to match the steps.')
+        logger.error('The number of steps in the two runs is different. Unable to match the steps.')
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     for folder_step in npu_steps:
@@ -172,28 +303,39 @@ def _compare_graph_steps(input_param, args):
 def _build_graph_ranks(dump_ranks_path, args, step=None):
+    """
+    Build the graph of every rank directory under dump_ranks_path in a worker pool and export them.
+    Rank directories are listed with check_and_return_dir_contents(dump_ranks_path, Const.RANK) and
+    processed in sorted order. When more than one graph is built, DistributedAnalyzer.distributed_match()
+    is run over the graphs keyed by rank before exporting. Truthy return values of
+    _export_build_graph_result are reported as the names of failed exports.
+    """
     ranks = sorted(check_and_return_dir_contents(dump_ranks_path, Const.RANK))
-    build_graph_results = []
-    for rank in ranks:
-        logger.info(f'Start processing data for {rank}...')
-        dump_path = os.path.join(dump_ranks_path, rank)
-        output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis'
-        result = _build_graph(dump_path, args)
-        result.output_file_name = output_file_name
-        if rank != Const.RANK:
+    serializable_args = SerializableArgs(args)
+    with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool:
+        def err_call(err):
+            # Error callback passed to the apply_async calls: log the failure, then close the pool.
+            logger.error(f'Error occurred while building graph ranks: {err}')
             try:
-                result.rank = int(rank.replace(Const.RANK, ""))
-            except Exception as e:
-                logger.error('The folder name format is incorrect, expected rank+number.')
-                raise CompareException(CompareException.INVALID_PATH_ERROR) from e
-        build_graph_results.append(result)
-    if len(build_graph_results) > 1:
-        DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results},
-                            args.overflow_check).distributed_match()
+                pool.close()
+            except OSError as e:
+                logger.error(f'Error occurred while terminating the pool: {e}')
+        build_graph_tasks = []
+        for rank in ranks:
+            build_graph_tasks.append(pool.apply_async(_run_build_graph_single,
+                                                      args=(dump_ranks_path, rank, step, serializable_args),
+                                                      error_callback=err_call))
+        # get() re-raises in this process any exception raised by the worker task
+        build_graph_results = [task.get() for task in build_graph_tasks]
+        # Match the distributed nodes across ranks
+        if len(build_graph_results) > 1:
+            DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results},
+                                args.overflow_check).distributed_match()
+        create_directory(args.output_path)
+        export_build_graph_tasks = []
+        for result in build_graph_results:
+            export_build_graph_tasks.append(pool.apply_async(_export_build_graph_result,
+                                                             args=(serializable_args, result),
+                                                             error_callback=err_call))
+        export_build_graph_result = [task.get() for task in export_build_graph_tasks]
+        if any(export_build_graph_result):
+            failed_names = list(filter(lambda x: x, export_build_graph_result))
+            logger.error(f'Unable to export build graph results: {failed_names}.')
+        else:
+            logger.info('Successfully exported build graph results.')
-    for result in build_graph_results:
-        _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check,
-                                   result.output_file_name)
 def _build_graph_steps(dump_steps_path, args):
@@ -209,7 +351,7 @@ def _graph_service_parser(parser):
                         help="<Required> The compare input path, a dict json.", required=True)
     parser.add_argument("-o", "--output_path", dest="output_path", type=str,
                         help="<Required> The compare task result out path.", required=True)
-    parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str,
+    parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str, nargs='?', const=True,
                         help="<Optional> The layer mapping file path.", required=False)
     parser.add_argument("-oc", "--overflow_check", dest="overflow_check", action="store_true",
                         help="<Optional> whether open overflow_check for graph.", required=False)
@@ -233,8 +375,11 @@ def _graph_service_command(args):
         elif content == GraphConst.STEPS:
             _build_graph_steps(npu_path, args)
         else:
-            result = _build_graph(npu_path, args)
-            _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check)
+            result = _build_graph_result(npu_path, args)
+            create_directory(args.output_path)
+            file_name = _export_build_graph_result(args, result)
+            if file_name:
+                logger.error('Failed to export model build graph.')
     elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR:
         content_n = check_directory_content(npu_path)
         content_b = check_directory_content(bench_path)
@@ -245,9 +390,11 @@ def _graph_service_command(args):
         elif content_n == GraphConst.STEPS:
             _compare_graph_steps(input_param, args)
         else:
-            result = _compare_graph(input_param, args)
-            _export_compare_graph_result(args, [result.graph_n, result.graph_b],
-                                         result.graph_comparator, result.micro_steps)
+            result = _compare_graph_result(input_param, args)
+            create_directory(args.output_path)
+            file_name = _export_compare_graph_result(args, result)
+            if file_name:
+                logger.error('Failed to export model compare graph.')
     else:
         logger.error("The npu_path or bench_path should be a folder.")
         raise CompareException(CompareException.INVALID_COMPARE_MODE)

msprobe/visualization/utils.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -16,9 +16,10 @@
 import os
 import re
 import json
+import pickle
 from msprobe.core.common.file_utils import FileOpen
 from msprobe.core.common.const import CompareConst, Const
-from msprobe.core.compare.acc_compare import Comparator, ModeConfig
+from msprobe.core.common.log import logger
 def load_json_file(file_path):
@@ -42,23 +43,6 @@ def load_data_json_file(file_path):
     return load_json_file(file_path).get(GraphConst.DATA_KEY, {})
-def save_json_file(file_path, data):
-    """
-    保存json文件
-    """
-    with FileOpen(file_path, 'w') as f:
-        f.write(json.dumps(data, indent=4))
-def get_csv_df(stack_mode, csv_data, compare_mode):
-    """
-    调用acc接口写入csv
-    """
-    dump_mode = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(compare_mode)
-    mode_config = ModeConfig(stack_mode=stack_mode, dump_mode=dump_mode)
-    return Comparator(mode_config).make_result_table(csv_data)
 def str2float(percentage_str):
     """
     百分比字符串转换转换为浮点型
@@ -73,14 +57,6 @@ def str2float(percentage_str):
         return 0
-def is_integer(s):
-    try:
-        int(s)
-        return True
-    except Exception:
-        return False
 def check_directory_content(input_path):
     """
     检查input_path内容, 是否全是step{数字}命名的文件夹(例如step0), 或者全是rank{数字}命名的文件夹(例如rank0), 或者全是文件
@@ -143,14 +119,12 @@ class ToolTip:
         '当最大相对误差越接近0表示其计算的误差越小。'
         '当dump数据中存在0或Nan时，比对结果中最大相对误差则出现inf或Nan的情况，属于正常现象'
     )
-    SMALL_VALUE_TIP = '{}, 由于{}小于{}, 建议不参考此相对误差，请参考绝对误差'
 class GraphConst:
     CONSTRUCT_FILE = 'construct.json'
     DUMP_FILE = 'dump.json'
     STACK_FILE = 'stack.json'
-    GRAPH_FILE = 'graph.vis'
     ERROR_KEY = 'error_key'
     SUMMARY_COMPARE = 0
     MD5_COMPARE = 1
@@ -164,35 +138,22 @@ class GraphConst:
     JSON_DATA_KEY = 'dump_data_dir'
     JSON_TASK_KEY = 'task'
     DATA_KEY = 'data'
-    REAL_DATA_TH = 0.1
-    MAX_RELATIVE_ERR_TH = 0.5
     ROUND_TH = 6
     JSON_INDEX_KEY = 'precision_index'
     MATCHED_DISTRIBUTED = 'matched_distributed'
     OVERFLOW_LEVEL = 'overflow_level'
     MAX_INDEX_KEY = 1
     MIN_INDEX_KEY = 0
-    SUGGEST_KEY = 'text'
-    TAG_NA = 'na'
-    OUTPUT_INDEX_TWO = -2
-    OUTPUT_INDEX_THREE = -3
-    OUTPUT_MIN_LEN = 3
     INPUT = '.input.'
     OUTPUT = '.output.'
     STR_MAX_LEN = 50
-    SMALL_VALUE = 1e-3
     MD5_INDEX_LIST = [CompareConst.RESULT]
-    REAL_DATA_INDEX_LIST = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR,
-                            CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
-    SUMMARY_INDEX_LIST = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF,
-                          CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR,
-                          CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR]
-    VALUE_INDEX_LIST = [Const.MAX, Const.MIN, Const.MEAN, Const.NORM]
+    REAL_DATA_INDEX_LIST = CompareConst.ALL_COMPARE_INDEX
+    SUMMARY_INDEX_LIST = CompareConst.SUMMARY_COMPARE_INDEX
     APIS_BETWEEN_MODULES = 'Apis_Between_Modules'
     NULL = 'null'
     NONE = 'None'
     VALUE = 'value'
-    BRACE = '{}'
     DESCRIPTION = 'description'
     COLORS = 'Colors'
     MICRO_STEPS = 'MicroSteps'
@@ -223,3 +184,24 @@ class GraphConst:
     OP = 'op'
     PEER = 'peer'
     GROUP_ID = 'group_id'
+def is_serializable(obj):
+    """
+    Report whether obj can be pickled.
+    Returns True when pickle.dumps(obj) succeeds, and False when it raises
+    pickle.PicklingError, AttributeError or TypeError. Any other Exception is
+    logged and re-raised as a RuntimeError chained to the original error.
+    """
+    unexpected_msg = 'Unexpected error occurred while pickling obj.'
+    try:
+        pickle.dumps(obj)
+    except (pickle.PicklingError, AttributeError, TypeError):
+        return False
+    except Exception as err:
+        logger.error(unexpected_msg)
+        raise RuntimeError(unexpected_msg) from err
+    return True
+class SerializableArgs:
+    """
+    Copy of an argument object restricted to the attributes accepted by is_serializable.
+    Attributes of args whose value is rejected by is_serializable are left out.
+    """
+    def __init__(self, args):
+        picklable = {name: value for name, value in vars(args).items() if is_serializable(value)}
+        for name, value in picklable.items():
+            setattr(self, name, value)

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl