mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +3 -2
  2. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/RECORD +196 -141
  3. msprobe/CMakeLists.txt +5 -0
  4. msprobe/README.md +14 -19
  5. msprobe/config.json +1 -0
  6. msprobe/core/common/const.py +155 -6
  7. msprobe/core/common/exceptions.py +3 -1
  8. msprobe/core/common/file_utils.py +33 -7
  9. msprobe/core/common/inplace_ops.yaml +3 -0
  10. msprobe/core/common/utils.py +28 -14
  11. msprobe/core/common_config.py +6 -0
  12. msprobe/core/compare/acc_compare.py +139 -128
  13. msprobe/core/compare/check.py +31 -29
  14. msprobe/core/compare/compare_cli.py +17 -16
  15. msprobe/core/compare/highlight.py +186 -99
  16. msprobe/core/compare/layer_mapping/data_scope_parser.py +18 -7
  17. msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
  18. msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
  19. msprobe/core/compare/merge_result/merge_result.py +380 -0
  20. msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
  21. msprobe/core/compare/multiprocessing_compute.py +2 -2
  22. msprobe/core/compare/npy_compare.py +109 -147
  23. msprobe/core/compare/utils.py +189 -69
  24. msprobe/core/data_dump/data_collector.py +51 -21
  25. msprobe/core/data_dump/data_processor/base.py +38 -20
  26. msprobe/core/data_dump/data_processor/factory.py +5 -3
  27. msprobe/core/data_dump/data_processor/mindspore_processor.py +154 -20
  28. msprobe/core/data_dump/data_processor/pytorch_processor.py +118 -58
  29. msprobe/core/data_dump/json_writer.py +29 -1
  30. msprobe/core/data_dump/scope.py +19 -18
  31. msprobe/core/overflow_check/abnormal_scene.py +9 -5
  32. msprobe/core/overflow_check/checker.py +1 -1
  33. msprobe/core/overflow_check/utils.py +1 -1
  34. msprobe/docs/01.installation.md +96 -17
  35. msprobe/docs/02.config_introduction.md +5 -5
  36. msprobe/docs/05.data_dump_PyTorch.md +91 -61
  37. msprobe/docs/06.data_dump_MindSpore.md +57 -19
  38. msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
  39. msprobe/docs/09.accuracy_checker_MindSpore.md +4 -4
  40. msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
  41. msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
  42. msprobe/docs/12.overflow_check_PyTorch.md +1 -1
  43. msprobe/docs/19.monitor.md +120 -27
  44. msprobe/docs/21.visualization_PyTorch.md +115 -35
  45. msprobe/docs/22.visualization_MindSpore.md +138 -41
  46. msprobe/docs/23.generate_operator_PyTorch.md +107 -0
  47. msprobe/docs/24.code_mapping_Mindspore.md +28 -0
  48. msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
  49. msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
  50. msprobe/docs/27.dump_json_instruction.md +521 -0
  51. msprobe/docs/FAQ.md +26 -2
  52. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
  53. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
  54. msprobe/docs/img/merge_result.png +0 -0
  55. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  56. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  57. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  58. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  59. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  60. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  61. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  62. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  63. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  64. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  65. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  66. msprobe/docs/visualization/GPTModel.png +0 -0
  67. msprobe/docs/visualization/ParallelMLP.png +0 -0
  68. msprobe/docs/visualization/layer_mapping_example.md +132 -0
  69. msprobe/docs/visualization/mapping.png +0 -0
  70. msprobe/docs/visualization/mapping1.png +0 -0
  71. msprobe/docs/visualization/module_name.png +0 -0
  72. msprobe/docs/visualization/module_name1.png +0 -0
  73. msprobe/docs/visualization/no_mapping.png +0 -0
  74. msprobe/docs/visualization/no_mapping1.png +0 -0
  75. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  76. msprobe/docs/visualization/top_layer.png +0 -0
  77. msprobe/mindspore/__init__.py +10 -0
  78. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +57 -25
  79. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
  80. msprobe/mindspore/api_accuracy_checker/compute_element.py +5 -7
  81. msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
  82. msprobe/mindspore/api_accuracy_checker/main.py +1 -0
  83. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
  84. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
  85. msprobe/mindspore/code_mapping/bind.py +264 -0
  86. msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
  87. msprobe/mindspore/code_mapping/graph.py +49 -0
  88. msprobe/mindspore/code_mapping/graph_parser.py +226 -0
  89. msprobe/mindspore/code_mapping/main.py +24 -0
  90. msprobe/mindspore/code_mapping/processor.py +34 -0
  91. msprobe/mindspore/common/const.py +3 -1
  92. msprobe/mindspore/common/utils.py +50 -5
  93. msprobe/mindspore/compare/distributed_compare.py +0 -2
  94. msprobe/mindspore/compare/ms_compare.py +105 -63
  95. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  96. msprobe/mindspore/debugger/debugger_config.py +3 -0
  97. msprobe/mindspore/debugger/precision_debugger.py +81 -12
  98. msprobe/mindspore/dump/hook_cell/api_registry.py +83 -16
  99. msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
  100. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
  101. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
  102. msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
  103. msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
  104. msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
  105. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
  106. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
  107. msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
  108. msprobe/mindspore/grad_probe/hook.py +13 -4
  109. msprobe/mindspore/mindtorch/__init__.py +18 -0
  110. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
  111. msprobe/mindspore/ms_config.py +5 -1
  112. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
  113. msprobe/mindspore/service.py +267 -101
  114. msprobe/msprobe.py +24 -3
  115. msprobe/pytorch/__init__.py +7 -6
  116. msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
  117. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
  118. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
  119. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
  120. msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
  121. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
  122. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
  123. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
  124. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +54 -30
  125. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
  126. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
  127. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
  128. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
  129. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
  130. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
  131. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
  132. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
  133. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
  134. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
  135. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
  136. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
  137. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
  138. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
  139. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
  140. msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
  141. msprobe/pytorch/common/parse_json.py +2 -1
  142. msprobe/pytorch/common/utils.py +45 -2
  143. msprobe/pytorch/compare/distributed_compare.py +17 -29
  144. msprobe/pytorch/compare/pt_compare.py +40 -20
  145. msprobe/pytorch/debugger/debugger_config.py +27 -12
  146. msprobe/pytorch/debugger/precision_debugger.py +42 -12
  147. msprobe/pytorch/dump/module_dump/__init__.py +0 -0
  148. msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
  149. msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +80 -6
  150. msprobe/pytorch/free_benchmark/common/params.py +2 -1
  151. msprobe/pytorch/free_benchmark/common/utils.py +3 -0
  152. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
  153. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
  154. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
  155. msprobe/pytorch/hook_module/__init__.py +1 -1
  156. msprobe/pytorch/hook_module/hook_module.py +14 -11
  157. msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
  158. msprobe/pytorch/hook_module/support_wrap_ops.yaml +34 -0
  159. msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
  160. msprobe/pytorch/hook_module/wrap_functional.py +0 -40
  161. msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
  162. msprobe/pytorch/monitor/anomaly_detect.py +107 -22
  163. msprobe/pytorch/monitor/csv2tb.py +166 -0
  164. msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
  165. msprobe/pytorch/monitor/features.py +3 -3
  166. msprobe/pytorch/monitor/module_hook.py +483 -277
  167. msprobe/pytorch/monitor/module_metric.py +27 -48
  168. msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
  169. msprobe/pytorch/monitor/optimizer_collect.py +52 -14
  170. msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
  171. msprobe/pytorch/monitor/utils.py +77 -6
  172. msprobe/pytorch/online_dispatch/dispatch.py +8 -2
  173. msprobe/pytorch/parse_tool/lib/compare.py +10 -10
  174. msprobe/pytorch/parse_tool/lib/config.py +5 -7
  175. msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
  176. msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
  177. msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
  178. msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
  179. msprobe/pytorch/parse_tool/lib/utils.py +18 -19
  180. msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
  181. msprobe/pytorch/service.py +176 -106
  182. msprobe/visualization/builder/graph_builder.py +62 -5
  183. msprobe/visualization/builder/msprobe_adapter.py +24 -2
  184. msprobe/visualization/compare/graph_comparator.py +64 -14
  185. msprobe/visualization/compare/mode_adapter.py +1 -15
  186. msprobe/visualization/graph/base_node.py +12 -17
  187. msprobe/visualization/graph/distributed_analyzer.py +318 -0
  188. msprobe/visualization/graph/graph.py +9 -0
  189. msprobe/visualization/graph_service.py +97 -23
  190. msprobe/visualization/utils.py +14 -29
  191. msprobe/pytorch/functional/module_dump.py +0 -84
  192. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
  193. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +0 -0
  194. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -0
  195. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
  196. /msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
  197. /msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0
@@ -46,6 +46,13 @@ class KernelGraphOverflowCheck:
46
46
  self.dump_json["common_dump_settings"]["op_debug_mode"] = 2
47
47
 
48
48
  def handle(self):
49
+ try:
50
+ from msprobe.lib import _msprobe_c
51
+ return
52
+ except ImportError:
53
+ # 如果没有_msprobe_c走MindSpore老流程
54
+ logger.info("Module _msprobe_c has not been installed, use interface in mindspore instead.")
55
+
49
56
  if os.getenv("GRAPH_OP_RUN") == "1":
50
57
  raise Exception("Must run in graph mode, not kbk mode")
51
58
  json_path = self.dump_json["common_dump_settings"]["path"]
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
2
  # All rights reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +20,8 @@ from collections import defaultdict
20
20
 
21
21
  import mindspore as ms
22
22
  from mindspore import nn
23
+ from mindspore.common.api import _no_grad
24
+ from mindspore.ops.primitive import Primitive
23
25
  try:
24
26
  from mindspore.common._pijit_context import PIJitCaptureContext
25
27
  except ImportError:
@@ -27,19 +29,25 @@ except ImportError:
27
29
  else:
28
30
  pijit_label = True
29
31
 
30
-
31
32
  from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException
32
33
  from msprobe.core.common.file_utils import create_directory
33
34
  from msprobe.core.common.utils import Const, print_tools_ends_info
34
35
  from msprobe.core.data_dump.data_collector import build_data_collector
35
- from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs
36
+ from msprobe.core.data_dump.data_processor.base import (ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs,
37
+ ModuleBackwardInputs)
36
38
  from msprobe.core.data_dump.scope import BaseScope
37
39
  from msprobe.mindspore.cell_processor import CellProcessor
38
40
  from msprobe.mindspore.common.log import logger
39
- from msprobe.mindspore.common.utils import get_rank_if_initialized
41
+ from msprobe.mindspore.common.utils import (get_rank_if_initialized, clean_input_kwargs,
42
+ is_mindtorch, register_backward_hook_functions)
40
43
  from msprobe.mindspore.dump.hook_cell.api_registry import api_register
41
44
  from msprobe.mindspore.dump.hook_cell.primitive_hooks import PrimitiveHookService
42
45
  from msprobe.mindspore.dump.jit_dump import JitDump
46
+ from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
47
+ from msprobe.mindspore.dump.kernel_dump.kernel_config import create_kernel_config_json
48
+
49
+ if is_mindtorch():
50
+ import torch
43
51
 
44
52
 
45
53
  class Service:
@@ -51,54 +59,144 @@ class Service:
51
59
  self.cell_processor = CellProcessor(self.data_collector.scope)
52
60
  self.primitive_hook_service = PrimitiveHookService(self)
53
61
  self.switch = False
62
+ self.inner_switch = False
54
63
  self.primitive_switch = False
55
64
  self.current_iter = 0
56
65
  self.first_start = True
57
66
  self.current_rank = None
58
67
  self.dump_iter_dir = None
59
68
  self.start_call = False
60
- self.check_level_valid()
61
69
  self.should_stop_service = False
70
+ self.params_grad_info = {}
71
+ # 提前注册,确保注册尽可能多的API hook
72
+ self.register_api_hook()
62
73
 
63
74
  @staticmethod
64
- def check_model_valid(model):
65
- if not model or isinstance(model, nn.Cell):
66
- return model
67
- raise MsprobeException(
68
- MsprobeException.INVALID_PARAM_ERROR, "model 参数必须是 mindspore.nn.Cell 类型。"
69
- )
75
+ def check_model_valid(models):
76
+ target_module_type = (torch.nn.Module, "torch.nn.Module") if is_mindtorch() else (nn.Cell, "mindspore.nn.Cell")
77
+ if models is None or isinstance(models, target_module_type[0]):
78
+ return models
79
+ error_model = None
80
+ if isinstance(models, (list, tuple)):
81
+ for model in models:
82
+ if not isinstance(model, target_module_type[0]):
83
+ error_model = model
84
+ break
85
+ else:
86
+ error_model = models
70
87
 
71
- def check_level_valid(self):
72
- if self.config.level == Const.LEVEL_L2:
88
+ if error_model is not None:
89
+ error_info = (f"The 'model' parameter must be a {target_module_type[1]} or list[{target_module_type[1]}] "
90
+ f"type, currently there is a {type(error_model)} type.")
73
91
  raise MsprobeException(
74
- MsprobeException.INVALID_PARAM_ERROR, "L2 level dump function is currently not supported."
75
- )
92
+ MsprobeException.INVALID_PARAM_ERROR, error_info)
93
+ return models
94
+
95
+ @staticmethod
96
+ def prepare_module_input_output(target_type, cell, input_data, output):
97
+ if target_type == BaseScope.Module_Type_Module:
98
+ module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs={}, output=output)
99
+ else:
100
+ module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs=cell.input_kwargs, output=output)
101
+ return module_input_output
76
102
 
77
103
  def build_hook(self, target_type, name):
78
- def forward_hook(api_or_cell_name, cell, input_data, output):
79
- if not self.should_excute_hook():
80
- if hasattr(cell, 'input_kwargs'):
81
- del cell.input_kwargs
104
+ def pre_hook(api_or_cell_name, cell, input_data):
105
+ if not self.should_execute_hook(target_type, cell, True):
106
+ clean_input_kwargs(cell)
82
107
  return None
83
108
 
84
- if target_type == BaseScope.Module_Type_Module:
85
- api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
86
- module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs={}, output=output)
87
- else:
88
- module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs=cell.input_kwargs,
89
- output=output)
109
+ with _no_grad():
110
+ self.inner_switch = True
111
+ if target_type == BaseScope.Module_Type_Module:
112
+ api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
113
+ else:
114
+ cell.forward_data_collected = True
115
+ HOOKCell.add_cell_count(name)
116
+ module_input_output = self.prepare_module_input_output(target_type, cell, input_data, None)
117
+ self.data_collector.update_api_or_module_name(api_or_cell_name)
118
+ self.data_collector.forward_input_data_collect(api_or_cell_name, cell, pid, module_input_output)
119
+ self.inner_switch = False
120
+ return input_data
121
+
122
+ def grad_hook(cell, ori_name, param_name):
123
+ def hook_fn(grad):
124
+ if not self.should_execute_hook(target_type, cell, False):
125
+ return None
126
+ self.inner_switch = True
127
+ self.data_collector.params_data_collect(ori_name, param_name, pid, grad)
128
+ self.inner_switch = False
129
+ return None
90
130
 
91
- self.data_collector.update_api_or_module_name(api_or_cell_name)
92
- self.data_collector.forward_data_collect(api_or_cell_name, cell, pid, module_input_output)
93
- if self.data_collector.if_return_forward_new_output():
94
- return self.data_collector.get_forward_new_output()
95
- if hasattr(cell, 'input_kwargs'):
96
- del cell.input_kwargs
97
- return output
131
+ return hook_fn
132
+
133
+ def register_param_hook(ori_name, cell, params_dict):
134
+ '''
135
+ 注册参数hook
136
+ '''
137
+ # data_mode为forward时,不注册参数hook
138
+ if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
139
+ for param_name, param in params_dict.items():
140
+ if param.requires_grad:
141
+ param.register_hook(grad_hook(cell, ori_name, param_name))
142
+
143
+ def init_params_grad_info(cell, params_dict):
144
+ '''
145
+ 初始化参数梯度信息, 在前向hook结束后, 将参数梯度信息写入cache_data中用于占位
146
+ '''
147
+ if not params_dict:
148
+ return
149
+ if not (Const.FORWARD in self.config.data_mode and Const.BACKWARD not in self.config.data_mode):
150
+ grad_name = cell.params_grad_name if hasattr(cell, 'params_grad_name') else None
151
+ # 判断是否已经在cache_data中进行了占位, 若没有则先写入cache_data中
152
+ if not self.params_grad_info.get(grad_name):
153
+ data_info = {grad_name: {key: [None] for key, value in params_dict.items() if value.requires_grad}}
154
+ # 当模块中的参数有requires_grad属性为True时,才会进行梯度计算,此时才需要占位
155
+ if data_info.get(grad_name):
156
+ # 将grad_name的data_info先写入cache_data中, 梯度计算后再更新
157
+ self.data_collector.handle_data(grad_name, data_info,
158
+ flush=self.data_collector.data_processor.is_terminated)
159
+ # 记录当前模块的参数梯度信息已占位
160
+ self.params_grad_info[grad_name] = True
161
+
162
+ def forward_hook(api_or_cell_name, cell, input_data, output):
163
+ if not self.should_execute_hook(target_type, cell, True):
164
+ clean_input_kwargs(cell)
165
+ return None
166
+ with _no_grad():
167
+ self.inner_switch = True
168
+ module_input_output = self.prepare_module_input_output(target_type, cell, input_data, output)
169
+ if target_type == BaseScope.Module_Type_Module:
170
+ api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
171
+ params_dict = {key.split(Const.SEP)[-1]: value for key, value in cell.parameters_dict(
172
+ recurse=False).items()}
173
+ setattr(module_input_output, Const.PARAMS, params_dict)
174
+ # 判断是否需要注册参数hook
175
+ if not hasattr(cell, 'params_grad_name') and params_dict:
176
+ ori_name = api_or_cell_name.rsplit(Const.SEP, 2)[0]
177
+ grad_name = ori_name + Const.SEP + Const.PARAMS_GRAD
178
+ # 首次执行前向hook时,添加params_grad_name属性,并注册参数hook
179
+ setattr(cell, 'params_grad_name', grad_name)
180
+ register_param_hook(ori_name, cell, params_dict)
181
+ self.data_collector.update_api_or_module_name(api_or_cell_name)
182
+ self.data_collector.forward_data_collect(api_or_cell_name, cell, pid, module_input_output)
183
+ init_params_grad_info(cell, params_dict)
184
+ else:
185
+ self.data_collector.update_api_or_module_name(api_or_cell_name)
186
+ self.data_collector.forward_output_data_collect(api_or_cell_name, cell, pid, module_input_output)
187
+
188
+ if self.data_collector.if_return_forward_new_output():
189
+ forward_new_output = self.data_collector.get_forward_new_output()
190
+ self.inner_switch = False
191
+ return forward_new_output
192
+ clean_input_kwargs(cell)
193
+ self.inner_switch = False
194
+ return output
98
195
 
99
196
  def backward_hook(api_or_cell_name, cell, grad_input, grad_output):
100
- if not self.should_excute_hook():
197
+ if not self.should_execute_hook(target_type, cell, False):
101
198
  return
199
+ self.inner_switch = True
102
200
 
103
201
  need_exchange = True
104
202
  if target_type == BaseScope.Module_Type_Module:
@@ -114,12 +212,32 @@ class Service:
114
212
  else:
115
213
  module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output)
116
214
  self.data_collector.backward_data_collect(api_or_cell_name, cell, pid, module_input_output)
215
+ self.inner_switch = False
216
+
217
+ def pre_backward_hook(api_or_cell_name, cell, grad_input):
218
+ if not self.should_execute_hook(target_type, cell, False):
219
+ return
220
+ self.inner_switch = True
221
+ module_input = ModuleBackwardInputs(grad_input=grad_input)
222
+ self.data_collector.update_api_or_module_name(api_or_cell_name)
223
+ self.data_collector.backward_input_data_collect(api_or_cell_name, cell, pid, module_input)
224
+
225
+ self.inner_switch = False
117
226
 
118
227
  pid = os.getpid()
119
- forward_name_template = name + Const.FORWARD
120
- backward_name_template = name + Const.BACKWARD
121
- forward_hook = functools.partial(forward_hook, forward_name_template)
122
- backward_hook = functools.partial(backward_hook, backward_name_template)
228
+ if target_type == BaseScope.Module_Type_Module:
229
+ full_forward_name = name + Const.FORWARD
230
+ full_backward_name = name + Const.BACKWARD
231
+ else:
232
+ full_forward_name = name + str(HOOKCell.get_cell_count(name)) + Const.SEP + Const.FORWARD
233
+ full_backward_name = name + str(HOOKCell.get_cell_count(name)) + Const.SEP + Const.BACKWARD
234
+ pre_forward_hook = functools.partial(pre_hook, full_forward_name)
235
+ forward_hook = functools.partial(forward_hook, full_forward_name)
236
+ backward_hook = functools.partial(backward_hook, full_backward_name)
237
+ pre_backward_hook = functools.partial(pre_backward_hook, full_backward_name)
238
+
239
+ def wrap_pre_forward_hook(cell, input_data):
240
+ return pre_forward_hook(cell, input_data)
123
241
 
124
242
  def wrap_forward_hook(cell, input_data, output_data):
125
243
  return forward_hook(cell, input_data, output_data)
@@ -127,7 +245,10 @@ class Service:
127
245
  def wrap_backward_hook(cell, grad_input, grad_output):
128
246
  return backward_hook(cell, grad_input, grad_output)
129
247
 
130
- return wrap_forward_hook, wrap_backward_hook
248
+ def wrap_pre_backward_hook(cell, grad_input):
249
+ return pre_backward_hook(cell, grad_input)
250
+
251
+ return wrap_pre_forward_hook, wrap_forward_hook, wrap_backward_hook, wrap_pre_backward_hook
131
252
 
132
253
  def update_primitive_counters(self, primitive_name):
133
254
  if primitive_name not in self.primitive_counters:
@@ -135,33 +256,20 @@ class Service:
135
256
  else:
136
257
  self.primitive_counters[primitive_name] += 1
137
258
 
138
- def register_primitive_hooks(self):
139
- primitive_set = set()
140
- for _, cell in self.model.cells_and_names():
141
- for pname, primitive in cell._primitives.items():
142
- primitive_set.add((pname, primitive))
143
-
144
- for pname, primitive in primitive_set:
145
- primitive_class_name = primitive.__class__.__name__
146
- primitive_combined_name = pname + Const.SEP + primitive_class_name
147
- new_primitive = type('NewPrimitive', (primitive.__class__,),
148
- {'__call__': self.primitive_hook_service.wrap_primitive(primitive.__call__,
149
- primitive_combined_name)})
150
- primitive.__class__ = new_primitive
151
-
152
259
  def step(self):
260
+ if self.config.async_dump:
261
+ self.data_collector.fill_stack_tensor_data()
262
+ self.data_collector.data_processor.dump_async_data()
263
+ self.data_collector.write_json()
153
264
  self.current_iter += 1
154
265
  self.data_collector.update_iter(self.current_iter)
155
- self.primitive_hook_service.primitive_counters.clear()
156
- self.data_collector.data_writer.reset_cache()
157
- JitDump.jit_count = defaultdict(int)
266
+ self.reset_status()
158
267
 
159
268
  def start(self, model=None):
160
269
  self.start_call = True
161
270
  if self.should_stop_service:
162
271
  return
163
272
  if self.need_end_service():
164
- api_register.api_set_ori_func()
165
273
  self.should_stop_service = True
166
274
  self.switch = False
167
275
  self.primitive_switch = False
@@ -181,7 +289,8 @@ class Service:
181
289
 
182
290
  if self.config.rank and self.current_rank not in self.config.rank:
183
291
  return
184
- self.register_hook_new()
292
+ self.register_primitive_hook()
293
+ self.register_cell_hook()
185
294
  if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
186
295
  JitDump.set_config(self.config)
187
296
  JitDump.set_data_collector(self.data_collector)
@@ -200,25 +309,6 @@ class Service:
200
309
  logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")
201
310
  JitDump.jit_dump_switch = True
202
311
 
203
- def forward_backward_dump_end(self):
204
- if self.should_stop_service:
205
- return
206
- logger.info(f"{Const.TOOL_NAME}: debugger.forward_backward_dump_end() is set successfully. ")
207
- if not self.start_call:
208
- logger.error(f"{Const.TOOL_NAME}: debugger.start() is not set in the current scope.")
209
- raise Exception("debugger.start() is not set in the current scope.")
210
- if not self.switch:
211
- logger.error(f"{Const.TOOL_NAME}: debugger.forward_backward_dump_end() should be called between "
212
- "debugger.start() and debugger.stop() ")
213
- raise Exception("debugger.stop() is already called. ")
214
- if self.config.step and self.current_iter not in self.config.step:
215
- return
216
- if self.config.rank and self.current_rank not in self.config.rank:
217
- return
218
- self.primitive_switch = False
219
- api_register.api_set_ori_func()
220
- JitDump.jit_dump_switch = False
221
-
222
312
  def stop(self):
223
313
  if self.should_stop_service:
224
314
  return
@@ -234,6 +324,9 @@ class Service:
234
324
  self.switch = False
235
325
  self.primitive_switch = False
236
326
  self.start_call = False
327
+ if self.config.async_dump:
328
+ self.data_collector.fill_stack_tensor_data()
329
+ self.data_collector.data_processor.dump_async_data()
237
330
  self.data_collector.write_json()
238
331
  JitDump.jit_dump_switch = False
239
332
 
@@ -244,8 +337,16 @@ class Service:
244
337
  return True
245
338
  return False
246
339
 
247
- def should_excute_hook(self):
248
- if not self.switch:
340
+ def should_execute_hook(self, hook_type, cell, is_forward):
341
+ is_cell_hook = hook_type == BaseScope.Module_Type_Module
342
+ if is_cell_hook and not self.switch:
343
+ return False
344
+ elif not is_cell_hook and is_forward and not self.switch:
345
+ return False
346
+ elif not is_cell_hook and not is_forward and not cell.forward_data_collected:
347
+ return False
348
+
349
+ if self.inner_switch:
249
350
  return False
250
351
  if not self.data_collector or self.data_collector.data_processor.is_terminated:
251
352
  return False
@@ -255,6 +356,12 @@ class Service:
255
356
  create_directory(self.config.dump_path)
256
357
  self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}")
257
358
  cur_rank = self.current_rank if self.current_rank is not None else ''
359
+ if self.config.level == Const.LEVEL_L2:
360
+ create_directory(self.dump_iter_dir)
361
+ kernel_config_path = create_kernel_config_json(self.dump_iter_dir, cur_rank)
362
+ self.config.kernel_config_path = kernel_config_path
363
+ return
364
+
258
365
  dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}")
259
366
  create_directory(dump_dir)
260
367
  if self.config.task in self.data_collector.tasks_need_tensor_data:
@@ -267,37 +374,96 @@ class Service:
267
374
  stack_file_path = os.path.join(dump_dir, "stack.json")
268
375
  construct_file_path = os.path.join(dump_dir, "construct.json")
269
376
  self.data_collector.update_dump_paths(
270
- dump_file_path, stack_file_path, construct_file_path, dump_data_dir, None)
377
+ dump_file_path, stack_file_path, construct_file_path, dump_data_dir, None
378
+ )
379
+ self.data_collector.initialize_json_file(
380
+ framework=Const.MT_FRAMEWORK if is_mindtorch() else Const.MS_FRAMEWORK
381
+ )
271
382
 
272
383
  def empty(self, *args, **kwargs):
273
384
  pass
274
385
 
275
- def register_hook_new(self):
276
- logger.info("The {} hook function is successfully mounted to the model.".format(self.config.task))
277
- if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
386
+ def register_api_hook(self):
387
+ if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1, Const.LEVEL_L2]:
388
+ logger.info(f"The api {self.config.task} hook function is successfully mounted to the model.")
278
389
  api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
279
390
  api_register.api_set_hook_func()
280
- if self.model and self.config.task in Const.DUMP_DATA_COLLECTION_LIST:
281
- self.register_primitive_hooks()
282
391
 
392
+ def get_cells_and_names(self):
393
+ cells_and_names_with_index = {}
394
+
395
+ def get_cell_or_module(model):
396
+ return model.named_modules() if is_mindtorch() else model.cells_and_names()
397
+
398
+ if isinstance(self.model, (list, tuple)):
399
+ for index, model in enumerate(self.model):
400
+ cells_and_names_with_index[str(index)] = get_cell_or_module(model)
401
+ else:
402
+ cells_and_names_with_index["-1"] = get_cell_or_module(self.model)
403
+ return cells_and_names_with_index
404
+
405
+ def register_primitive_hook(self):
406
+ if self.config.level not in [Const.LEVEL_MIX, Const.LEVEL_L1]:
407
+ return
408
+ if not self.model or self.config.task not in Const.DUMP_DATA_COLLECTION_LIST:
409
+ return
410
+
411
+ primitive_set = set()
412
+ cells_and_names_with_index = self.get_cells_and_names()
413
+ for cells_and_names in cells_and_names_with_index.values():
414
+ for _, cell in cells_and_names:
415
+ for attribute, value in vars(cell).items():
416
+ if isinstance(value, Primitive):
417
+ primitive_set.add((attribute, value))
418
+
419
+ for pname, primitive in primitive_set:
420
+ primitive_class_name = primitive.__class__.__name__
421
+ primitive_combined_name = pname + Const.SEP + primitive_class_name
422
+ new_primitive = type('NewPrimitive', (primitive.__class__,),
423
+ {'__call__': self.primitive_hook_service.wrap_primitive(primitive.__call__,
424
+ primitive_combined_name)})
425
+ primitive.__class__ = new_primitive
426
+
427
+ def register_cell_hook(self):
283
428
  if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L0]:
429
+ logger.info(f"The cell {self.config.task} hook function is successfully mounted to the model.")
284
430
  if not self.model:
285
431
  raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
286
432
  f"The current level is {self.config.level}, the model cannot be None")
287
- for name, cell in self.model.cells_and_names():
288
- if cell == self.model:
289
- continue
290
- prefix = 'Cell' + Const.SEP + name + Const.SEP + \
291
- cell.__class__.__name__ + Const.SEP
292
- forward_hook, backward_hook = self.build_hook(BaseScope.Module_Type_Module, prefix)
293
- cell.register_forward_hook(forward_hook)
294
- cell.register_backward_hook(backward_hook)
295
-
296
- cell.register_forward_pre_hook(
297
- self.cell_processor.node_hook(prefix + Const.FORWARD, Const.START))
298
- cell.register_forward_hook(
299
- self.cell_processor.node_hook(prefix + Const.FORWARD, Const.STOP))
300
- cell.register_backward_pre_hook(
301
- self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.START))
302
- cell.register_backward_hook(
303
- self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))
433
+ model_type = Const.MODULE if is_mindtorch() else Const.CELL
434
+ cells_and_names_with_index = self.get_cells_and_names()
435
+
436
+ for index, cells_and_names in cells_and_names_with_index.items():
437
+ model = self.model if index == "-1" else self.model[int(index)]
438
+ for name, cell in cells_and_names:
439
+ if cell == model:
440
+ continue
441
+ cell_index = (index + Const.SEP) if index != "-1" else ""
442
+ prefix = (model_type + Const.SEP + cell_index + name +
443
+ Const.SEP + cell.__class__.__name__ + Const.SEP)
444
+ _, forward_hook, backward_hook, _ = self.build_hook(BaseScope.Module_Type_Module, prefix)
445
+ cell.register_forward_hook(forward_hook)
446
+ cell.register_forward_pre_hook(
447
+ self.cell_processor.node_hook(prefix + Const.FORWARD, Const.START))
448
+ cell.register_forward_hook(
449
+ self.cell_processor.node_hook(prefix + Const.FORWARD, Const.STOP))
450
+
451
+ register_backward_hook_functions["full"](cell, backward_hook)
452
+ register_backward_hook_functions["pre"](
453
+ cell, self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.START))
454
+ register_backward_hook_functions["full"](
455
+ cell, self.cell_processor.node_hook(prefix + Const.BACKWARD, Const.STOP))
456
+
457
+ def reset_status(self):
458
+ self.primitive_hook_service.primitive_counters.clear()
459
+ self.data_collector.data_writer.reset_cache()
460
+ JitDump.jit_count = defaultdict(int)
461
+ self.params_grad_info.clear()
462
+
463
+ if self.config.level == Const.LEVEL_L2:
464
+ self.data_collector.data_processor.reset_status()
465
+ return
466
+ if self.config.step and self.current_iter not in self.config.step:
467
+ return
468
+ if self.config.rank and self.current_rank not in self.config.rank:
469
+ return
msprobe/msprobe.py CHANGED
@@ -16,10 +16,12 @@
16
16
  import argparse
17
17
  import sys
18
18
  import importlib.util
19
- from msprobe.core.compare.utils import _compare_parser
19
+
20
+ from msprobe.core.common.const import Const
20
21
  from msprobe.core.common.log import logger
22
+ from msprobe.core.compare.utils import _compare_parser
21
23
  from msprobe.core.compare.compare_cli import compare_cli
22
- from msprobe.core.common.const import Const
24
+ from msprobe.core.compare.merge_result.merge_result_cli import _merge_result_parser, merge_result_cli
23
25
 
24
26
 
25
27
  def is_module_available(module_name):
@@ -45,10 +47,15 @@ def main():
45
47
  multi_run_ut_cmd_parser = subparsers.add_parser('multi_run_ut')
46
48
  api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
47
49
  run_overflow_check_cmd_parser = subparsers.add_parser('run_overflow_check')
50
+ code_mapping_cmd_parser = subparsers.add_parser('code_mapping')
48
51
  graph_service_cmd_parser = subparsers.add_parser('graph')
52
+ op_generate_cmd_parser = subparsers.add_parser('op_generate')
53
+ merge_result_parser = subparsers.add_parser('merge_result')
49
54
  _compare_parser(compare_cmd_parser)
55
+ _merge_result_parser(merge_result_parser)
56
+
50
57
  is_torch_available = is_module_available("torch")
51
- is_mindspore_available = is_module_available("mindspore")
58
+
52
59
  if len(sys.argv) < 4:
53
60
  parser.print_help()
54
61
  sys.exit(0)
@@ -62,6 +69,8 @@ def main():
62
69
  from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \
63
70
  _run_overflow_check_command
64
71
  from msprobe.visualization.graph_service import _pt_graph_service_parser, _pt_graph_service_command
72
+ from msprobe.pytorch.api_accuracy_checker.generate_op_script.op_generator import _op_generator_parser, \
73
+ _run_operator_generate_commond
65
74
 
66
75
  _run_ut_parser(run_ut_cmd_parser)
67
76
  _run_ut_parser(multi_run_ut_cmd_parser)
@@ -70,12 +79,15 @@ def main():
70
79
  _api_precision_compare_parser(api_precision_compare_cmd_parser)
71
80
  _run_overflow_check_parser(run_overflow_check_cmd_parser)
72
81
  _pt_graph_service_parser(graph_service_cmd_parser)
82
+ _op_generator_parser(op_generate_cmd_parser)
73
83
  elif framework_args.framework == Const.MS_FRAMEWORK:
74
84
  from msprobe.mindspore.api_accuracy_checker.cmd_parser import add_api_accuracy_checker_argument
75
85
  from msprobe.visualization.graph_service import _ms_graph_service_parser, _ms_graph_service_command
76
86
  add_api_accuracy_checker_argument(run_ut_cmd_parser)
77
87
  from msprobe.mindspore.api_accuracy_checker.cmd_parser import multi_add_api_accuracy_checker_argument
78
88
  multi_add_api_accuracy_checker_argument(multi_run_ut_cmd_parser)
89
+ from msprobe.mindspore.code_mapping.cmd_parser import add_ir_parser_arguments
90
+ add_ir_parser_arguments(code_mapping_cmd_parser)
79
91
 
80
92
  _ms_graph_service_parser(graph_service_cmd_parser)
81
93
 
@@ -97,17 +109,23 @@ def main():
97
109
  _run_overflow_check_command(args)
98
110
  elif sys.argv[3] == "graph":
99
111
  _pt_graph_service_command(args)
112
+ elif sys.argv[3] == 'op_generate':
113
+ _run_operator_generate_commond(args)
100
114
  elif sys.argv[3] == "compare":
101
115
  if args.cell_mapping is not None or args.api_mapping is not None:
102
116
  logger.error("Argument -cm or -am is not supported in PyTorch framework")
103
117
  raise Exception("Argument -cm or -am is not supported in PyTorch framework")
104
118
  compare_cli(args)
119
+ elif sys.argv[3] == "merge_result":
120
+ merge_result_cli(args)
105
121
  else:
106
122
  if not is_module_available(Const.MS_FRAMEWORK):
107
123
  logger.error("MindSpore does not exist, please install MindSpore library")
108
124
  raise Exception("MindSpore does not exist, please install MindSpore library")
109
125
  if sys.argv[3] == "compare":
110
126
  compare_cli(args)
127
+ elif sys.argv[3] == "merge_result":
128
+ merge_result_cli(args)
111
129
  elif sys.argv[3] == "run_ut":
112
130
  from msprobe.mindspore.api_accuracy_checker.main import api_checker_main
113
131
  api_checker_main(args)
@@ -116,6 +134,9 @@ def main():
116
134
  mul_api_checker_main(args)
117
135
  elif sys.argv[3] == "graph":
118
136
  _ms_graph_service_command(args)
137
+ elif sys.argv[3] == "code_mapping":
138
+ from msprobe.mindspore.code_mapping.main import code_mapping_main
139
+ code_mapping_main(args)
119
140
 
120
141
 
121
142
  if __name__ == "__main__":
@@ -1,6 +1,4 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
4
2
  # All rights reserved.
5
3
  #
6
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,9 +14,12 @@
16
14
  # limitations under the License.
17
15
 
18
16
 
19
- from msprobe.pytorch.monitor.module_hook import TrainerMon
17
+ import torch
20
18
  from .compare.distributed_compare import compare_distributed
21
19
  from .compare.pt_compare import compare
22
20
  from .common.utils import seed_all
23
- from .debugger.precision_debugger import PrecisionDebugger
24
- from .functional.module_dump import module_dump, module_dump_end
21
+ from .debugger.precision_debugger import PrecisionDebugger, module_dump, module_dump_end
22
+
23
+ torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
24
+ if torch_version_above_or_equal_2:
25
+ from msprobe.pytorch.monitor.module_hook import TrainerMon