PyPI - mindstudio-probe - Versions diffs - 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197) hide show

{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +3 -2
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/RECORD +196 -141
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +14 -19
msprobe/config.json +1 -0
msprobe/core/common/const.py +155 -6
msprobe/core/common/exceptions.py +3 -1
msprobe/core/common/file_utils.py +33 -7
msprobe/core/common/inplace_ops.yaml +3 -0
msprobe/core/common/utils.py +28 -14
msprobe/core/common_config.py +6 -0
msprobe/core/compare/acc_compare.py +139 -128
msprobe/core/compare/check.py +31 -29
msprobe/core/compare/compare_cli.py +17 -16
msprobe/core/compare/highlight.py +186 -99
msprobe/core/compare/layer_mapping/data_scope_parser.py +18 -7
msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
msprobe/core/compare/merge_result/merge_result.py +380 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +109 -147
msprobe/core/compare/utils.py +189 -69
msprobe/core/data_dump/data_collector.py +51 -21
msprobe/core/data_dump/data_processor/base.py +38 -20
msprobe/core/data_dump/data_processor/factory.py +5 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +154 -20
msprobe/core/data_dump/data_processor/pytorch_processor.py +118 -58
msprobe/core/data_dump/json_writer.py +29 -1
msprobe/core/data_dump/scope.py +19 -18
msprobe/core/overflow_check/abnormal_scene.py +9 -5
msprobe/core/overflow_check/checker.py +1 -1
msprobe/core/overflow_check/utils.py +1 -1
msprobe/docs/01.installation.md +96 -17
msprobe/docs/02.config_introduction.md +5 -5
msprobe/docs/05.data_dump_PyTorch.md +91 -61
msprobe/docs/06.data_dump_MindSpore.md +57 -19
msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
msprobe/docs/09.accuracy_checker_MindSpore.md +4 -4
msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +120 -27
msprobe/docs/21.visualization_PyTorch.md +115 -35
msprobe/docs/22.visualization_MindSpore.md +138 -41
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +521 -0
msprobe/docs/FAQ.md +26 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +10 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +57 -25
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +5 -7
msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
msprobe/mindspore/api_accuracy_checker/main.py +1 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +3 -1
msprobe/mindspore/common/utils.py +50 -5
msprobe/mindspore/compare/distributed_compare.py +0 -2
msprobe/mindspore/compare/ms_compare.py +105 -63
msprobe/mindspore/compare/ms_graph_compare.py +14 -5
msprobe/mindspore/debugger/debugger_config.py +3 -0
msprobe/mindspore/debugger/precision_debugger.py +81 -12
msprobe/mindspore/dump/hook_cell/api_registry.py +83 -16
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
msprobe/mindspore/grad_probe/hook.py +13 -4
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/ms_config.py +5 -1
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
msprobe/mindspore/service.py +267 -101
msprobe/msprobe.py +24 -3
msprobe/pytorch/__init__.py +7 -6
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +54 -30
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
msprobe/pytorch/common/parse_json.py +2 -1
msprobe/pytorch/common/utils.py +45 -2
msprobe/pytorch/compare/distributed_compare.py +17 -29
msprobe/pytorch/compare/pt_compare.py +40 -20
msprobe/pytorch/debugger/debugger_config.py +27 -12
msprobe/pytorch/debugger/precision_debugger.py +42 -12
msprobe/pytorch/dump/module_dump/__init__.py +0 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +80 -6
msprobe/pytorch/free_benchmark/common/params.py +2 -1
msprobe/pytorch/free_benchmark/common/utils.py +3 -0
msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +34 -0
msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -40
msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
msprobe/pytorch/monitor/anomaly_detect.py +107 -22
msprobe/pytorch/monitor/csv2tb.py +166 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
msprobe/pytorch/monitor/features.py +3 -3
msprobe/pytorch/monitor/module_hook.py +483 -277
msprobe/pytorch/monitor/module_metric.py +27 -48
msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
msprobe/pytorch/monitor/optimizer_collect.py +52 -14
msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
msprobe/pytorch/monitor/utils.py +77 -6
msprobe/pytorch/online_dispatch/dispatch.py +8 -2
msprobe/pytorch/parse_tool/lib/compare.py +10 -10
msprobe/pytorch/parse_tool/lib/config.py +5 -7
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
msprobe/pytorch/parse_tool/lib/utils.py +18 -19
msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
msprobe/pytorch/service.py +176 -106
msprobe/visualization/builder/graph_builder.py +62 -5
msprobe/visualization/builder/msprobe_adapter.py +24 -2
msprobe/visualization/compare/graph_comparator.py +64 -14
msprobe/visualization/compare/mode_adapter.py +1 -15
msprobe/visualization/graph/base_node.py +12 -17
msprobe/visualization/graph/distributed_analyzer.py +318 -0
msprobe/visualization/graph/graph.py +9 -0
msprobe/visualization/graph_service.py +97 -23
msprobe/visualization/utils.py +14 -29
msprobe/pytorch/functional/module_dump.py +0 -84
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
/msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
/msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0

msprobe/pytorch/compare/pt_compare.py CHANGED Viewed

@@ -14,19 +14,29 @@
 # limitations under the License.
 import os.path
 import torch
 from msprobe.core.common.const import FileCheckConst
-from msprobe.pytorch.common.log import logger
 from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.compare.acc_compare import Comparator
-from msprobe.core.common.utils import check_configuration_param, check_compare_param, \
-    CompareException, set_dump_path, get_dump_mode
 from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml
+from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
+    set_dump_path
+from msprobe.core.compare.acc_compare import Comparator, ModeConfig
+from msprobe.core.compare.utils import set_stack_json_path
+from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import load_pt
-class PTComparator (Comparator):
-    def __init__(self, data_mapping=None):
+class PTComparator(Comparator):
+    def __init__(self, mode_config, data_mapping=None):
+        super().__init__(mode_config)
+        self.stack_mode = mode_config.stack_mode
+        self.auto_analyze = mode_config.auto_analyze
+        self.fuzzy_match = mode_config.fuzzy_match
+        self.dump_mode = mode_config.dump_mode
         self.frame_name = PTComparator.__name__
         self.data_mapping = data_mapping
         if isinstance(self.data_mapping, str) or self.data_mapping is None:
@@ -37,23 +47,24 @@ class PTComparator (Comparator):
             raise TypeError(f"The type of parameter `data_mapping` must be dict, str or None, but got "
                             f"{type(self.data_mapping)}")
-    def load_mapping_file(self, mapping_file):
+    @staticmethod
+    def load_mapping_file(mapping_file):
         if isinstance(mapping_file, str):
             mapping_dict = load_yaml(mapping_file)
         else:
             mapping_dict = {}
         return mapping_dict
     def read_npy_data(self, dir_path, file_name):
         if not file_name:
             return None
         data_path = os.path.join(dir_path, file_name)
         path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
-                                FileCheckConst.PT_SUFFIX, False)
+                                   FileCheckConst.PT_SUFFIX, False)
         data_path = path_checker.common_check()
         try:
-            data_value = load_pt(data_path,
-                                 to_cpu=True).detach()  # detach because numpy can not process gradient information
+            # detach because numpy can not process gradient information
+            data_value = load_pt(data_path, to_cpu=True).detach()
         except RuntimeError as e:
             # 这里捕获 load_pt 中抛出的异常
             logger.error(f"Failed to load the .pt file at {data_path}.")
@@ -65,20 +76,29 @@ class PTComparator (Comparator):
         if data_value.dtype == torch.bfloat16:
             data_value = data_value.to(torch.float32)
         data_value = data_value.numpy()
-        return data_value
-def compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False, **kwargs):
+        return data_value
+def compare(input_param, output_path, **kwargs):
     try:
+        auto_analyze = kwargs.get('auto_analyze', True)
+        fuzzy_match = kwargs.get('fuzzy_match', False)
+        data_mapping = kwargs.get('data_mapping', None)
+        suffix = kwargs.get('suffix', '')
         set_dump_path(input_param)
         dump_mode = get_dump_mode(input_param)
+        if "stack_json_path" in input_param:
+            stack_mode = kwargs.get('stack_mode', False)
+        else:
+            stack_mode = set_stack_json_path(input_param)  # set stack_mode and set "stack_json_path" in input_param
         check_configuration_param(stack_mode, auto_analyze, fuzzy_match, input_param.get('is_print_compare_log', True))
         create_directory(output_path)
-        check_compare_param(input_param, output_path, dump_mode)
-        data_mapping = kwargs.get('data_mapping', None)
+        check_compare_param(input_param, output_path, dump_mode, stack_mode)
     except (CompareException, FileCheckException) as error:
         logger.error('Compare failed. Please check the arguments and do it again!')
         raise CompareException(error.code) from error
-    pt_comparator = PTComparator(data_mapping)
-    pt_comparator.compare_core(input_param, output_path, stack_mode=stack_mode,
-                 auto_analyze=auto_analyze, fuzzy_match=fuzzy_match, dump_mode=dump_mode)
+    mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+    pt_comparator = PTComparator(mode_config, data_mapping)
+    pt_comparator.compare_core(input_param, output_path, suffix=suffix)

msprobe/pytorch/debugger/debugger_config.py CHANGED Viewed

@@ -34,6 +34,7 @@ class DebuggerConfig:
         self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS
         self.overflow_nums = task_config.overflow_nums if task_config.overflow_nums else 1
         self.framework = Const.PT_FRAMEWORK
+        self.async_dump = common_config.async_dump if common_config.async_dump else False
         if self.level == Const.LEVEL_L2:
             self.is_backward_kernel_dump = False
@@ -74,29 +75,43 @@ class DebuggerConfig:
         if not self.dump_path:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The dump_path not found.")
+        if not isinstance(self.async_dump, bool):
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                   f"The parameters async_dump should be bool.")
     def check(self):
         self.check_kwargs()
         return True
     def check_model(self, instance, start_model):
-        if self.level not in ["L0", "mix"]:
+        if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             if instance.model is not None or start_model is not None:
-                logger.warning_on_rank_0(
+                logger.info_on_rank_0(
                     f"The current level is not L0 or mix level, so the model parameters will not be used.")
             return
-        if start_model is None:
-            if instance.model is None:
-                logger.error_on_rank_0(
-                    f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' argument.")
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")
-            return
-        if isinstance(start_model, torch.nn.Module):
-            instance.model = start_model
+        if start_model is None and instance.model is None:
+            logger.error_on_rank_0(
+                f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' parameter.")
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")
+        instance.model = start_model if start_model is not None else instance.model
+        if isinstance(instance.model, torch.nn.Module):
+            return
+        error_model = None
+        if isinstance(instance.model, (list, tuple)):
+            for model in instance.model:
+                if not isinstance(model, torch.nn.Module):
+                    error_model = model
+                    break
         else:
-            logger.error_on_rank_0(f"The 'model' parameter of start must be a torch.nn.Module type.")
+            error_model = instance.model
+        if error_model is not None:
+            error_info = (f"The 'model' parameter must be a torch.nn.Moudle or list[torch.nn.Moudle] "
+                          f"type, currently there is a {type(error_model)} type.")
             raise MsprobeException(
-                MsprobeException.INVALID_PARAM_ERROR, f"model must be a torch.nn.Module")
+                MsprobeException.INVALID_PARAM_ERROR, error_info)
     def _check_and_adjust_config_with_l2(self):
         if self.scope:

msprobe/pytorch/debugger/precision_debugger.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -22,6 +22,7 @@ from msprobe.core.common.file_utils import FileChecker
 from msprobe.core.common.utils import get_real_step_or_rank
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.debugger.debugger_config import DebuggerConfig
+from msprobe.pytorch.dump.module_dump.module_dump import ModuleDumper
 from msprobe.pytorch.grad_probe.grad_monitor import GradientMonitor
 from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.pytorch.service import Service
@@ -49,7 +50,7 @@ class PrecisionDebugger:
         dump_path=None,
         level=None,
         model=None,
-        step=None,
+        step=None
     ):
         if not hasattr(self, "initialized"):
             config_params = ConfigParameters(config_path,
@@ -59,7 +60,6 @@ class PrecisionDebugger:
                                              model)
             self.check_input_params(config_params)
-            self.api_origin = False
             self.initialized = True
             self.model = model
             common_config, task_config = parse_json_config(config_path, task)
@@ -67,12 +67,13 @@ class PrecisionDebugger:
             if self.task == Const.GRAD_PROBE:
                 self.gm = GradientMonitor(common_config, task_config)
                 return
-            if step:
+            if step is not None:
                 common_config.step = get_real_step_or_rank(step, Const.STEP)
             self.config = DebuggerConfig(
                 common_config, task_config, task, dump_path, level
             )
             self.service = Service(self.config)
+            self.module_dumper = ModuleDumper(self.service)
             self.enable_dataloader = self.config.enable_dataloader
             if self.enable_dataloader:
                 logger.warning_on_rank_0("The enable_dataloader feature will be deprecated in the future.")
@@ -105,9 +106,11 @@ class PrecisionDebugger:
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, f"level must be one of {Const.LEVEL_LIST}")
-        if args.model is not None and not isinstance(args.model, torch.nn.Module):
-            raise MsprobeException(
-                MsprobeException.INVALID_PARAM_ERROR, f"model must be a torch.nn.Module")
+        if args.model is not None:
+            logger.warning_on_rank_0(
+                "The 'model' parameter in the PrecisionDebugger will be deprecated in the future."
+                "It is recommended to pass the 'model' parameter in the start interface instead."
+            )
     @classmethod
     def start(cls, model=None):
@@ -120,15 +123,12 @@ class PrecisionDebugger:
         if instance.enable_dataloader:
             logger.warning_on_rank_0("DataLoader is enabled, start() skipped.")
         else:
-            instance.service.start(instance.model, instance.api_origin)
-            instance.api_origin = False
+            instance.service.start(instance.model)
-    # 指定代码段dump前反向结束符，之后的计算过程数据将被忽略，无法被dump
     @classmethod
     def forward_backward_dump_end(cls):
         instance = cls._instance
-        instance.service.forward_backward_dump_end()
-        instance.api_origin = True
+        instance.stop()
     @classmethod
     def stop(cls):
@@ -159,6 +159,36 @@ class PrecisionDebugger:
         cls._instance.gm.monitor(model)
+def module_dump(module, dump_name):
+    if not isinstance(module, torch.nn.Module):
+        raise MsprobeException(
+            MsprobeException.INVALID_PARAM_ERROR,
+            f"the module argument in module_dump must be a torch.nn.Module subclass"
+        )
+    if not isinstance(dump_name, str):
+        raise MsprobeException(
+            MsprobeException.INVALID_PARAM_ERROR,
+            f"the dump_name argument in module_dump must be a str type"
+        )
+    instance = PrecisionDebugger._instance
+    if not instance:
+        raise MsprobeException(
+            MsprobeException.INTERFACE_USAGE_ERROR,
+            f"PrecisionDebugger must be instantiated before using module_dump interface"
+        )
+    instance.module_dumper.start_module_dump(module, dump_name)
+def module_dump_end():
+    instance = PrecisionDebugger._instance
+    if not instance:
+        raise MsprobeException(
+            MsprobeException.INTERFACE_USAGE_ERROR,
+            f"PrecisionDebugger must be instantiated before using module_dump_end interface"
+        )
+    instance.module_dumper.stop_module_dump()
 def iter_tracer(func):
     def func_wrapper(*args, **kwargs):
         debugger_instance = PrecisionDebugger.instance

msprobe/pytorch/dump/module_dump/__init__.py ADDED Viewed

File without changes

msprobe/pytorch/dump/module_dump/module_dump.py ADDED Viewed

@@ -0,0 +1,86 @@
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from msprobe.core.common.const import Const
+from msprobe.core.data_dump.scope import BaseScope
+from msprobe.pytorch.common.log import logger
+from msprobe.pytorch.hook_module.api_registry import api_register
+torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
+class ModuleDumper:
+    def __init__(self, service):
+        self.service = service
+        self.hook_handle_list = []
+    def start_module_dump(self, module, dump_name):
+        api_register.api_originality()
+        self.register_hook(module, dump_name)
+    def stop_module_dump(self):
+        api_register.api_modularity()
+        for hook_handle in self.hook_handle_list:
+            if isinstance(hook_handle, torch.utils.hooks.RemovableHandle):
+                hook_handle.remove()
+        self.hook_handle_list.clear()
+    def register_hook(self, module, dump_name):
+        prefix_name = (
+                BaseScope.Module_Type_Module + Const.SEP +
+                dump_name + Const.SEP +
+                module.__class__.__name__ + Const.SEP
+        )
+        module_processor = self.service.module_processor
+        _, forward_hook, backward_hook, forward_hook_torch_version_below_2 = self.service.build_hook(
+            BaseScope.Module_Type_Module,
+            prefix_name
+        )
+        if module_processor.has_register_backward_hook(module):
+            logger.warning(
+                f"The {dump_name} module has registered deprecated register_backward_hook,"
+                f"which may cause abnormal data dump. The backward data dump for this module will be skipped."
+            )
+        if torch_version_above_or_equal_2:
+            forward_hook_handle = module.register_forward_hook(forward_hook, with_kwargs=True)
+        else:
+            if not module_processor.has_register_backward_hook(module):
+                backward_hook_handle = module.register_full_backward_hook(
+                    module_processor.node_hook(prefix_name + Const.BACKWARD, Const.STOP)
+                )
+                self.hook_handle_list.append(backward_hook_handle)
+            forward_hook_handle = module.register_forward_hook(forward_hook_torch_version_below_2)
+        self.hook_handle_list.append(forward_hook_handle)
+        if not module_processor.has_register_backward_hook(module):
+            backward_hook_handle = module.register_full_backward_hook(backward_hook)
+            self.hook_handle_list.append(backward_hook_handle)
+        forward_pre_hook_handle = module.register_forward_pre_hook(
+            module_processor.node_hook(prefix_name + Const.FORWARD, Const.START)
+        )
+        forward_hook_handle = module.register_forward_hook(
+            module_processor.node_hook(prefix_name + Const.FORWARD, Const.STOP)
+        )
+        self.hook_handle_list.extend([forward_pre_hook_handle, forward_hook_handle])
+        if torch_version_above_or_equal_2 and not module_processor.has_register_backward_hook(module):
+            backward_pre_hook_handle = module.register_full_backward_pre_hook(
+                module_processor.node_hook(prefix_name + Const.BACKWARD, Const.START)
+            )
+            backward_hook_handle = module.register_full_backward_hook(
+                module_processor.node_hook(prefix_name + Const.BACKWARD, Const.STOP)
+            )
+            self.hook_handle_list.extend([backward_pre_hook_handle, backward_hook_handle])

msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} RENAMED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -17,12 +17,24 @@ from functools import wraps
 import torch
 from msprobe.core.common.const import Const
-from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope
+from msprobe.core.data_dump.scope import BaseScope, ModuleRangeScope, MixRangeScope
+from msprobe.pytorch.common.log import logger
+from torch.utils.checkpoint import checkpoint as origin_checkpoint
+from torch.utils.checkpoint import set_checkpoint_early_stop
 from torch.utils.hooks import BackwardHook
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
+def checkpoint_without_early_stop(*args, **kwargs):
+    with set_checkpoint_early_stop(False):
+        return origin_checkpoint(*args, **kwargs)
+def replace_checkpoint():
+    torch.utils.checkpoint.checkpoint = checkpoint_without_early_stop
 class ModuleProcesser:
     module_count = {}
     module_stack = []
@@ -34,6 +46,7 @@ class ModuleProcesser:
         BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook)
         BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook)
         BackwardHook.setup_output_hook = ModuleProcesser.filter_tensor_and_tuple(BackwardHook.setup_output_hook)
+        replace_checkpoint()
     @staticmethod
     def filter_tensor_and_tuple(func):
@@ -63,7 +76,7 @@ class ModuleProcesser:
             return ModuleProcesser.clone_if_tensor(result)
         return clone_return_value_func
     @staticmethod
     def clone_if_tensor(result):
         if isinstance(result, torch.Tensor):
@@ -85,6 +98,22 @@ class ModuleProcesser:
             ModuleProcesser.module_count[module_name] += 1
         return ModuleProcesser.module_count[module_name]
+    @staticmethod
+    def has_register_backward_hook(module):
+        return hasattr(module, '_backward_hooks') and \
+            len(module._backward_hooks) > 0 and \
+            module._is_full_backward_hook is False
+    @staticmethod
+    def get_modules_and_names(models):
+        modules_and_names_with_index = {}
+        if isinstance(models, (list, tuple)):
+            for index, model in enumerate(models):
+                modules_and_names_with_index[str(index)] = model.named_modules()
+        else:
+            modules_and_names_with_index["-1"] = models.named_modules()
+        return modules_and_names_with_index
     @classmethod
     def reset_module_stats(cls):
         cls.module_count = {}
@@ -92,6 +121,42 @@ class ModuleProcesser:
         cls.api_parent_node = ""
         cls.module_node = {}
+    def register_module_hook(self, models, build_hook):
+        logger.info_on_rank_0("The init dump is enabled, and the module dump function will not be available.")
+        modules_and_names_with_index = self.get_modules_and_names(models)
+        for index, modules_and_names in modules_and_names_with_index.items():
+            model = models if index == "-1" else models[int(index)]
+            for name, module in modules_and_names:
+                if module == model:
+                    continue
+                module_index = (index + Const.SEP) if index != "-1" else ""
+                prefix_name = (BaseScope.Module_Type_Module + Const.SEP + module_index +
+                                name + Const.SEP + module.__class__.__name__ + Const.SEP)
+                pre_forward_hook, forward_hook, backward_hook, forward_hook_torch_version_below_2 = build_hook(
+                    BaseScope.Module_Type_Module,
+                    prefix_name
+                )
+                if self.has_register_backward_hook(module):
+                    logger.warning(
+                        f"The {prefix_name[:-1]} has registered deprecated register_backward_hook,"
+                        f"which may cause abnormal data dump. The backward data dump for this module will be skipped."
+                    )
+                if torch_version_above_or_equal_2:
+                    module.register_forward_hook(forward_hook, with_kwargs=True)
+                else:
+                    if not self.has_register_backward_hook(module):
+                        module.register_full_backward_hook(self.node_hook(prefix_name + Const.BACKWARD, Const.STOP))
+                    module.register_forward_hook(forward_hook_torch_version_below_2)
+                if not self.has_register_backward_hook(module):
+                    module.register_full_backward_hook(backward_hook)
+                module.register_forward_pre_hook(self.node_hook(prefix_name + Const.FORWARD, Const.START))
+                module.register_forward_hook(self.node_hook(prefix_name + Const.FORWARD, Const.STOP))
+                if torch_version_above_or_equal_2 and not self.has_register_backward_hook(module):
+                    module.register_full_backward_pre_hook(self.node_hook(prefix_name + Const.BACKWARD, Const.START))
+                    module.register_full_backward_hook(self.node_hook(prefix_name + Const.BACKWARD, Const.STOP))
     def node_hook(self, name_prefix, start_or_stop, **kwargs):
         def pre_hook(module, input, output=None):
@@ -100,7 +165,10 @@ class ModuleProcesser:
             except IndexError as e:
                 index = None
                 pass
-            module.mindstudio_reserved_name = full_name = name_prefix + Const.SEP + str(index)
+            full_name = name_prefix + Const.SEP + str(index)
+            if not hasattr(module, "mindstudio_reserved_name") or not module.mindstudio_reserved_name:
+                module.mindstudio_reserved_name = []
+            module.mindstudio_reserved_name.append(full_name)
             if self.module_stack:
                 ModuleProcesser.module_node[full_name] = self.module_stack[-1]
             else:
@@ -119,8 +187,11 @@ class ModuleProcesser:
                 ModuleProcesser.api_parent_node = self.module_stack[-1]
             else:
                 ModuleProcesser.api_parent_node = None
+            if not hasattr(module, "mindstudio_reserved_name") or not module.mindstudio_reserved_name:
+                raise RuntimeError(f"module reserve name is None when pop")
+            current_name = module.mindstudio_reserved_name.pop()
             if self.scope:
-                self.scope.end_module(module.mindstudio_reserved_name)
+                self.scope.end_module(current_name)
         def backward_hook(module, input, output=None):
             try:
@@ -128,7 +199,10 @@ class ModuleProcesser:
             except IndexError as e:
                 index = None
                 pass
-            module.mindstudio_reserved_name = full_name = name_prefix + Const.SEP + str(index)
+            full_name = name_prefix + Const.SEP + str(index)
+            if not hasattr(module, "mindstudio_reserved_name") or not module.mindstudio_reserved_name:
+                module.mindstudio_reserved_name = []
+            module.mindstudio_reserved_name.append(full_name)
             forward_full_name = full_name.replace(Const.BACKWARD, Const.FORWARD)
             ModuleProcesser.module_node[full_name] = ModuleProcesser.module_node[forward_full_name].replace(
                 Const.FORWARD, Const.BACKWARD) if ModuleProcesser.module_node[forward_full_name] else None

msprobe/pytorch/free_benchmark/common/params.py CHANGED Viewed

@@ -39,7 +39,6 @@ class DataParams:
     origin_func: Optional[Callable] = None
     api_type: Optional[str] = None
     fuzz_stage: Optional[str] = None
-    grad_unequal_flag: Optional[bool] = True
 @dataclass
@@ -127,6 +126,8 @@ def make_unequal_row(
     )
     if isinstance(ratio, float):
         row.max_rel = ratio - 1
+    if isinstance(ratio, str):
+        row.max_rel = ratio
     origin_tensor = data_params.original_result
     perturbed_tensor = data_params.perturbed_result
     if index is not None:

msprobe/pytorch/free_benchmark/common/utils.py CHANGED Viewed

@@ -124,6 +124,7 @@ class TorchC:
     abs = torch._C._VariableFunctionsClass.abs
     where = torch._C._VariableFunctionsClass.where
     div = torch._C._VariableFunctionsClass.div
+    mul = torch._C._VariableFunctionsClass.mul
     max = torch._C._VariableFunctionsClass.max
     min = torch._C._VariableFunctionsClass.min
     gt = torch._C._VariableFunctionsClass.gt
@@ -138,3 +139,5 @@ class TorchC:
     tensor_split = torch._C._VariableFunctionsClass.tensor_split
     stack = torch._C._VariableFunctionsClass.stack
     reshape = torch._C._VariableFunctionsClass.reshape
+    nan_to_num = torch._C._VariableFunctionsClass.nan_to_num
+    aminmax = torch._C._VariableFunctionsClass.aminmax

msprobe/pytorch/free_benchmark/compare/grad_saver.py CHANGED Viewed

@@ -82,13 +82,11 @@ class GradSaver:
         data_params = DataParams()
         data_params.original_result = origin_grad
         data_params.perturbed_result = perturbed_grad
-        data_params.grad_unequal_flag = False
         data_params.valid_input_index = index
         try:
             handler.handle(data_params)
             if not data_params.is_consistent:
                 self.is_compare = False
-                data_params.grad_unequal_flag = True
                 data_params.is_consistent = True
                 data_params.perturbed_result = self.perturbed_grad_input
                 data_params.original_result = self.origin_grad_input

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl