mindstudio-probe 1.1.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in one of the supported public registries. It is provided for informational purposes only.
Files changed (226)
  1. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/METADATA +3 -2
  2. mindstudio_probe-1.2.2.dist-info/RECORD +415 -0
  3. msprobe/CMakeLists.txt +5 -0
  4. msprobe/README.md +16 -21
  5. msprobe/config.json +1 -0
  6. msprobe/core/common/const.py +185 -11
  7. msprobe/core/common/exceptions.py +3 -1
  8. msprobe/core/common/file_utils.py +33 -7
  9. msprobe/core/common/inplace_ops.yaml +4 -0
  10. msprobe/core/common/utils.py +42 -14
  11. msprobe/core/common_config.py +6 -0
  12. msprobe/core/compare/acc_compare.py +139 -128
  13. msprobe/core/compare/check.py +31 -29
  14. msprobe/core/compare/compare_cli.py +17 -16
  15. msprobe/core/compare/highlight.py +186 -99
  16. msprobe/core/compare/layer_mapping/data_scope_parser.py +19 -8
  17. msprobe/core/compare/layer_mapping/layer_mapping.py +21 -14
  18. msprobe/core/compare/layer_mapping/postprocess_pass.py +4 -3
  19. msprobe/core/compare/merge_result/merge_result.py +381 -0
  20. msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
  21. msprobe/core/compare/merge_result/utils.py +81 -0
  22. msprobe/core/compare/multiprocessing_compute.py +2 -2
  23. msprobe/core/compare/npy_compare.py +109 -147
  24. msprobe/core/compare/utils.py +199 -69
  25. msprobe/core/data_dump/data_collector.py +100 -25
  26. msprobe/core/data_dump/data_processor/base.py +130 -28
  27. msprobe/core/data_dump/data_processor/factory.py +8 -3
  28. msprobe/core/data_dump/data_processor/mindspore_processor.py +170 -23
  29. msprobe/core/data_dump/data_processor/pytorch_processor.py +175 -64
  30. msprobe/core/data_dump/json_writer.py +54 -8
  31. msprobe/core/data_dump/scope.py +19 -18
  32. msprobe/core/overflow_check/abnormal_scene.py +9 -5
  33. msprobe/core/overflow_check/checker.py +1 -1
  34. msprobe/core/overflow_check/utils.py +1 -1
  35. msprobe/docs/01.installation.md +121 -17
  36. msprobe/docs/02.config_introduction.md +18 -16
  37. msprobe/docs/03.config_examples.md +24 -0
  38. msprobe/docs/05.data_dump_PyTorch.md +107 -58
  39. msprobe/docs/06.data_dump_MindSpore.md +95 -34
  40. msprobe/docs/07.accuracy_checker_PyTorch.md +18 -18
  41. msprobe/docs/09.accuracy_checker_MindSpore.md +8 -6
  42. msprobe/docs/10.accuracy_compare_PyTorch.md +99 -41
  43. msprobe/docs/11.accuracy_compare_MindSpore.md +249 -48
  44. msprobe/docs/12.overflow_check_PyTorch.md +1 -1
  45. msprobe/docs/19.monitor.md +310 -220
  46. msprobe/docs/21.visualization_PyTorch.md +125 -35
  47. msprobe/docs/22.visualization_MindSpore.md +149 -41
  48. msprobe/docs/23.generate_operator_PyTorch.md +107 -0
  49. msprobe/docs/24.code_mapping_Mindspore.md +28 -0
  50. msprobe/docs/{23.tool_function_introduction.md → 25.tool_function_introduction.md} +1 -0
  51. msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
  52. msprobe/docs/27.dump_json_instruction.md +525 -0
  53. msprobe/docs/28.debugger_save_instruction.md +94 -0
  54. msprobe/docs/28.kernel_dump_MindSpore.md +69 -0
  55. msprobe/docs/FAQ.md +26 -2
  56. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
  57. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
  58. msprobe/docs/img/merge_result.png +0 -0
  59. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  60. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  61. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  62. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  63. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  64. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  65. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  66. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  67. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  68. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  69. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  70. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  71. msprobe/docs/visualization/GPTModel.png +0 -0
  72. msprobe/docs/visualization/ParallelMLP.png +0 -0
  73. msprobe/docs/visualization/layer_mapping_example.md +132 -0
  74. msprobe/docs/visualization/mapping.png +0 -0
  75. msprobe/docs/visualization/mapping1.png +0 -0
  76. msprobe/docs/visualization/module_name.png +0 -0
  77. msprobe/docs/visualization/module_name1.png +0 -0
  78. msprobe/docs/visualization/no_mapping.png +0 -0
  79. msprobe/docs/visualization/no_mapping1.png +0 -0
  80. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  81. msprobe/docs/visualization/top_layer.png +0 -0
  82. msprobe/mindspore/__init__.py +11 -0
  83. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +80 -28
  84. msprobe/mindspore/api_accuracy_checker/api_runner.py +54 -16
  85. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +2 -1
  86. msprobe/mindspore/api_accuracy_checker/compute_element.py +52 -8
  87. msprobe/mindspore/api_accuracy_checker/data_manager.py +37 -0
  88. msprobe/mindspore/api_accuracy_checker/main.py +1 -0
  89. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +12 -6
  90. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +3 -1
  91. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +129 -0
  92. msprobe/mindspore/api_accuracy_checker/type_mapping.py +24 -1
  93. msprobe/mindspore/api_accuracy_checker/utils.py +6 -1
  94. msprobe/mindspore/code_mapping/bind.py +264 -0
  95. msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
  96. msprobe/mindspore/code_mapping/graph.py +49 -0
  97. msprobe/mindspore/code_mapping/graph_parser.py +226 -0
  98. msprobe/mindspore/code_mapping/main.py +24 -0
  99. msprobe/mindspore/code_mapping/processor.py +34 -0
  100. msprobe/mindspore/common/const.py +3 -1
  101. msprobe/mindspore/common/utils.py +68 -5
  102. msprobe/mindspore/compare/distributed_compare.py +0 -2
  103. msprobe/mindspore/compare/ms_compare.py +105 -63
  104. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  105. msprobe/mindspore/debugger/debugger_config.py +28 -2
  106. msprobe/mindspore/debugger/precision_debugger.py +100 -12
  107. msprobe/mindspore/dump/hook_cell/api_registry.py +85 -16
  108. msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
  109. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15
  110. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1
  111. msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
  112. msprobe/mindspore/dump/jit_dump.py +7 -6
  113. msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
  114. msprobe/mindspore/dump/kernel_graph_dump.py +7 -0
  115. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +13 -4
  116. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
  117. msprobe/mindspore/grad_probe/grad_analyzer.py +24 -12
  118. msprobe/mindspore/grad_probe/hook.py +13 -4
  119. msprobe/mindspore/mindtorch/__init__.py +18 -0
  120. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
  121. msprobe/mindspore/monitor/anomaly_detect.py +404 -0
  122. msprobe/mindspore/monitor/distributed/__init__.py +0 -0
  123. msprobe/mindspore/monitor/distributed/distributed_ops.yaml +15 -0
  124. msprobe/mindspore/monitor/distributed/stack_blacklist.yaml +5 -0
  125. msprobe/mindspore/monitor/distributed/wrap_distributed.py +300 -0
  126. msprobe/mindspore/monitor/features.py +63 -0
  127. msprobe/mindspore/monitor/module_hook.py +821 -0
  128. msprobe/mindspore/monitor/module_spec_verifier.py +94 -0
  129. msprobe/mindspore/monitor/utils.py +267 -0
  130. msprobe/mindspore/ms_config.py +13 -3
  131. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +7 -0
  132. msprobe/mindspore/service.py +347 -107
  133. msprobe/msprobe.py +24 -3
  134. msprobe/pytorch/__init__.py +7 -7
  135. msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
  136. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
  137. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +100 -267
  138. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
  139. msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
  140. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
  141. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
  142. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
  143. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +55 -31
  144. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
  145. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
  146. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
  147. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
  148. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
  149. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
  150. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
  151. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
  152. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
  153. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +57 -1
  154. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -1
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +42 -14
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +64 -19
  157. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +34 -4
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
  159. msprobe/pytorch/bench_functions/apply_adam.py +215 -0
  160. msprobe/pytorch/bench_functions/group_norm_silu.py +27 -0
  161. msprobe/pytorch/bench_functions/mish.py +21 -0
  162. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +44 -0
  163. msprobe/pytorch/bench_functions/npu_fusion_attention.py +42 -10
  164. msprobe/pytorch/bench_functions/sort_v2.py +21 -0
  165. msprobe/pytorch/common/parse_json.py +2 -1
  166. msprobe/pytorch/common/utils.py +116 -2
  167. msprobe/pytorch/compare/distributed_compare.py +17 -29
  168. msprobe/pytorch/compare/pt_compare.py +40 -20
  169. msprobe/pytorch/debugger/debugger_config.py +42 -17
  170. msprobe/pytorch/debugger/precision_debugger.py +56 -12
  171. msprobe/pytorch/dump/module_dump/__init__.py +0 -0
  172. msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
  173. msprobe/pytorch/dump/module_dump/module_processer.py +204 -0
  174. msprobe/pytorch/free_benchmark/common/params.py +2 -1
  175. msprobe/pytorch/free_benchmark/common/utils.py +3 -0
  176. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -2
  177. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +31 -47
  178. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
  179. msprobe/pytorch/function_factory.py +7 -1
  180. msprobe/pytorch/hook_module/__init__.py +1 -1
  181. msprobe/pytorch/hook_module/hook_module.py +14 -11
  182. msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
  183. msprobe/pytorch/hook_module/support_wrap_ops.yaml +36 -1
  184. msprobe/pytorch/hook_module/wrap_distributed.py +10 -8
  185. msprobe/pytorch/hook_module/wrap_functional.py +0 -40
  186. msprobe/pytorch/monitor/anomaly_analyse.py +1 -1
  187. msprobe/pytorch/monitor/anomaly_detect.py +98 -28
  188. msprobe/pytorch/monitor/csv2tb.py +164 -0
  189. msprobe/pytorch/monitor/distributed/wrap_distributed.py +25 -14
  190. msprobe/pytorch/monitor/features.py +3 -3
  191. msprobe/pytorch/monitor/module_hook.py +543 -318
  192. msprobe/pytorch/monitor/module_metric.py +27 -48
  193. msprobe/pytorch/monitor/module_spec_verifier.py +3 -1
  194. msprobe/pytorch/monitor/optimizer_collect.py +76 -56
  195. msprobe/pytorch/monitor/unittest/test_monitor.py +24 -9
  196. msprobe/pytorch/monitor/utils.py +84 -48
  197. msprobe/pytorch/online_dispatch/dispatch.py +8 -2
  198. msprobe/pytorch/parse_tool/lib/compare.py +10 -10
  199. msprobe/pytorch/parse_tool/lib/config.py +5 -7
  200. msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
  201. msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
  202. msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
  203. msprobe/pytorch/parse_tool/lib/parse_tool.py +11 -10
  204. msprobe/pytorch/parse_tool/lib/utils.py +18 -19
  205. msprobe/pytorch/parse_tool/lib/visualization.py +9 -10
  206. msprobe/pytorch/pt_config.py +19 -22
  207. msprobe/pytorch/service.py +264 -115
  208. msprobe/visualization/builder/graph_builder.py +93 -10
  209. msprobe/visualization/builder/msprobe_adapter.py +30 -6
  210. msprobe/visualization/compare/graph_comparator.py +64 -14
  211. msprobe/visualization/compare/mode_adapter.py +1 -15
  212. msprobe/visualization/graph/base_node.py +15 -19
  213. msprobe/visualization/graph/distributed_analyzer.py +395 -0
  214. msprobe/visualization/graph/graph.py +9 -0
  215. msprobe/visualization/graph/node_op.py +4 -2
  216. msprobe/visualization/graph_service.py +100 -27
  217. msprobe/visualization/utils.py +24 -31
  218. mindstudio_probe-1.1.1.dist-info/RECORD +0 -341
  219. msprobe/pytorch/functional/module_dump.py +0 -84
  220. msprobe/pytorch/module_processer.py +0 -150
  221. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/LICENSE +0 -0
  222. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/WHEEL +0 -0
  223. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/entry_points.txt +0 -0
  224. {mindstudio_probe-1.1.1.dist-info → mindstudio_probe-1.2.2.dist-info}/top_level.txt +0 -0
  225. /msprobe/docs/{data_dump_Mindspore → data_dump_MindSpore}/dynamic_graph_quick_start_example.md +0 -0
  226. /msprobe/{pytorch/functional → mindspore/code_mapping}/__init__.py +0 -0
msprobe/mindspore/debugger/precision_debugger.py +100 -12

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,15 +14,20 @@
 # limitations under the License.

 import os
-from collections import defaultdict
+from collections import defaultdict, namedtuple

 import mindspore as ms
 from mindspore._c_expression import MSContext

-from msprobe.core.common.const import Const, MsgConst
+from msprobe.core.common.const import Const, FileCheckConst, MsgConst
+from msprobe.core.common.exceptions import MsprobeException
+from msprobe.core.common.file_utils import FileChecker
+from msprobe.core.common.utils import get_real_step_or_rank
 from msprobe.mindspore.cell_processor import CellProcessor
 from msprobe.mindspore.common.const import Const as MsConst
+from msprobe.mindspore.common.utils import set_register_backward_hook_functions, check_save_param
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.dump.hook_cell.api_registry import api_register
 from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
 from msprobe.mindspore.grad_probe.grad_monitor import GradientMonitor
 from msprobe.mindspore.ms_config import parse_json_config
@@ -30,12 +35,21 @@ from msprobe.mindspore.runtime import Runtime
 from msprobe.mindspore.service import Service
 from msprobe.mindspore.task_handler_factory import TaskHandlerFactory

+try:
+    from msprobe.lib import _msprobe_c
+except ImportError:
+    _msprobe_c = None
+
+
+ConfigParameters = namedtuple("ConfigParameters", ["config_path", "task", "dump_path", "level"])
+

 class PrecisionDebugger:
     _instance = None
     task_not_need_service = [Const.GRAD_PROBE]

-    def __new__(cls, config_path=None, opt=None):
+    def __new__(cls, config_path=None, task=None, dump_path=None,
+                level=None, step=None, opt=None):
         if not cls._instance:
             cls._instance = super().__new__(cls)
             cls._instance.initialized = False
@@ -44,22 +58,66 @@ class PrecisionDebugger:
             cls.first_start = False
         return cls._instance

-    def __init__(self, config_path=None):
+    def __init__(self, config_path=None, task=None, dump_path=None,
+                 level=None, step=None):
         if self.initialized:
             return
         self.initialized = True
+
+        set_register_backward_hook_functions()
+
         if not config_path:
             config_path = os.path.join(os.path.dirname(__file__), "../../config.json")
+
+        config_params = ConfigParameters(config_path, task, dump_path, level)
+        self.check_input_params(config_params)
+
         common_config, task_config = parse_json_config(config_path)
+        common_config.task = task if task else common_config.task
         self.task = common_config.task
         if self.task == Const.GRAD_PROBE:
             self.gm = GradientMonitor(common_config, task_config)
             return
+        common_config.step = get_real_step_or_rank(
+            step, Const.STEP) if step is not None else common_config.step
+        common_config.level = level if level else common_config.level
+        common_config.dump_path = dump_path if dump_path else common_config.dump_path
         self.config = DebuggerConfig(common_config, task_config)

+        if _msprobe_c:
+            _msprobe_c._PrecisionDebugger(framework="MindSpore", config_path=config_path)
+
+        self.config.execution_mode = self._get_execution_mode()
+        if self._need_service():
+            self.config.check_config_with_l2()
+            self.service = Service(self.config)
+
         Runtime.step_count = 0
         Runtime.is_running = False

+    @staticmethod
+    def check_input_params(args):
+        if args.config_path is not None:
+            if not isinstance(args.config_path, str):
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR, f"config_path must be a string")
+            file_checker = FileChecker(
+                file_path=args.config_path, path_type=FileCheckConst.FILE, file_type=FileCheckConst.JSON_SUFFIX)
+            file_checker.common_check()
+
+        if args.task is not None and args.task not in Const.TASK_LIST:
+            raise MsprobeException(
+                MsprobeException.INVALID_PARAM_ERROR, f"task must be one of {Const.TASK_LIST}")
+
+        if args.dump_path is not None:
+            if not isinstance(args.dump_path, str):
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR, f"dump_path must be a string")
+
+        if args.level is not None and args.level not in Const.LEVEL_LIST:
+            raise MsprobeException(
+                MsprobeException.INVALID_PARAM_ERROR, f"level must be one of {Const.LEVEL_LIST}")
+
     @staticmethod
     def _get_execution_mode():
         jit_level = ms.context.get_jit_config().get(MsConst.JIT_LEVEL)
@@ -78,11 +136,23 @@ class PrecisionDebugger:
         else:
             return MsConst.PYNATIVE_MODE

+    @staticmethod
+    def _is_graph_dump(config):
+        if config.level != MsConst.KERNEL:
+            return False
+        if not config.list:
+            return True
+        is_graph = any(item.startswith("name-regex") for item in config.list)
+        is_graph |= all("." not in item for item in config.list)
+        return is_graph
+
     @classmethod
     def start(cls, model=None):
         instance = cls._instance
         if not instance:
             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if _msprobe_c:
+            _msprobe_c._PrecisionDebugger().start()
         if instance.task in PrecisionDebugger.task_not_need_service:
             return
@@ -93,6 +163,7 @@ class PrecisionDebugger:
             instance.service.start(model)
         else:
             if not instance.first_start:
+                api_register.api_set_ori_func()
                 handler = TaskHandlerFactory.create(instance.config)
                 handler.handle()

@@ -102,18 +173,15 @@ class PrecisionDebugger:
     @classmethod
     def forward_backward_dump_end(cls):
         instance = cls._instance
-        if not instance:
-            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-        if instance.task in PrecisionDebugger.task_not_need_service:
-            return
-        if instance.service:
-            instance.service.forward_backward_dump_end()
+        instance.stop()

     @classmethod
     def stop(cls):
         instance = cls._instance
         if not instance:
             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if _msprobe_c:
+            _msprobe_c._PrecisionDebugger().stop()
         if instance.task == Const.GRAD_PROBE:
             instance.gm.stop()
         if instance.task in PrecisionDebugger.task_not_need_service:
@@ -127,6 +195,8 @@ class PrecisionDebugger:
         instance = cls._instance
         if not instance:
             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if _msprobe_c:
+            _msprobe_c._PrecisionDebugger().step()
         if instance.task in PrecisionDebugger.task_not_need_service:
             return
         if instance.service:
@@ -145,6 +215,24 @@ class PrecisionDebugger:
             return
         instance.gm.monitor(opt)

+    @classmethod
+    def save(cls, variable, name, save_backward=True):
+        instance = cls._instance
+        if not instance:
+            raise Exception(MsgConst.NOT_CREATED_INSTANCE)
+        if instance.task not in [Const.TENSOR, Const.STATISTICS] or instance.config.level_ori != Const.LEVEL_DEBUG:
+            return
+        try:
+            check_save_param(variable, name, save_backward)
+        except ValueError:
+            return
+
+        instance.config.execution_mode = cls._get_execution_mode()
+        if cls._need_service():
+            if not instance.service:
+                instance.service = Service(instance.config)
+            instance.service.save(variable, name, save_backward)
+
     @classmethod
     def _need_service(cls):
         instance = cls._instance
@@ -153,4 +241,4 @@ class PrecisionDebugger:
         if instance.config.execution_mode != MsConst.PYNATIVE_MODE:
             return False
         else:
-            return instance.config.task != Const.FREE_BENCHMARK and instance.config.level != MsConst.KERNEL
+            return instance.config.task != Const.FREE_BENCHMARK and not instance._is_graph_dump(instance.config)
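The hunks above widen the MindSpore PrecisionDebugger entry point: task, dump_path, level, and step can now be passed directly to the constructor and override the values read from config.json, and the C++ backend (_msprobe_c) is driven alongside the Python service when present. A minimal usage sketch of the new surface, assuming a PyNative training loop; net and train_one_step are hypothetical user code and the argument values are illustrative, not taken from the package:

    from msprobe.mindspore import PrecisionDebugger

    # New in 1.2.2: any argument left as None falls back to config.json.
    debugger = PrecisionDebugger(config_path="./config.json",
                                 task="statistics",
                                 dump_path="./dump_output",
                                 level="L1",
                                 step=[0, 1, 2])

    for _ in range(3):
        debugger.start(model=net)   # install hooks, begin collection
        train_one_step(net)         # hypothetical training step
        debugger.stop()             # flush data collected for this step
        debugger.step()             # advance the internal step counter

The new save() classmethod is gated by the guard shown in the hunk: it only dumps when the task is tensor or statistics and the configured level is "debug" (see 28.debugger_save_instruction.md in the file list).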
msprobe/mindspore/dump/hook_cell/api_registry.py +85 -16

@@ -1,4 +1,5 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ============================================================================

 from mindspore import Tensor, ops, mint
 from mindspore.mint.nn import functional
@@ -20,8 +20,15 @@ from mindspore.communication import comm_func

 from msprobe.mindspore.dump.hook_cell.wrap_api import (HOOKTensor, HOOKStubTensor, HOOKFunctionalOP,
                                                        HOOKMintOP, HOOKMintNNFunctionalOP, HOOKDistributedOP,
-                                                       get_wrap_api_list, setup_hooks)
+                                                       HOOKTorchOP, HOOKTorchTensor, HOOKTorchFunctionalOP,
+                                                       HOOKTorchDistributedOP, HOOKTorchNpuOP,
+                                                       get_wrap_api_list, get_wrap_torch_api_list, setup_hooks)
 from msprobe.core.common.utils import Const
+from msprobe.mindspore.common.utils import is_mindtorch
+
+if is_mindtorch():
+    import torch
+    import torch_npu


 def stub_method(method):
@@ -40,6 +47,12 @@ class ApiRegistry:
         self.distributed_ori_attr = {}
         self.norm_inner_ops_ori_attr = {}

+        self.torch_ori_attr = {}
+        self.torch_tensor_ori_attr = {}
+        self.torch_functional_ori_attr = {}
+        self.torch_distributed_ori_attr = {}
+        self.torch_npu_ori_attr = {}
+
         self.tensor_hook_attr = {}
         self.stub_tensor_hook_attr = {}
         self.functional_hook_attr = {}
@@ -48,6 +61,12 @@ class ApiRegistry:
         self.distibuted_hook_attr = {}
         self.norm_inner_ops_hook_attr = {}

+        self.torch_hook_attr = {}
+        self.torch_tensor_hook_attr = {}
+        self.torch_functional_hook_attr = {}
+        self.torch_distributed_hook_attr = {}
+        self.torch_npu_hook_attr = {}
+
         self.norm_inner_ops = ["norm", "square", "sqrt", "is_complex"]

     @staticmethod
@@ -82,22 +101,73 @@ class ApiRegistry:
         self.set_api_attr(ops, self.norm_inner_ops_ori_attr)

     def api_set_hook_func(self):
-        self.set_api_attr(Tensor, self.tensor_hook_attr)
-        self.set_api_attr(StubTensor, self.stub_tensor_hook_attr)
-        self.set_api_attr(ops, self.functional_hook_attr)
-        self.set_api_attr(mint, self.mint_ops_hook_attr)
-        self.set_api_attr(functional, self.mint_func_ops_hook_attr)
-        self.set_api_attr(comm_func, self.distibuted_hook_attr)
+        if is_mindtorch():
+            self.set_api_attr(torch, self.torch_hook_attr)
+            self.set_api_attr(torch.Tensor, self.torch_tensor_hook_attr)
+            self.set_api_attr(torch.nn.functional, self.torch_functional_hook_attr)
+            self.set_api_attr(torch.distributed, self.torch_distributed_hook_attr)
+            self.set_api_attr(torch.distributed.distributed_c10d, self.torch_distributed_hook_attr)
+            self.set_api_attr(torch_npu, self.torch_npu_hook_attr)
+        else:
+            self.set_api_attr(Tensor, self.tensor_hook_attr)
+            self.set_api_attr(StubTensor, self.stub_tensor_hook_attr)
+            self.set_api_attr(ops, self.functional_hook_attr)
+            self.set_api_attr(mint, self.mint_ops_hook_attr)
+            self.set_api_attr(functional, self.mint_func_ops_hook_attr)
+            self.set_api_attr(comm_func, self.distibuted_hook_attr)

     def api_set_ori_func(self):
-        self.set_api_attr(Tensor, self.tensor_ori_attr)
-        self.set_api_attr(StubTensor, self.stub_tensor_ori_attr)
-        self.set_api_attr(ops, self.functional_ori_attr)
-        self.set_api_attr(mint, self.mint_ops_ori_attr)
-        self.set_api_attr(functional, self.mint_func_ops_ori_attr)
-        self.set_api_attr(comm_func, self.distributed_ori_attr)
+        if is_mindtorch():
+            self.set_api_attr(torch, self.torch_ori_attr)
+            self.set_api_attr(torch.Tensor, self.torch_tensor_ori_attr)
+            self.set_api_attr(torch.nn.functional, self.torch_functional_ori_attr)
+            self.set_api_attr(torch.distributed, self.torch_distributed_ori_attr)
+            self.set_api_attr(torch.distributed.distributed_c10d, self.torch_distributed_ori_attr)
+            self.set_api_attr(torch_npu, self.torch_npu_ori_attr)
+        else:
+            self.set_api_attr(Tensor, self.tensor_ori_attr)
+            self.set_api_attr(StubTensor, self.stub_tensor_ori_attr)
+            self.set_api_attr(ops, self.functional_ori_attr)
+            self.set_api_attr(mint, self.mint_ops_ori_attr)
+            self.set_api_attr(functional, self.mint_func_ops_ori_attr)
+            self.set_api_attr(comm_func, self.distributed_ori_attr)

     def initialize_hook(self, hook):
+        setup_hooks(hook)
+        if is_mindtorch():
+            wrap_torch_api_name = get_wrap_torch_api_list()
+            self.store_ori_attr(torch,
+                                wrap_torch_api_name.torch_api_names, self.torch_ori_attr)
+            self.store_ori_attr(torch.Tensor,
+                                wrap_torch_api_name.tensor_api_names, self.torch_tensor_ori_attr)
+            self.store_ori_attr(torch.nn.functional,
+                                wrap_torch_api_name.functional_api_names, self.torch_functional_ori_attr)
+            self.store_ori_attr(torch.distributed,
+                                wrap_torch_api_name.distributed_api_names, self.torch_distributed_ori_attr)
+            self.store_ori_attr(torch_npu,
+                                wrap_torch_api_name.npu_api_names, self.torch_npu_ori_attr)
+            for attr_name in dir(HOOKTorchOP):
+                if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                    api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                    self.torch_hook_attr[api_name] = getattr(HOOKTorchOP, attr_name)
+            for attr_name in dir(HOOKTorchTensor):
+                if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                    api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                    self.torch_tensor_hook_attr[api_name] = getattr(HOOKTorchTensor, attr_name)
+            for attr_name in dir(HOOKTorchFunctionalOP):
+                if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                    api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                    self.torch_functional_hook_attr[api_name] = getattr(HOOKTorchFunctionalOP, attr_name)
+            for attr_name in dir(HOOKTorchDistributedOP):
+                if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                    api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                    self.torch_distributed_hook_attr[api_name] = getattr(HOOKTorchDistributedOP, attr_name)
+            for attr_name in dir(HOOKTorchNpuOP):
+                if attr_name.startswith(Const.ATTR_NAME_PREFIX):
+                    api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
+                    self.torch_npu_hook_attr[api_name] = getattr(HOOKTorchNpuOP, attr_name)
+            return
+
         wrap_api_name = get_wrap_api_list()
         self.store_ori_attr(Tensor, wrap_api_name.tensor_api_names, self.tensor_ori_attr)
         self.store_ori_attr(StubTensor, wrap_api_name.stub_tensor_api_names, self.stub_tensor_ori_attr)
@@ -106,7 +176,6 @@ class ApiRegistry:
         self.store_ori_attr(functional, wrap_api_name.mint_nn_func_api_names, self.mint_func_ops_ori_attr)
         self.store_ori_attr(comm_func, wrap_api_name.distributed_api_names, self.distributed_ori_attr)
         self.store_ori_attr(ops, self.norm_inner_ops, self.norm_inner_ops_ori_attr)
-        setup_hooks(hook)
         for attr_name in dir(HOOKTensor):
             if attr_name.startswith(Const.ATTR_NAME_PREFIX):
                 api_name = attr_name[Const.ATTR_NAME_PREFIX_LEN:]
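Underneath the new MindTorch branching, api_set_hook_func/api_set_ori_func above are a plain save-patch-restore of module attributes. A reduced, self-contained sketch of that pattern; fake_ops stands in for a real API module such as mindspore.ops, and traced_add for a generated hook wrapper:

    import types

    fake_ops = types.SimpleNamespace(add=lambda a, b: a + b)  # stand-in for mindspore.ops
    ori_attr = {}

    def store_ori_attr(module, api_names, storage):
        # snapshot the original callables before patching (initialize_hook)
        for name in api_names:
            storage[name] = getattr(module, name)

    def traced_add(*args, **kwargs):
        print("ops.add called")                # a real hook would collect dump data here
        return ori_attr["add"](*args, **kwargs)

    store_ori_attr(fake_ops, ["add"], ori_attr)
    fake_ops.add = traced_add                  # api_set_hook_func: install the hook
    print(fake_ops.add(1, 2))                  # traced call, still returns 3
    fake_ops.add = ori_attr["add"]             # api_set_ori_func: restore the original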
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38

@@ -1,4 +1,5 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,45 +12,66 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ============================================================================

 from collections import defaultdict

 from mindspore import nn

-from msprobe.core.common.const import Const
-
-
-class HOOKCell(nn.Cell):
-    cell_count = defaultdict(int)
-    g_stop_hook = False
-
-    def __init__(self, build_hook) -> None:
-        super(HOOKCell, self).__init__()
-        self.changed_status = False
-        self.input_kwargs = {}
-        self.prefix = ""
-        if not HOOKCell.g_stop_hook:
-            HOOKCell.g_stop_hook = True
-            self.changed_status = True
-            if hasattr(self, "prefix_api_name"):
-                self.prefix = self.prefix_api_name
-
-            HOOKCell.cell_count[self.prefix] += 1
-            self.prefix = self.prefix + str(HOOKCell.cell_count[self.prefix] - 1) + Const.SEP
-            forward_hook, backward_hook = build_hook(self.prefix)
-            self.register_forward_hook(forward_hook)
-            self.register_backward_hook(backward_hook)
-
-    # Overload __call__ and set the global flag.
-    def __call__(self, *args, **kwargs):
-        try:
-            self.input_kwargs = kwargs
-            out = super(HOOKCell, self).__call__(*args, **kwargs)
-        except Exception as e:
-            raise e
-        finally:
-            if self.changed_status:
-                self.changed_status = False
-                HOOKCell.g_stop_hook = False
-        return out
+from msprobe.mindspore.common.utils import is_mindtorch, register_backward_hook_functions
+
+
+def add_cell_count(name):
+    HOOKCell.cell_count[name] += 1
+
+
+def get_cell_count(name):
+    return HOOKCell.cell_count[name]
+
+
+def __init__(self, build_hook) -> None:
+    super(HOOKCell, self).__init__()
+    self.changed_status = False
+    self.input_kwargs = {}
+    self.prefix = ""
+    if not HOOKCell.g_stop_hook:
+        HOOKCell.g_stop_hook = True
+        self.changed_status = True
+        if hasattr(self, "prefix_api_name"):
+            self.prefix = self.prefix_api_name
+
+        self.forward_data_collected = False
+        forward_pre_hook, forward_hook, backward_hook, backward_pre_hook = build_hook(self.prefix)
+        self.register_forward_pre_hook(forward_pre_hook)
+        self.register_forward_hook(forward_hook)
+        register_backward_hook_functions["full"](self, backward_hook)
+        register_backward_hook_functions["pre"](self, backward_pre_hook)
+
+
+# Overload __call__ and set the global flag.
+def __call__(self, *args, **kwargs):
+    try:
+        self.input_kwargs = kwargs
+        out = super(HOOKCell, self).__call__(*args, **kwargs)
+    except Exception as e:
+        raise e
+    finally:
+        if self.changed_status:
+            self.changed_status = False
+            HOOKCell.g_stop_hook = False
+    return out
+
+
+hook_cell_dict = {
+    "cell_count": defaultdict(int),
+    "g_stop_hook": False,
+    "add_cell_count": staticmethod(add_cell_count),
+    "get_cell_count": staticmethod(get_cell_count),
+    "__init__": __init__,
+    "__call__": __call__
+}
+
+if is_mindtorch():
+    import torch
+    HOOKCell = type("HOOKCell", (torch.nn.Module,), hook_cell_dict)
+else:
+    HOOKCell = type("HOOKCell", (nn.Cell,), hook_cell_dict)
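HOOKCell is now assembled with type() so that a single class body can inherit from torch.nn.Module under MindTorch or from mindspore.nn.Cell otherwise. A self-contained sketch of that dynamic-base-class trick; BaseA, BaseB, and use_torch are illustrative stand-ins, not msprobe names:

    class BaseA:
        pass

    class BaseB:
        pass

    def __init__(self):
        # super() resolves against whichever base was chosen at class-creation time
        super(Hooked, self).__init__()
        self.calls = 0

    use_torch = False  # stand-in for is_mindtorch()
    Hooked = type("Hooked", (BaseA,) if use_torch else (BaseB,), {"__init__": __init__})

    assert isinstance(Hooked(), BaseB)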
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +33 -15

@@ -135,6 +135,34 @@ class PrimitiveHookService:
                 return tuple(hooked_outputs)
             return out

+        def pre_forward_hook(primitive_name, primitive_instance, args, kwargs):
+            module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
+            try:
+                self.service_instance.data_collector.forward_input_data_collect(
+                    primitive_name,
+                    primitive_instance,
+                    os.getpid(),
+                    module_input_output
+                )
+            except Exception as exception:
+                logger.error(f"This is a primitive op dump error during forward input data collection: {exception}, "
+                             f"primitive_name: {primitive_name}")
+                raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
+
+        def post_forward_hook(primitive_name, primitive_instance, args, kwargs, output):
+            module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
+            try:
+                self.service_instance.data_collector.forward_output_data_collect(
+                    primitive_name,
+                    primitive_instance,
+                    os.getpid(),
+                    module_input_output
+                )
+            except Exception as exception:
+                logger.error(f"This is a primitive op dump error during forward output data collection: {exception}, "
+                             f"primitive_name: {primitive_name}")
+                raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
+
         def wrapped_primitive_call(instance_self, *args, **kwargs):
             """
             The wrapped primitive call, which adds input and output hooks.
@@ -163,27 +191,17 @@ class PrimitiveHookService:
                              f"primitive_name: {primitive_name}")
                 raise DumpException(DumpException.INPUT_HOOK_ERROR) from exception

+            forward_primitive_name = f"{updated_primitive_name}{Const.SEP}{Const.FORWARD}"
+            self.service_instance.data_collector.update_api_or_module_name(forward_primitive_name)
+
+            pre_forward_hook(forward_primitive_name, instance_self, hooked_inputs, kwargs)
             try:
                 out = origin_func(*hooked_inputs, **kwargs)
             except Exception as exception:
                 logger.error(f"This is a primitive op dump error during function call: {exception}, "
                              f"primitive_name: {primitive_name}")
                 raise DumpException(DumpException.FUNCTION_CALL_ERROR) from exception
-
-            forward_primitive_name = f"{updated_primitive_name}{Const.SEP}{Const.FORWARD}"
-            self.service_instance.data_collector.update_api_or_module_name(forward_primitive_name)
-            if self.service_instance.data_collector:
-                module_input_output = ModuleForwardInputsOutputs(args=hooked_inputs, kwargs=kwargs, output=out)
-                try:
-                    self.service_instance.data_collector.forward_data_collect(forward_primitive_name, instance_self,
-                                                                              os.getpid(), module_input_output)
-                except Exception as exception:
-                    logger.error(f"This is a primitive op dump error during forward data collection: {exception}, "
-                                 f"primitive_name: {primitive_name}")
-                    raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
-
-                if self.service_instance.data_collector.if_return_forward_new_output():
-                    out = self.service_instance.data_collector.get_forward_new_output()
+            post_forward_hook(forward_primitive_name, instance_self, hooked_inputs, kwargs, out)

             try:
                 out = hook_primitive_outputs(out, captured_grads_output, updated_primitive_name)
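The refactor above splits the single post-hoc forward_data_collect into pre_forward_hook (inputs, before the primitive runs) and post_forward_hook (outputs, after it succeeds), so input data is captured even when the call itself raises. A reduced sketch of the resulting control flow; collector is a hypothetical object exposing the two collect methods named in the hunk:

    import os

    def wrap_primitive(origin_func, name, collector):
        def wrapped(*args, **kwargs):
            # pre-forward: inputs are recorded before the op executes
            collector.forward_input_data_collect(name, None, os.getpid(), (args, kwargs))
            out = origin_func(*args, **kwargs)
            # post-forward: outputs are recorded only after a successful call
            collector.forward_output_data_collect(name, None, os.getpid(), (args, kwargs, out))
            return out
        return wrapped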
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +11 -1

@@ -15,7 +15,7 @@

 # List of ops that register hooks

-
+
 ops:
   - adaptive_avg_pool1d
   - adaptive_avg_pool2d
@@ -85,6 +85,7 @@ ops:
   - relu6
   - celu
   - rrelu
+  - rms_norm
   - selu
   - sigmoid
   - silu
@@ -553,6 +554,7 @@ tensor:
   - acos
   - acosh
   - add
+  - add_
   - addbmm
   - addcdiv
   - addcmul
@@ -607,6 +609,7 @@ tensor:
   - diff
   - digamma
   - div
+  - div_
   - divide
   - equal
   - erf
@@ -739,6 +742,8 @@ tensor:
   - square
   - squeeze
   - std
+  - sub
+  - sub_
   - subtract
   - subtract
   - svd
@@ -983,6 +988,7 @@ mint.nn.functional:
   - one_hot_ext
   - pad
   - relu
+  - relu_
   - sigmoid
   - silu
   - softmax
@@ -1017,3 +1023,7 @@ communication.comm_func:
   - broadcast
   - gather_into_tensor
   - scatter_tensor
+  - send
+  - recv
+  - isend
+  - irecv
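The YAML additions extend the hookable-API whitelists, most notably send/recv/isend/irecv under communication.comm_func. A hedged sketch of consuming such a file; the loader below is illustrative, not the msprobe implementation:

    import yaml

    with open("support_wrap_ops.yaml") as f:
        supported = yaml.safe_load(f)

    comm_ops = supported.get("communication.comm_func", [])
    assert {"send", "recv", "isend", "irecv"} <= set(comm_ops)  # new in 1.2.2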