PyPI - mindstudio-probe - Versions diffs - 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl - Mend

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)

{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/METADATA +2 -2
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/RECORD +172 -147
msprobe/README.md +6 -6
msprobe/core/common/const.py +98 -41
msprobe/core/common/db_manager.py +256 -0
msprobe/core/common/file_utils.py +28 -5
msprobe/core/common/log.py +7 -0
msprobe/core/common/megatron_utils.py +59 -0
msprobe/core/common/parallel_state.py +193 -0
msprobe/core/common/utils.py +20 -13
msprobe/core/common_config.py +5 -0
msprobe/core/compare/acc_compare.py +140 -93
msprobe/core/compare/check.py +13 -0
msprobe/core/compare/compare_cli.py +64 -6
msprobe/core/compare/config.py +10 -8
msprobe/core/compare/diff_analyze/diff_analyze_threshold.yaml +14 -0
msprobe/core/compare/diff_analyze/first_diff_analyze.py +135 -0
msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
msprobe/core/compare/find_first/__init__.py +0 -0
msprobe/core/compare/find_first/analyzer.py +282 -0
msprobe/core/compare/find_first/data_processor.py +35 -0
msprobe/core/compare/find_first/graph.py +188 -0
msprobe/core/compare/find_first/utils.py +189 -0
msprobe/core/compare/highlight.py +74 -101
msprobe/core/compare/layer_mapping/layer_mapping.py +14 -9
msprobe/core/compare/merge_result/merge_result.py +2 -2
msprobe/core/compare/multiprocessing_compute.py +45 -28
msprobe/core/compare/npy_compare.py +7 -10
msprobe/core/compare/utils.py +338 -130
msprobe/core/config_check/checkers/dataset_checker.py +2 -1
msprobe/core/config_check/checkers/env_args_checker.py +5 -5
msprobe/core/config_check/checkers/hyperparameter_checker.py +30 -10
msprobe/core/config_check/checkers/pip_checker.py +4 -3
msprobe/core/config_check/checkers/random_checker.py +3 -3
msprobe/core/config_check/checkers/weights_checker.py +2 -1
msprobe/core/config_check/ckpt_compare/megatron_loader.py +2 -0
msprobe/core/config_check/resource/hyperparameter.yaml +11 -1
msprobe/core/config_check/utils/hyperparameter_parser.py +7 -3
msprobe/core/config_check/utils/utils.py +10 -0
msprobe/core/data_dump/api_registry.py +49 -30
msprobe/core/data_dump/data_collector.py +71 -29
msprobe/core/data_dump/data_processor/base.py +2 -0
msprobe/core/data_dump/data_processor/mindspore_processor.py +47 -53
msprobe/core/data_dump/data_processor/pytorch_processor.py +227 -93
msprobe/core/data_dump/json_writer.py +81 -7
msprobe/core/data_dump/scope.py +4 -6
msprobe/core/hook_manager.py +129 -70
msprobe/core/monitor/csv2db.py +361 -0
msprobe/core/monitor/db_utils.py +278 -0
msprobe/core/monitor/utils.py +35 -1
msprobe/core/service.py +31 -39
msprobe/core/single_save/single_comparator.py +16 -3
msprobe/docs/01.installation.md +51 -19
msprobe/docs/02.config_introduction.md +16 -20
msprobe/docs/03.config_examples.md +26 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +6 -2
msprobe/docs/06.data_dump_MindSpore.md +44 -7
msprobe/docs/07.accuracy_checker_PyTorch.md +1 -1
msprobe/docs/10.accuracy_compare_PyTorch.md +124 -44
msprobe/docs/11.accuracy_compare_MindSpore.md +75 -7
msprobe/docs/14.data_parse_PyTorch.md +1 -1
msprobe/docs/19.monitor.md +94 -7
msprobe/docs/21.visualization_PyTorch.md +71 -101
msprobe/docs/22.visualization_MindSpore.md +69 -119
msprobe/docs/23.generate_operator_PyTorch.md +1 -1
msprobe/docs/25.tool_function_introduction.md +0 -1
msprobe/docs/26.data_dump_PyTorch_baseline.md +7 -7
msprobe/docs/28.debugger_save_instruction.md +184 -81
msprobe/docs/29.data_dump_MSAdapter.md +6 -0
msprobe/docs/31.config_check.md +4 -2
msprobe/docs/36.calculation_result_change.md +75 -0
msprobe/docs/FAQ.md +22 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +6 -2
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +59 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +80 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +330 -0
msprobe/mindspore/__init__.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +1 -1
msprobe/mindspore/api_accuracy_checker/api_runner.py +9 -6
msprobe/mindspore/api_accuracy_checker/compute_element.py +18 -12
msprobe/mindspore/cell_processor.py +64 -25
msprobe/mindspore/common/utils.py +51 -7
msprobe/mindspore/compare/common_dir_compare.py +45 -37
msprobe/mindspore/compare/ms_compare.py +10 -2
msprobe/mindspore/compare/ms_graph_compare.py +47 -52
msprobe/mindspore/debugger/debugger_config.py +18 -7
msprobe/mindspore/debugger/precision_debugger.py +16 -12
msprobe/mindspore/dump/cell_dump_process.py +130 -68
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +10 -2
msprobe/mindspore/dump/graph_mode_cell_dump.py +35 -9
msprobe/mindspore/dump/graph_tensor_dump.py +11 -0
msprobe/mindspore/dump/hook_cell/api_register.py +19 -20
msprobe/mindspore/dump/hook_cell/hook_cell.py +12 -34
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +142 -21
msprobe/mindspore/dump/kernel_kbyk_dump.py +24 -0
msprobe/mindspore/exception_dump/__init__.py +0 -0
msprobe/mindspore/exception_dump/exception_dump_tool_factory.py +51 -0
msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py +57 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +5 -4
msprobe/mindspore/mindspore_service.py +2 -2
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +12 -7
msprobe/mindspore/monitor/features.py +82 -0
msprobe/mindspore/monitor/module_hook.py +168 -10
msprobe/mindspore/monitor/utils.py +27 -1
msprobe/mindspore/ms_config.py +12 -4
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +1 -1
msprobe/mindspore/task_handler_factory.py +3 -1
msprobe/nan_analyze/graph.py +1 -1
msprobe/pytorch/api_accuracy_checker/common/config.py +3 -36
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +0 -24
msprobe/pytorch/api_accuracy_checker/compare/compare.py +2 -12
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -6
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +2 -2
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +12 -132
msprobe/pytorch/common/utils.py +1 -21
msprobe/pytorch/compare/pt_compare.py +10 -2
msprobe/pytorch/{hook_module/jit_script_wrapper.py → compare/pt_diff_analyze.py} +3 -15
msprobe/pytorch/compare/utils.py +2 -1
msprobe/pytorch/debugger/debugger_config.py +18 -23
msprobe/pytorch/dump/module_dump/hook_wrapper.py +10 -7
msprobe/pytorch/dump/module_dump/module_processer.py +41 -19
msprobe/pytorch/free_benchmark/main.py +7 -4
msprobe/pytorch/hook_module/api_register.py +62 -24
msprobe/pytorch/hook_module/hook_module.py +9 -29
msprobe/pytorch/hook_module/pt_hook_manager.py +84 -15
msprobe/pytorch/hook_module/script_wrapper.py +140 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +6 -0
msprobe/pytorch/monitor/csv2tb.py +1 -1
msprobe/pytorch/monitor/features.py +94 -0
msprobe/pytorch/monitor/module_hook.py +221 -81
msprobe/pytorch/monitor/module_metric.py +27 -1
msprobe/pytorch/monitor/optimizer_collect.py +109 -4
msprobe/pytorch/online_dispatch/dispatch.py +42 -24
msprobe/pytorch/online_dispatch/dump_compare.py +1 -1
msprobe/pytorch/parse_tool/lib/visualization.py +0 -1
msprobe/pytorch/pt_config.py +2 -51
msprobe/pytorch/pytorch_service.py +7 -14
msprobe/visualization/builder/graph_builder.py +192 -63
msprobe/visualization/builder/graph_merger.py +986 -0
msprobe/visualization/builder/msprobe_adapter.py +17 -15
msprobe/visualization/compare/graph_comparator.py +26 -16
msprobe/visualization/db_utils.py +252 -0
msprobe/visualization/graph/base_node.py +2 -22
msprobe/visualization/graph/distributed_analyzer.py +12 -12
msprobe/visualization/graph/graph.py +44 -16
msprobe/visualization/graph_service.py +143 -59
msprobe/visualization/utils.py +103 -4
msprobe/docs/08.accuracy_checker_online_PyTorch.md +0 -295
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +0 -205
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +0 -378
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +0 -239
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +0 -115
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +0 -250
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +0 -63
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +0 -198
msprobe/pytorch/attl_manager.py +0 -65
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/WHEEL +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-8.1.2.dist-info → mindstudio_probe-8.2.1.dist-info}/top_level.txt +0 -0
/msprobe/{pytorch/api_accuracy_checker/tensor_transport_layer → core/compare/diff_analyze}/__init__.py +0 -0

msprobe/mindspore/dump/hook_cell/ms_hook_manager.py CHANGED Viewed

@@ -14,21 +14,45 @@
 # limitations under the License.
 import threading
+from collections import OrderedDict
+import mindspore as ms
+from mindspore import Tensor
+from mindspore.common.api import _no_grad, _pynative_executor
+from mindspore.ops.operations import _inner_ops as inner
-from mindspore.common.api import _no_grad
 from msprobe.core.common.const import Const
+from msprobe.core.common.log import logger
 from msprobe.core.common.utils import replace_last_occurrence, ThreadSafe
 from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputs
 from msprobe.core.hook_manager import BaseHookManager, HookSet
-from msprobe.mindspore.common.utils import has_kwargs_in_forward_hook
+from msprobe.mindspore.common.const import Const as MsConst
+from msprobe.mindspore.common.utils import (
+    has_kwargs_in_forward_hook,
+    is_mindtorch,
+    is_backward_hook_output_a_view
+)
 from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
+ms_version = ms.__version__
+class MindsporeHookManager(BaseHookManager):
+    cell_bw_hook_kernels = {}
+    cell_backward_pre_hook = []
+    cell_backward_hook = []
-class MindsproeHookManager(BaseHookManager):
     @property
     def _is_recompute(self):
         return None
+    @staticmethod
+    def reset_status():
+        BaseHookManager.reset_status()
+        MindsporeHookManager.cell_bw_hook_kernels.clear()
+        MindsporeHookManager.cell_backward_pre_hook.clear()
+        MindsporeHookManager.cell_backward_hook.clear()
     @staticmethod
     def _no_grad_context():
         return _no_grad()
@@ -38,9 +62,13 @@ class MindsproeHookManager(BaseHookManager):
         HOOKCell.add_cell_count(name)
     @staticmethod
-    def _process_kwargs_and_output(module, hook_type, kwargs_or_output, output_or_kwargs):
+    def _get_count(name):
+        return HOOKCell.get_cell_count(name)
+    @staticmethod
+    def _process_kwargs_and_output(module, tid, hook_type, kwargs_or_output, output_or_kwargs):
         if not has_kwargs_in_forward_hook() or hook_type == Const.API:
-            kwargs = module.msprobe_input_kwargs if hasattr(module, 'msprobe_input_kwargs') else {}
+            kwargs = module.msprobe_input_kwargs.get(tid, {}) if hasattr(module, 'msprobe_input_kwargs') else {}
             output = kwargs_or_output
         else:
             kwargs = kwargs_or_output
@@ -49,17 +77,107 @@ class MindsproeHookManager(BaseHookManager):
     def build_hook(self, hook_type, name):
         if hook_type == Const.API:
-            full_forward_name = name + str(HOOKCell.get_cell_count(name)) + Const.SEP + Const.FORWARD
+            hook_set = HookSet(
+                forward_pre_hook=self._build_forward_pre_hook(hook_type, name)
+            )
         else:
-            full_forward_name = name
-        full_backward_name = replace_last_occurrence(full_forward_name, Const.FORWARD, Const.BACKWARD)
-        hookset = HookSet(
-            forward_hook=self._build_forward_hook(hook_type, full_forward_name),
-            forward_pre_hook=self._build_forward_pre_hook(hook_type, full_forward_name, name),
-            backward_hook=self._build_backward_hook(hook_type, full_backward_name),
-            backward_pre_hook=self._build_backward_pre_hook(hook_type, full_backward_name)
+            full_backward_name = replace_last_occurrence(name, Const.FORWARD, Const.BACKWARD)
+            hook_set = HookSet(
+                forward_hook=self._build_forward_hook(hook_type, name),
+                backward_pre_hook=self._build_backward_pre_hook(hook_type, full_backward_name),
+                backward_hook=self._build_backward_hook(hook_type, full_backward_name)
+            )
+        return hook_set
+    def _register_forward_hook(self, module, api_name):
+        if not hasattr(module, 'msprobe_forward_hook'):
+            forward_hook = self._build_forward_hook(Const.API, api_name)
+            if ms_version < "2.6.0" and not is_mindtorch():
+                getattr(module, "_forward_hook", {})[id(module)] = forward_hook
+            else:
+                module.register_forward_hook(forward_hook)
+            setattr(module, 'msprobe_forward_hook', True)
+    def _register_backward_hook(self, module, full_backward_name, args):
+        if not _pynative_executor.requires_grad():
+            return args
+        enable_hooked = sum(
+            [isinstance(ele, Tensor) and ele.dtype not in MsConst.NonDifferentiableType for ele in args]
+        )
+        if enable_hooked:
+            backward_hook_dict = OrderedDict()
+            backward_hook_dict[full_backward_name] = self._build_backward_hook(Const.API, full_backward_name)
+            MindsporeHookManager.cell_backward_hook.append(backward_hook_dict)
+            bw_hook = inner.CellBackwardHook(full_backward_name, module, MindsporeHookManager.cell_backward_hook[-1])
+            bw_hook.register_backward_hook()
+            MindsporeHookManager.cell_bw_hook_kernels[full_backward_name] = bw_hook
+            args = bw_hook(args) if is_backward_hook_output_a_view() else bw_hook(*args)
+        return args
+    def _register_backward_pre_hook(self, module, full_backward_name, output):
+        if not _pynative_executor.requires_grad():
+            return output
+        bw_hook = MindsporeHookManager.cell_bw_hook_kernels.get(full_backward_name)
+        if bw_hook:
+            if not isinstance(output, (Tensor, tuple)):
+                logger.debug("For backward hooks to be called, "
+                             "cell output should be a Tensor or a tuple of Tensors "
+                             f"but received {type(output)}")
+            if is_backward_hook_output_a_view():
+                new_outputs = bw_hook(output)
+            else:
+                if isinstance(output, tuple):
+                    new_outputs = bw_hook(*output)
+                else:
+                    new_outputs = bw_hook(output)
+                if isinstance(output, tuple) and len(output) == 1:
+                    new_outputs = (new_outputs,)
+            output = new_outputs
+        def get_backward_pre_hook(backward_pre_hook, backward_post_hook):
+            @ThreadSafe.synchronized
+            def backward_pre_hook_fn(cell, grad_output):
+                backward_pre_hook(cell, grad_output)
+                if backward_post_hook:
+                    backward_post_hook(cell, (), grad_output)
+            return backward_pre_hook_fn
+        backward_pre_hook = self._build_backward_pre_hook(Const.API, full_backward_name)
+        backward_post_hook = None if bw_hook else self._build_backward_hook(Const.API, full_backward_name)
+        backward_pre_hook_dict = OrderedDict()
+        backward_pre_hook_dict[full_backward_name] = get_backward_pre_hook(
+            backward_pre_hook,
+            backward_post_hook
+        )
+        MindsporeHookManager.cell_backward_pre_hook.append(backward_pre_hook_dict)
+        bw_pre_hook = inner.CellBackwardHook(
+            full_backward_name,
+            module,
+            MindsporeHookManager.cell_backward_pre_hook[-1]
         )
-        return hookset
+        bw_pre_hook.register_backward_pre_hook()
+        if is_backward_hook_output_a_view():
+            result = bw_pre_hook(output)
+        else:
+            if isinstance(output, tuple):
+                result = bw_pre_hook(*output)
+            else:
+                result = bw_pre_hook(output)
+            if isinstance(output, tuple):
+                if len(output) == 1:
+                    result = (result,)
+                if len(result) != len(output):
+                    raise TypeError(
+                        f"The backward pre hook return value size is {len(result)} "
+                        f"not equal to output size {len(output)}"
+                    )
+        return result
     def _need_exchange(self, module):
         if not hasattr(module, 'has_pre_hook_called') or not module.has_pre_hook_called:
@@ -71,23 +189,26 @@ class MindsproeHookManager(BaseHookManager):
         params_dict = {}
         if self.config.task != Const.STRUCTURE:
             params_dict = {
-                    key.split(Const.SEP)[-1]: value
-                    for key, value in module.parameters_dict(recurse=False).items()
-                    }
+                key.split(Const.SEP)[-1]: value
+                for key, value in module.parameters_dict(recurse=False).items()
+            }
         return params_dict
-    def _build_backward_pre_hook(self, hook_type, name):
+    def _build_backward_pre_hook(self, hook_type, full_name):
         def backward_pre_hook(module, grad_input):
             if self.config.level != Const.LEVEL_L2:
                 return
             tid = threading.get_ident()
-            if not self._should_execute_hook(hook_type, module, False, tid):
+            if not self._should_execute_hook(tid):
                 return
             with ThreadSafe():
+                original_state = self.ensure_gc_enabled()
                 BaseHookManager.inner_switch[tid] = True
                 module_input = ModuleBackwardInputs(grad_input=grad_input)
-                self.data_collector.update_api_or_module_name(name)
-                self.data_collector.backward_input_data_collect(name, module, self._pid, module_input)
+                self.data_collector.update_api_or_module_name(full_name)
+                self.data_collector.backward_input_data_collect(full_name, module, self._pid, module_input)
                 BaseHookManager.inner_switch[tid] = False
+                self.restore_gc_state(original_state)
         return backward_pre_hook

msprobe/mindspore/dump/kernel_kbyk_dump.py CHANGED Viewed

@@ -20,6 +20,9 @@ from msprobe.core.common.file_utils import create_directory, save_json
 from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+import mindspore as ms
+ms_version = ms.__version__
 class KernelKbykDump:
     COMMON_SETTINGS = "common_dump_settings"
@@ -39,6 +42,7 @@ class KernelKbykDump:
         common_set["input_output"] = 0
         common_set["kernels"] = []
         common_set["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+        common_set["statistic_category"] = []
         if config.stat_cal_mode and config.device_stat_precision_mode:
             e2e_set = {
@@ -71,10 +75,30 @@ class KernelKbykDump:
                 common_set["input_output"] = 1
             if config.data_mode[0] == Const.OUTPUT:
                 common_set["input_output"] = 2
+        if config.summary_mode:
+            if isinstance(config.summary_mode, str):
+                if config.summary_mode == Const.STATISTICS:
+                    common_set["statistic_category"] = ["max", "min", "avg", "l2norm"]
+                else:
+                    mode = self._process_hash(config.summary_mode)
+                    common_set["statistic_category"] = [mode]
+            elif isinstance(config.summary_mode, list):
+                common_set["statistic_category"] = list({
+                    self._process_hash("avg" if mode == "mean" else mode)
+                    for mode in config.summary_mode
+                })
         self.dump_json[KernelKbykDump.COMMON_SETTINGS] = common_set
         self.dump_json[KernelKbykDump.E2E_SETTINGS] = e2e_set
+    @staticmethod
+    def _process_hash(value):
+        if ms_version <= "2.7.0" and (value == Const.HASH or value == Const.MD5):
+            value = "md5"
+        elif value == Const.MD5:
+            value = "hash:md5"
+        return value
     def handle(self):
         json_path = self.dump_json[KernelKbykDump.COMMON_SETTINGS]["path"]
         create_directory(json_path)

msprobe/mindspore/exception_dump/__init__.py ADDED Viewed

File without changes

msprobe/mindspore/exception_dump/exception_dump_tool_factory.py ADDED Viewed

@@ -0,0 +1,51 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from msprobe.core.common.log import logger
+from msprobe.mindspore.common.const import Const
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.exception_dump.kernel_graph_exception_dump import KernelGraphExceptionDump
+class ExceptionDumpToolFactory:
+    tools = {
+        Const.CELL: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.API: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.KERNEL: {
+            Const.GRAPH_KBYK_MODE: KernelGraphExceptionDump,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: KernelGraphExceptionDump
+        }
+    }
+    @staticmethod
+    def create(config: DebuggerConfig):
+        tool = ExceptionDumpToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("Valid level is needed.")
+        tool = tool.get(config.execution_mode)
+        if not tool:
+            logger.error(f"Exception dump is not supported in {config.execution_mode} mode "
+                         f"when level is {config.level}.")
+            raise ValueError
+        return (tool(config),)

msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py ADDED Viewed

@@ -0,0 +1,57 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from msprobe.core.common.file_utils import create_directory, save_json
+from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+class KernelGraphExceptionDump:
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+        self.dump_json["common_dump_settings"]["op_debug_mode"] = 4
+        self.dump_json["common_dump_settings"]["file_format"] = "npy"
+        self.dump_json["e2e_dump_settings"] = dict()
+        self.dump_json["e2e_dump_settings"]["enable"] = not config.async_dump
+        self.dump_json["e2e_dump_settings"]["trans_flag"] = True
+        if config.stat_cal_mode and config.device_stat_precision_mode:
+            self.dump_json["e2e_dump_settings"]["stat_calc_mode"] = config.stat_cal_mode
+            self.dump_json["e2e_dump_settings"]["device_stat_precision_mode"] = config.device_stat_precision_mode
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            logger.warning("Step would change to all in this task.")
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+    def handle(self):
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        create_directory(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_exception_check.json")
+        save_json(json_path, self.dump_json, indent=4)
+        logger.info(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path

msprobe/mindspore/free_benchmark/api_pynative_self_check.py CHANGED Viewed

@@ -16,6 +16,7 @@
 import functools
 import importlib
 import os
+import threading
 import traceback
 import mindspore as ms
@@ -38,7 +39,6 @@ from msprobe.mindspore.free_benchmark.common.utils import Tools
 from msprobe.mindspore.free_benchmark.handler.handler_factory import HandlerFactory
 from msprobe.mindspore.free_benchmark.perturbation.perturbation_factory import PerturbationFactory
 _api_register = get_api_register()
@@ -74,9 +74,10 @@ class ApiPyNativeSelfCheck:
         def forward_hook(api_name_with_id, cell, input_data, output_data):
             ret = None
+            tid = threading.get_ident()
             if not need_wrapper_func():
-                del cell.msprobe_input_kwargs
+                del cell.msprobe_input_kwargs[tid]
                 return ret
             api_name_with_id = api_name_with_id[:-1]
@@ -85,9 +86,9 @@ class ApiPyNativeSelfCheck:
                         api_name_with_id[api_name_with_id.find(Const.SEP) + 1:api_name_with_id.rfind(Const.SEP)])
             if api_name in self.api_list:
                 ret = check_self(api_name_with_id, output_data, self.ori_func.get(api_name),
-                                 *input_data, **cell.msprobe_input_kwargs)
+                                 *input_data, **cell.msprobe_input_kwargs[tid])
-            del cell.msprobe_input_kwargs
+            del cell.msprobe_input_kwargs[tid]
             return ret
         def backward_hook(cell, grad_input, grad_output):

msprobe/mindspore/mindspore_service.py CHANGED Viewed

@@ -27,7 +27,7 @@ from msprobe.mindspore.common.utils import (
     get_cells_and_names_with_index
 )
 from msprobe.mindspore.dump.hook_cell.api_register import get_api_register, ApiTemplate
-from msprobe.mindspore.dump.hook_cell.ms_hook_manager import MindsproeHookManager
+from msprobe.mindspore.dump.hook_cell.ms_hook_manager import MindsporeHookManager
 from msprobe.mindspore.dump.hook_cell.primitive_hooks import PrimitiveHookService
 from msprobe.mindspore.dump.jit_dump import JitDump
@@ -59,7 +59,7 @@ class MindsporeService(BaseService):
         self.api_register = get_api_register()
         self.primitive_hook_service = PrimitiveHookService(self)
         self.cell_processor = CellProcessor(self.data_collector.scope)
-        self.hook_manager = MindsproeHookManager(self.data_collector, self.config)
+        self.hook_manager = MindsporeHookManager(self.data_collector, self.config)
         self._setup_jit_context()
         self.api_template = ApiTemplate

msprobe/mindspore/mindtorch/mindtorch_adaptor.py CHANGED Viewed

@@ -93,6 +93,8 @@ from torch.nn.modules.module import (_global_backward_pre_hooks, _global_backwar
                                      _global_forward_hooks, _global_forward_hooks_always_called)
 from torch.utils.hooks import RemovableHandle
+from msprobe.mindspore.common.utils import is_backward_hook_output_a_view
 def _call_impl(self, *args, **kwargs):
     forward_call = self.forward
@@ -245,11 +247,14 @@ def _get_backward_hooks(self):
 def apply_backward_hook_on_tensors(cell_backward_hook, args):
-    is_tuple = True
-    if not isinstance(args, tuple):
-        args = (args,)
-        is_tuple = False
-    hooked_args = cell_backward_hook(*args)
-    if is_tuple and len(args) == 1:
-        hooked_args = (hooked_args, )
+    if is_backward_hook_output_a_view():
+        hooked_args = cell_backward_hook(args)
+    else:
+        is_tuple = True
+        if not isinstance(args, tuple):
+            args = (args,)
+            is_tuple = False
+        hooked_args = cell_backward_hook(*args)
+        if is_tuple and len(args) == 1:
+            hooked_args = (hooked_args, )
     return hooked_args

msprobe/mindspore/monitor/features.py CHANGED Viewed

@@ -17,6 +17,8 @@ from mindspore import mint, ops, _no_grad
 from mindspore import Tensor
 from mindspore import dtype as mstype
+from msprobe.core.common.log import logger
 @_no_grad()
 def square_sum(x: Tensor):
@@ -74,3 +76,83 @@ FUNC_MAP = {
     "shape": get_shape,
     "dtype": get_dtype
 }
+def max_eigenvalue(input_tensor: Tensor, num_iterations=3):
+    input_tensor = input_tensor.float()
+    try:
+        check_tensor_dim(input_tensor, 2)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"calcute max eigenvalue failed, {e}")
+        return Tensor(0)
+    in_features = input_tensor.shape[1]
+    u_tensor = ops.randn(in_features)
+    u_norm = u_tensor.norm()
+    if u_norm == 0:
+        return Tensor(0)
+    u_tensor /= u_tensor.norm()
+    input_seq = ops.matmul(input_tensor.T, input_tensor)
+    for _ in range(num_iterations):
+        v_tensor = ops.matmul(input_seq, u_tensor)
+        spectral_norm = ops.matmul(v_tensor.T, u_tensor)
+        v_norm = v_tensor.norm()
+        if v_norm > 0:
+            u_tensor = v_tensor / v_norm
+        else:
+            spectral_norm = Tensor(0)
+            break
+    return spectral_norm.sqrt()
+def check_tensor_dim(tensor, n):
+    if not isinstance(tensor, Tensor):
+        raise TypeError(
+            f"Input must be a mindspore Tensor, but got {type(tensor)} instead."
+            )
+    if len(tensor.shape) < n:
+        raise ValueError(
+            f"tensor dim must be at least {n} dimensions."
+            f"Got shape: {tuple(tensor.shape)} with {tensor.dim()} dims"
+            )
+def cal_entropy(qk_tensor: Tensor, mask=None):
+    try:
+        check_tensor_dim(qk_tensor, 2)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"calculate entropy failed, {e}")
+        return Tensor(0), Tensor(0)
+    if mask is None:
+        mask = ops.tril(ops.ones((qk_tensor.shape[1], qk_tensor.shape[1])))
+    qk_tensor = qk_tensor - ops.amax(qk_tensor, axis=1, keepdims=True)
+    qk_tensor = qk_tensor.masked_fill(mask == 0, float('-inf'))
+    softmax_qkt = ops.softmax(qk_tensor.float(), axis=1)
+    softmax_max = ops.mean(ops.amax(softmax_qkt, axis=1))
+    entropy = ops.mean(-ops.nansum(softmax_qkt * ops.log(softmax_qkt), axis=1))
+    return entropy, softmax_max
+def cal_stable_rank(weight: Tensor):
+    eig = max_eigenvalue(weight)
+    if eig == Tensor(0):
+        return Tensor(0), Tensor(0)
+    f_norm = ops.norm(weight, ord='fro')
+    return f_norm / eig, eig
+def cal_qkt(q_h: Tensor, k_h: Tensor, order="s,b,h,d"):
+    # q_h shape is (s, b, h, d)
+    try:
+        check_tensor_dim(q_h, 4)
+        check_tensor_dim(k_h, 4)
+    except (TypeError, ValueError) as e:
+        logger.warning(f"calculatee qkt failed, {e}")
+        return Tensor(0)
+    if order == "s,b,h,d":
+        qkt = ops.matmul(q_h[:, 0, 0, :], k_h[:, 0, 0, :].t()) / q_h.shape[-1] ** 0.5
+    elif order == "b,s,h,d":
+        qkt = ops.matmul(q_h[0, :, 0, :], k_h[0, :, 0, :].t()) / q_h.shape[-1] ** 0.5
+    else:
+        logger.warning(f"Calculate qk tensor failed: Order unsupported.")
+        qkt = Tensor(0)
+    return qkt

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl

mindstudio-probe 8.1.2__py3-none-any.whl → 8.2.1__py3-none-any.whl