mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
  2. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
  3. msprobe/README.md +32 -1
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +120 -21
  6. msprobe/core/common/exceptions.py +2 -2
  7. msprobe/core/common/file_utils.py +279 -50
  8. msprobe/core/common/framework_adapter.py +169 -0
  9. msprobe/core/common/global_lock.py +86 -0
  10. msprobe/core/common/runtime.py +25 -0
  11. msprobe/core/common/utils.py +136 -45
  12. msprobe/core/common_config.py +7 -0
  13. msprobe/core/compare/acc_compare.py +646 -428
  14. msprobe/core/compare/check.py +36 -103
  15. msprobe/core/compare/compare_cli.py +4 -0
  16. msprobe/core/compare/config.py +72 -0
  17. msprobe/core/compare/highlight.py +215 -215
  18. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
  19. msprobe/core/compare/merge_result/merge_result.py +4 -4
  20. msprobe/core/compare/multiprocessing_compute.py +223 -110
  21. msprobe/core/compare/npy_compare.py +2 -4
  22. msprobe/core/compare/utils.py +214 -244
  23. msprobe/core/config_check/__init__.py +17 -0
  24. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  25. msprobe/core/config_check/checkers/base_checker.py +60 -0
  26. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  27. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  28. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  29. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  30. msprobe/core/config_check/checkers/random_checker.py +367 -0
  31. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  32. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  33. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  34. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  35. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  36. msprobe/core/config_check/config_check_cli.py +51 -0
  37. msprobe/core/config_check/config_checker.py +100 -0
  38. msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
  39. msprobe/core/config_check/resource/env.yaml +57 -0
  40. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  41. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  42. msprobe/core/config_check/utils/utils.py +107 -0
  43. msprobe/core/data_dump/api_registry.py +67 -4
  44. msprobe/core/data_dump/data_collector.py +170 -89
  45. msprobe/core/data_dump/data_processor/base.py +72 -51
  46. msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
  47. msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
  48. msprobe/core/data_dump/json_writer.py +143 -27
  49. msprobe/core/debugger/precision_debugger.py +144 -0
  50. msprobe/core/grad_probe/constant.py +1 -1
  51. msprobe/core/grad_probe/grad_compare.py +1 -1
  52. msprobe/core/grad_probe/utils.py +1 -1
  53. msprobe/core/hook_manager.py +242 -0
  54. msprobe/core/monitor/anomaly_processor.py +384 -0
  55. msprobe/core/service.py +357 -0
  56. msprobe/core/single_save/__init__.py +0 -0
  57. msprobe/core/single_save/single_comparator.py +243 -0
  58. msprobe/core/single_save/single_saver.py +146 -0
  59. msprobe/docs/01.installation.md +6 -5
  60. msprobe/docs/02.config_introduction.md +79 -22
  61. msprobe/docs/03.config_examples.md +1 -0
  62. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  63. msprobe/docs/05.data_dump_PyTorch.md +118 -49
  64. msprobe/docs/06.data_dump_MindSpore.md +167 -20
  65. msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
  66. msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
  67. msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
  68. msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
  69. msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
  70. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  71. msprobe/docs/13.overflow_check_MindSpore.md +2 -2
  72. msprobe/docs/14.data_parse_PyTorch.md +3 -3
  73. msprobe/docs/17.grad_probe.md +2 -1
  74. msprobe/docs/18.online_dispatch.md +2 -2
  75. msprobe/docs/19.monitor.md +90 -44
  76. msprobe/docs/21.visualization_PyTorch.md +68 -15
  77. msprobe/docs/22.visualization_MindSpore.md +71 -18
  78. msprobe/docs/25.tool_function_introduction.md +23 -22
  79. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  80. msprobe/docs/27.dump_json_instruction.md +1 -1
  81. msprobe/docs/28.debugger_save_instruction.md +111 -20
  82. msprobe/docs/29.data_dump_MSAdapter.md +2 -2
  83. msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
  84. msprobe/docs/31.config_check.md +95 -0
  85. msprobe/docs/32.ckpt_compare.md +69 -0
  86. msprobe/docs/33.generate_operator_MindSpore.md +181 -0
  87. msprobe/docs/34.RL_collect.md +92 -0
  88. msprobe/docs/35.nan_analyze.md +72 -0
  89. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  90. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  91. msprobe/docs/img/compare_result.png +0 -0
  92. msprobe/docs/img/save_compare_result_sample.png +0 -0
  93. msprobe/docs/img/visualization/proxy.png +0 -0
  94. msprobe/mindspore/__init__.py +1 -2
  95. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
  96. msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
  97. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
  98. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  99. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  100. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
  101. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
  102. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  103. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
  104. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  105. msprobe/mindspore/cell_processor.py +204 -33
  106. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  107. msprobe/mindspore/common/const.py +17 -7
  108. msprobe/mindspore/common/utils.py +128 -11
  109. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  110. msprobe/mindspore/compare/distributed_compare.py +2 -26
  111. msprobe/mindspore/compare/ms_compare.py +17 -405
  112. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  113. msprobe/mindspore/compare/utils.py +37 -0
  114. msprobe/mindspore/debugger/debugger_config.py +53 -3
  115. msprobe/mindspore/debugger/precision_debugger.py +72 -91
  116. msprobe/mindspore/dump/cell_dump_process.py +877 -0
  117. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
  118. msprobe/mindspore/dump/dump_tool_factory.py +13 -5
  119. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  120. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  121. msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
  122. msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
  123. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  124. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  125. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
  126. msprobe/mindspore/dump/jit_dump.py +21 -18
  127. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  128. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  129. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
  130. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
  131. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  132. msprobe/mindspore/grad_probe/global_context.py +7 -2
  133. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  134. msprobe/mindspore/mindspore_service.py +114 -0
  135. msprobe/mindspore/monitor/common_func.py +52 -0
  136. msprobe/mindspore/monitor/data_writers.py +237 -0
  137. msprobe/mindspore/monitor/features.py +20 -7
  138. msprobe/mindspore/monitor/module_hook.py +281 -209
  139. msprobe/mindspore/monitor/optimizer_collect.py +334 -0
  140. msprobe/mindspore/monitor/utils.py +25 -5
  141. msprobe/mindspore/ms_config.py +16 -15
  142. msprobe/mindspore/task_handler_factory.py +5 -2
  143. msprobe/msprobe.py +19 -0
  144. msprobe/nan_analyze/__init__.py +14 -0
  145. msprobe/nan_analyze/analyzer.py +255 -0
  146. msprobe/nan_analyze/graph.py +189 -0
  147. msprobe/nan_analyze/utils.py +211 -0
  148. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  149. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  150. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  151. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
  152. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
  153. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
  161. msprobe/pytorch/attl_manager.py +65 -0
  162. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  163. msprobe/pytorch/common/utils.py +26 -14
  164. msprobe/pytorch/compare/distributed_compare.py +4 -36
  165. msprobe/pytorch/compare/pt_compare.py +13 -84
  166. msprobe/pytorch/compare/utils.py +47 -0
  167. msprobe/pytorch/debugger/debugger_config.py +34 -17
  168. msprobe/pytorch/debugger/precision_debugger.py +66 -118
  169. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  170. msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
  171. msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
  172. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  173. msprobe/pytorch/hook_module/api_register.py +29 -5
  174. msprobe/pytorch/hook_module/hook_module.py +9 -18
  175. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  176. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  177. msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
  178. msprobe/pytorch/hook_module/utils.py +28 -2
  179. msprobe/pytorch/monitor/csv2tb.py +6 -2
  180. msprobe/pytorch/monitor/data_writers.py +259 -0
  181. msprobe/pytorch/monitor/module_hook.py +227 -158
  182. msprobe/pytorch/monitor/module_metric.py +14 -0
  183. msprobe/pytorch/monitor/optimizer_collect.py +242 -270
  184. msprobe/pytorch/monitor/utils.py +16 -3
  185. msprobe/pytorch/online_dispatch/dispatch.py +4 -2
  186. msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
  187. msprobe/pytorch/parse_tool/lib/utils.py +3 -3
  188. msprobe/pytorch/pt_config.py +8 -7
  189. msprobe/pytorch/pytorch_service.py +73 -0
  190. msprobe/visualization/builder/graph_builder.py +33 -13
  191. msprobe/visualization/builder/msprobe_adapter.py +24 -11
  192. msprobe/visualization/compare/graph_comparator.py +53 -45
  193. msprobe/visualization/compare/mode_adapter.py +31 -1
  194. msprobe/visualization/graph/base_node.py +3 -3
  195. msprobe/visualization/graph/graph.py +2 -2
  196. msprobe/visualization/graph_service.py +250 -103
  197. msprobe/visualization/utils.py +27 -11
  198. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
  199. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  200. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  201. msprobe/mindspore/service.py +0 -549
  202. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  203. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  204. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  205. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  206. msprobe/pytorch/service.py +0 -473
  207. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  208. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  209. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  210. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
  211. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  212. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  213. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
@@ -15,15 +15,17 @@
15
15
 
16
16
  from msprobe.mindspore.common.const import Const
17
17
  from msprobe.core.common.log import logger
18
+ from msprobe.mindspore.common.utils import is_graph_mode_cell_dump_allowed
18
19
  from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
19
20
  from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump
20
21
  from msprobe.mindspore.dump.kernel_kbyk_dump import KernelKbykDump
22
+ from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump
21
23
 
22
24
 
23
25
  class DumpToolFactory:
24
26
  tools = {
25
27
  Const.CELL: {
26
- Const.GRAPH_KBYK_MODE: None,
28
+ Const.GRAPH_KBYK_MODE: GraphModeCellDump,
27
29
  Const.GRAPH_GE_MODE: None,
28
30
  Const.PYNATIVE_MODE: None
29
31
  },
@@ -40,9 +42,15 @@ class DumpToolFactory:
40
42
  }
41
43
 
42
44
  @staticmethod
43
- def create(config: DebuggerConfig):
44
- if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST:
45
- raise Exception("data_mode must be one of all, input, output.")
45
+ def create(config: DebuggerConfig, model=None):
46
+ if config.level == Const.CELL:
47
+ if not is_graph_mode_cell_dump_allowed(config):
48
+ raise Exception("Cell dump is not supported in graph mode.")
49
+ if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST:
50
+ raise Exception("data_mode must be one of all, forward, backward.")
51
+ else:
52
+ if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST:
53
+ raise Exception("data_mode must be one of all, input, output.")
46
54
  tool = DumpToolFactory.tools.get(config.level)
47
55
  if not tool:
48
56
  raise Exception("Valid level is needed.")
@@ -51,4 +59,4 @@ class DumpToolFactory:
51
59
  logger.error(f"Data dump is not supported in {config.execution_mode} mode "
52
60
  f"when dump level is {config.level}.")
53
61
  raise ValueError
54
- return tool(config)
62
+ return tool(config, model) if tool == GraphModeCellDump else tool(config)
@@ -0,0 +1,139 @@
1
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+
18
+ import mindspore as ms
19
+ from mindspore import hal, ops, Tensor
20
+ from mindspore.ops.primitive import _run_op
21
+
22
+ from msprobe.core.common.const import Const as CoreConst
23
+ from msprobe.core.common.runtime import Runtime
24
+ from msprobe.mindspore.common.const import Const
25
+ from msprobe.mindspore.common.log import logger
26
+ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
27
+ import msprobe.mindspore.dump.cell_dump_process as cellDumperWithDumpGradient
28
+ import msprobe.mindspore.dump.cell_dump_with_insert_gradient as cellDumperWithInsertGradient
29
+
30
+ tensordump_flag = True
31
+ try:
32
+ from mindspore._c_expression import _tensordump_set_step
33
+ except ImportError:
34
+ tensordump_flag = False
35
+
36
+
37
+ class GraphModeCellDump:
38
+ task = CoreConst.STATISTICS
39
+
40
+ def __init__(self, config: DebuggerConfig, model, strict=True):
41
+ self.net = model
42
+ self.white_list = []
43
+ self.black_list = []
44
+ self.execution_mode = config.execution_mode
45
+ self.dump_path = config.dump_path if config.dump_path else "./"
46
+ self.rank = config.rank
47
+ self.step = config.step
48
+ self.scope = config.scope
49
+ self.list = config.list
50
+ self.data_mode = config.data_mode
51
+ self.file_format = config.file_format
52
+ GraphModeCellDump.task = config.task
53
+ self.summary_mode = config.summary_mode
54
+ self.check_config(strict)
55
+ self.set_step()
56
+
57
+ @staticmethod
58
+ def step():
59
+ # 更新TensorDump Step
60
+ if GraphModeCellDump.task == CoreConst.TENSOR:
61
+ hal.synchronize()
62
+ temp_tensor = ms.Tensor([1], dtype=ms.float32)
63
+ step_flag = "<tensordump-update-step>"
64
+ _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
65
+ ops.tensordump(step_flag, temp_tensor)
66
+
67
+ def check_config(self, strict):
68
+ if not self.net:
69
+ raise Exception("The model is empty and cell dump is not enabled.")
70
+
71
+ if strict:
72
+ if self.rank:
73
+ raise Exception("In graph mode, cell dump does not currently support specifying rank.")
74
+ if self.scope:
75
+ raise Exception("In graph mode, cell dump does not currently support specifying scope.")
76
+ if self.list:
77
+ raise Exception("In graph mode, cell dump does not currently support specifying list.")
78
+ if len(self.data_mode) != 1 or self.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST:
79
+ raise Exception("In graph mode and cell dump, data_mode must be one of all, forword, backword.")
80
+ if self.file_format != []:
81
+ logger.warning("In graph mode, cell dump does not currently support specifying file_format."
82
+ " The file will be stored in npy format.")
83
+ if self.task == CoreConst.STATISTICS and self.summary_mode == CoreConst.MD5:
84
+ raise Exception("The L0 level statistics dump mode does not support "
85
+ "the calculation of md5 values currently In graph mode.")
86
+ else:
87
+ self.rank = []
88
+ self.scope = []
89
+ self.list = []
90
+ self.file_format = []
91
+ if len(self.data_mode) != 1 or self.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST:
92
+ self.data_mode = [CoreConst.ALL]
93
+ if self.task == CoreConst.STATISTICS and self.summary_mode == CoreConst.MD5:
94
+ self.summary_mode = CoreConst.STATISTICS
95
+
96
+ return True
97
+
98
+ def set_step(self):
99
+ if tensordump_flag:
100
+ _tensordump_set_step(self.step)
101
+ else:
102
+ raise Exception(
103
+ "Importing _tensordump_set_step failed, "
104
+ "please use the latest version package of MindSpore."
105
+ )
106
+
107
+ def handle(self):
108
+ os.environ['MS_JIT_MODULES'] = 'msprobe'
109
+
110
+ if Runtime.run_mode == Const.PYNATIVE_GRAPH_MODE:
111
+ dump_path = os.path.join(self.dump_path, Const.GRAPH_MODE)
112
+ else:
113
+ dump_path = self.dump_path
114
+
115
+ cell_dumper = cellDumperWithDumpGradient
116
+
117
+ if self.execution_mode == Const.PYNATIVE_MODE:
118
+ enable_dump_gradient = hasattr(ops, 'DumpGradient')
119
+ if hasattr(ops, 'DumpGradient'):
120
+ try:
121
+ ops.DumpGradient()('grad.npy', Tensor([0], dtype=ms.float32), 'in')
122
+ except Exception:
123
+ enable_dump_gradient = False
124
+ logger.warning('the DumpGradient operator failed to execute.')
125
+ if not enable_dump_gradient:
126
+ cell_dumper = cellDumperWithInsertGradient
127
+
128
+ dump_config = cell_dumper.CellDumpConfig(
129
+ net=self.net,
130
+ dump_path=dump_path,
131
+ data_mode=self.data_mode[0],
132
+ task=self.task,
133
+ summary_mode=self.summary_mode,
134
+ step=self.step
135
+ )
136
+
137
+ cell_dumper.start(
138
+ dump_config
139
+ )
@@ -0,0 +1,123 @@
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from collections import OrderedDict
18
+ import mindspore as ms
19
+
20
+
21
+ def _iterate_items(data):
22
+ if isinstance(data, (dict, OrderedDict)):
23
+ return data.items()
24
+ elif isinstance(data, (list, tuple)):
25
+ return enumerate(data)
26
+ else:
27
+ raise TypeError("Unsupported data type")
28
+
29
+
30
+ class _SaveBase:
31
+ def __init__(self, save_dir):
32
+ super(_SaveBase, self).__init__()
33
+ self.path = save_dir
34
+ self.save_func = _npy_save
35
+
36
+ def get_save_func(self):
37
+ return self.save_func
38
+
39
+
40
+ @ms.jit_class
41
+ class _SaveCell(_SaveBase):
42
+ def __call__(self, name, data):
43
+ return self.get_save_func()(self.path, name, data)
44
+
45
+
46
+ class _SaveGradBase:
47
+ def __init__(self, save_dir, name):
48
+ super(_SaveGradBase, self).__init__()
49
+ self.file = save_dir + name
50
+
51
+
52
+ @ms.jit_class
53
+ class _SaveGradCell(_SaveGradBase):
54
+ def __init__(self, save_dir, name):
55
+ super(_SaveGradCell, self).__init__(save_dir, name)
56
+ self.ms_save_grad = ms.ops.InsertGradientOf(
57
+ _wrapper_save_grad_func(self.file))
58
+
59
+ def __call__(self, x):
60
+ if isinstance(x, ms.Tensor):
61
+ return self.ms_save_grad(x)
62
+ else:
63
+ raise TypeError(f"For 'save_grad', the type of argument 'data' must be mindspore.Tensor or torch.tensor, "
64
+ f"but got {type(x)}")
65
+
66
+
67
+ def _npy_save_ops(file, data):
68
+ if isinstance(data, ms.Tensor):
69
+ if data.dtype == ms.bfloat16:
70
+ data = data.float()
71
+ ms.ops.TensorDump()(file, data)
72
+ else:
73
+ raise TypeError(f"For 'save', the type of argument 'data' must be mindspore.Tensor or torch.tensor, "
74
+ f"but got {type(data)}")
75
+
76
+
77
+ def _wrapper_save_grad_func(file):
78
+ def _save_grad_func(grad):
79
+ data = grad
80
+ if data.dtype == ms.bfloat16:
81
+ data = data.float()
82
+ ms.ops.TensorDump()(file, data)
83
+ return grad
84
+ return _save_grad_func
85
+
86
+
87
+ def _npy_save(save_dir, item_name, data):
88
+ if isinstance(data, (list, tuple, dict, OrderedDict)):
89
+ for key, val in _iterate_items(data):
90
+ _npy_save(save_dir, f"{item_name}.{key}", val)
91
+ else:
92
+ if data is None:
93
+ return
94
+ _npy_save_ops(f"{save_dir}{item_name}", data)
95
+
96
+
97
+ def generate_dump_dir(save_dir, sep=os.sep):
98
+ """
99
+ usage: generate dump directory path str in mindspore graph mode
100
+ """
101
+ full_suffix = '{step}' + sep + '{rank}' + sep
102
+ if save_dir and save_dir[-1] != sep:
103
+ result_dir = save_dir + sep + full_suffix
104
+ else:
105
+ result_dir = save_dir + full_suffix
106
+ return result_dir
107
+
108
+
109
+ def save(save_dir, name, data):
110
+ """
111
+ save tensor.
112
+ """
113
+ dump_dir = generate_dump_dir(save_dir)
114
+ _SaveCell(dump_dir)(name, data)
115
+
116
+
117
+ def save_grad(save_dir, name, data):
118
+ """
119
+ save grad.
120
+ """
121
+ dump_dir = generate_dump_dir(save_dir)
122
+ suffix_name = name + '_grad'
123
+ return _SaveGradCell(dump_dir, suffix_name)(data)
@@ -14,14 +14,17 @@
14
14
  # limitations under the License.
15
15
 
16
16
  import os
17
+ import inspect
17
18
 
18
19
  from mindspore import Tensor, ops, mint
20
+ from mindspore.mint import distributed
19
21
  from mindspore.mint.nn import functional
20
22
  from mindspore.communication import comm_func
21
23
 
22
24
  from msprobe.core.common.file_utils import load_yaml
23
25
  from msprobe.core.common.utils import Const
24
26
  from msprobe.core.data_dump.api_registry import ApiRegistry
27
+ from msprobe.mindspore.common.log import logger
25
28
  from msprobe.mindspore.common.const import Const as MsConst
26
29
  from msprobe.mindspore.common.utils import is_mindtorch
27
30
  from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
@@ -41,7 +44,8 @@ if not is_mindtorch():
41
44
  Const.MS_API_TYPE_TENSOR: (Tensor, (Tensor,)),
42
45
  Const.MS_API_TYPE_MINT: (mint, (mint,)),
43
46
  Const.MS_API_TYPE_MINT_FUNC: (functional, (functional,)),
44
- Const.MS_API_TYPE_COM: (comm_func, (comm_func,))
47
+ Const.MS_API_TYPE_COM: (comm_func, (comm_func,)),
48
+ Const.MS_API_TYPE_MINT_DIST: (distributed, (distributed,))
45
49
  }
46
50
  }
47
51
  if stub_tensor_existed:
@@ -50,6 +54,7 @@ if not is_mindtorch():
50
54
  )
51
55
 
52
56
  _supported_api_list_path = (os.path.join(cur_path, MsConst.SUPPORTED_API_LIST_FILE),)
57
+ _backlist = []
53
58
  else:
54
59
  import torch
55
60
  import torch_npu
@@ -64,13 +69,14 @@ else:
64
69
  }
65
70
  _supported_api_list_path = (os.path.join(cur_path, '../../../pytorch/hook_module',
66
71
  MsConst.SUPPORTED_API_LIST_FILE),)
72
+ _backlist = [f'{Const.PT_API_TYPE_TENSOR}.__setitem__']
67
73
 
68
74
  _inner_used_api = {
69
75
  Const.MS_FRAMEWORK + Const.SEP + Const.MS_API_TYPE_OPS: (
70
76
  ops, "norm", "square", "sqrt", "is_complex", "stack", "is_floating_point"
71
77
  ),
72
78
  Const.MS_FRAMEWORK + Const.SEP + Const.MS_API_TYPE_TENSOR: (
73
- Tensor, "to", "numel"
79
+ Tensor, "to", "numel", 'sum'
74
80
  ),
75
81
  Const.MS_FRAMEWORK + Const.SEP + Const.MS_API_TYPE_MINT: (
76
82
  mint, "max", "min", "mean", "norm"
@@ -84,6 +90,9 @@ class ApiTemplate(HOOKCell):
84
90
  self.api_func = api_func
85
91
  self.prefix_api_name = prefix + Const.SEP + str(api_name.split(Const.SEP)[-1]) + Const.SEP
86
92
  super().__init__(hook_build_func)
93
+ distributed_prefix = Const.DIST_API_TYPE_PREFIX if is_mindtorch() else Const.MINT_DIST_API_TYPE_PREFIX
94
+ if prefix == distributed_prefix:
95
+ self.op_is_distributed = True
87
96
 
88
97
  @staticmethod
89
98
  def async_to_sync(output):
@@ -103,9 +112,22 @@ class ApiTemplate(HOOKCell):
103
112
 
104
113
  output = self.api_func(*args, **kwargs)
105
114
 
106
- if self.prefix_api_name.startswith(MsConst.DISTRIBUTED_DATA_PREFIX):
107
- if kwargs.get("async_op") or self.api_name in ["isend", "irecv"]:
115
+ if self.prefix_api_name.startswith(
116
+ (MsConst.DISTRIBUTED_DATA_PREFIX, Const.MINT_DIST_API_TYPE_PREFIX)
117
+ ):
118
+ try:
119
+ bound = inspect.signature(self.api_func).bind(*args, **kwargs)
120
+ bound.apply_defaults()
121
+ use_async_op_flag = bound.arguments.get("async_op", False)
122
+ except Exception as e:
123
+ use_async_op_flag = False
124
+ logger.warning(f"fail to get dist api's func signature because {e}, no wait")
125
+
126
+ if use_async_op_flag or self.api_name in ["isend", "irecv"]:
108
127
  output = self.async_to_sync(output)
128
+ if self.api_name == "batch_isend_irecv" and isinstance(output, list):
129
+ output = [self.async_to_sync(handle) for handle in output]
130
+
109
131
  return output
110
132
 
111
133
  def forward(self, *args, **kwargs):
@@ -134,9 +156,21 @@ def get_api_register(return_new=False):
134
156
  stub_tensor_set = True
135
157
 
136
158
  if return_new:
137
- return ApiRegistry(_api_types, _inner_used_api, _supported_api_list_path, ApiTemplate)
159
+ return ApiRegistry(
160
+ _api_types,
161
+ _inner_used_api,
162
+ _supported_api_list_path,
163
+ ApiTemplate,
164
+ _backlist
165
+ )
138
166
 
139
167
  global api_register
140
168
  if api_register is None:
141
- api_register = ApiRegistry(_api_types, _inner_used_api, _supported_api_list_path, ApiTemplate)
169
+ api_register = ApiRegistry(
170
+ _api_types,
171
+ _inner_used_api,
172
+ _supported_api_list_path,
173
+ ApiTemplate,
174
+ _backlist
175
+ )
142
176
  return api_register
@@ -15,11 +15,16 @@
15
15
 
16
16
  from collections import defaultdict
17
17
 
18
+ import mindspore as ms
18
19
  from mindspore import nn
19
20
 
21
+ from msprobe.core.common.runtime import Runtime
20
22
  from msprobe.mindspore.common.utils import is_mindtorch, register_backward_hook_functions
21
23
 
22
24
 
25
+ ms_version = ms.__version__
26
+
27
+
23
28
  def add_cell_count(name):
24
29
  HOOKCell.cell_count[name] += 1
25
30
 
@@ -31,25 +36,31 @@ def get_cell_count(name):
31
36
  def __init__(self, hook_build_func) -> None:
32
37
  super(HOOKCell, self).__init__()
33
38
  self.changed_status = False
34
- self.input_kwargs = {}
39
+ self.msprobe_input_kwargs = {}
35
40
  if not HOOKCell.g_stop_hook:
36
41
  HOOKCell.g_stop_hook = True
37
42
  self.changed_status = True
38
43
  self.forward_data_collected = False
39
44
 
45
+ if not Runtime.is_running:
46
+ return
40
47
  prefix = self.prefix_api_name if hasattr(self, "prefix_api_name") else ""
41
48
  if callable(hook_build_func):
42
- forward_pre_hook, forward_hook, backward_hook, backward_pre_hook = hook_build_func(prefix)
43
- self.register_forward_pre_hook(forward_pre_hook)
44
- self.register_forward_hook(forward_hook)
45
- register_backward_hook_functions["full"](self, backward_hook)
46
- register_backward_hook_functions["pre"](self, backward_pre_hook)
49
+ hook_set = hook_build_func(prefix)
50
+ if ms_version < "2.6.0" and not is_mindtorch():
51
+ getattr(self, "_forward_pre_hook", {})[id(self)] = hook_set.forward_pre_hook
52
+ getattr(self, "_forward_hook", {})[id(self)] = hook_set.forward_hook
53
+ else:
54
+ self.register_forward_pre_hook(hook_set.forward_pre_hook)
55
+ self.register_forward_hook(hook_set.forward_hook)
56
+ register_backward_hook_functions["full"](self, hook_set.backward_hook)
57
+ register_backward_hook_functions["pre"](self, hook_set.backward_pre_hook)
47
58
 
48
59
 
49
60
  # 重载call,加全局标志。
50
61
  def __call__(self, *args, **kwargs):
51
62
  try:
52
- self.input_kwargs = kwargs
63
+ self.msprobe_input_kwargs = kwargs
53
64
  out = super(HOOKCell, self).__call__(*args, **kwargs)
54
65
  except Exception as e:
55
66
  raise e
@@ -0,0 +1,88 @@
1
+ # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from mindspore.common.api import _no_grad
17
+ from msprobe.core.common.const import Const
18
+ from msprobe.core.common.utils import replace_last_occurrence
19
+ from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputs
20
+ from msprobe.core.hook_manager import BaseHookManager, HookSet
21
+ from msprobe.mindspore.common.utils import has_kwargs_in_forward_hook
22
+ from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
23
+
24
+
25
+ class MindsproeHookManager(BaseHookManager):
26
+ @property
27
+ def _is_recompute(self):
28
+ return None
29
+
30
+ @staticmethod
31
+ def _no_grad_context():
32
+ return _no_grad()
33
+
34
+ @staticmethod
35
+ def _add_count(name):
36
+ HOOKCell.add_cell_count(name)
37
+
38
+ @staticmethod
39
+ def _process_kwargs_and_output(module, hook_type, kwargs_or_output, output_or_kwargs):
40
+ if not has_kwargs_in_forward_hook() or hook_type == Const.API:
41
+ kwargs = module.msprobe_input_kwargs if hasattr(module, 'msprobe_input_kwargs') else {}
42
+ output = kwargs_or_output
43
+ else:
44
+ kwargs = kwargs_or_output
45
+ output = output_or_kwargs
46
+ return kwargs, output
47
+
48
+ def build_hook(self, hook_type, name):
49
+ if hook_type == Const.API:
50
+ full_forward_name = name + str(HOOKCell.get_cell_count(name)) + Const.SEP + Const.FORWARD
51
+ else:
52
+ full_forward_name = name
53
+ full_backward_name = replace_last_occurrence(full_forward_name, Const.FORWARD, Const.BACKWARD)
54
+ hookset = HookSet(
55
+ forward_hook=self._build_forward_hook(hook_type, full_forward_name),
56
+ forward_pre_hook=self._build_forward_pre_hook(hook_type, full_forward_name, name),
57
+ backward_hook=self._build_backward_hook(hook_type, full_backward_name),
58
+ backward_pre_hook=self._build_backward_pre_hook(hook_type, full_backward_name)
59
+ )
60
+ return hookset
61
+
62
+ def _need_exchange(self, module):
63
+ if not hasattr(module, 'has_pre_hook_called') or not module.has_pre_hook_called:
64
+ return False
65
+ else:
66
+ return True
67
+
68
+ def _get_params_dict(self, module):
69
+ params_dict = {}
70
+ if self.config.task != Const.STRUCTURE:
71
+ params_dict = {
72
+ key.split(Const.SEP)[-1]: value
73
+ for key, value in module.parameters_dict(recurse=False).items()
74
+ }
75
+ return params_dict
76
+
77
+ def _build_backward_pre_hook(self, hook_type, name):
78
+ def backward_pre_hook(module, grad_input):
79
+ if self.config.level != Const.LEVEL_L2:
80
+ return
81
+ if not self._should_execute_hook(hook_type, module, False):
82
+ return
83
+ BaseHookManager.inner_switch = True
84
+ module_input = ModuleBackwardInputs(grad_input=grad_input)
85
+ self.data_collector.update_api_or_module_name(name)
86
+ self.data_collector.backward_input_data_collect(name, module, self._pid, module_input)
87
+ BaseHookManager.inner_switch = False
88
+ return backward_pre_hook
@@ -21,6 +21,7 @@ from mindspore.common.tensor import Tensor
21
21
  from msprobe.core.common.utils import Const, DumpException
22
22
  from msprobe.core.data_dump.data_processor.base import (ModuleBackwardInputs, ModuleBackwardOutputs,
23
23
  ModuleForwardInputsOutputs)
24
+ from msprobe.core.hook_manager import BaseHookManager
24
25
  from msprobe.mindspore.common.log import logger
25
26
 
26
27
 
@@ -58,7 +59,7 @@ class PrimitiveHookService:
58
59
  def backward_hook(grad):
59
60
  captured_grads.extend(grad)
60
61
  backward_primitive_name = f"{updated_primitive_name}{Const.SEP}{Const.BACKWARD}"
61
-
62
+ self.service_instance.inner_switch = True
62
63
  try:
63
64
  if hook_type == Const.INPUT:
64
65
  self.service_instance.data_collector.update_api_or_module_name(backward_primitive_name)
@@ -77,6 +78,7 @@ class PrimitiveHookService:
77
78
  logger.error(f"This is a primitive op {hook_type}_backward dump error: {exception}, "
78
79
  f"updated_primitive_name: {updated_primitive_name}")
79
80
  raise DumpException(DumpException.BACKWARD_DATA_COLLECTION_ERROR) from exception
81
+ self.service_instance.inner_switch = False
80
82
 
81
83
  return backward_hook
82
84
 
@@ -137,6 +139,7 @@ class PrimitiveHookService:
137
139
 
138
140
  def pre_forward_hook(primitive_name, primitive_instance, args, kwargs):
139
141
  module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None)
142
+ self.service_instance.inner_switch = True
140
143
  try:
141
144
  self.service_instance.data_collector.forward_input_data_collect(
142
145
  primitive_name,
@@ -148,9 +151,11 @@ class PrimitiveHookService:
148
151
  logger.error(f"This is a primitive op dump error during forward input data collection: {exception}, "
149
152
  f"primitive_name: {primitive_name}")
150
153
  raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
154
+ self.service_instance.inner_switch = False
151
155
 
152
156
  def post_forward_hook(primitive_name, primitive_instance, args, kwargs, output):
153
157
  module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output)
158
+ self.service_instance.inner_switch = True
154
159
  try:
155
160
  self.service_instance.data_collector.forward_output_data_collect(
156
161
  primitive_name,
@@ -162,6 +167,7 @@ class PrimitiveHookService:
162
167
  logger.error(f"This is a primitive op dump error during forward output data collection: {exception}, "
163
168
  f"primitive_name: {primitive_name}")
164
169
  raise DumpException(DumpException.FORWARD_DATA_COLLECTION_ERROR) from exception
170
+ self.service_instance.inner_switch = False
165
171
 
166
172
  def wrapped_primitive_call(instance_self, *args, **kwargs):
167
173
  """
@@ -179,7 +185,7 @@ class PrimitiveHookService:
179
185
  current_count = self.primitive_counters.get(primitive_name, 0)
180
186
  updated_primitive_name = f"{Const.PRIMITIVE_PREFIX}{Const.SEP}{primitive_name}{Const.SEP}{current_count}"
181
187
 
182
- if not self.service_instance.primitive_switch:
188
+ if not self.service_instance.primitive_switch or BaseHookManager.inner_switch:
183
189
  return origin_func(*args, **kwargs)
184
190
 
185
191
  captured_grads_input, captured_grads_output = [], []
@@ -1025,3 +1025,21 @@ communication.comm_func:
1025
1025
  - recv
1026
1026
  - isend
1027
1027
  - irecv
1028
+
1029
+ mint.distributed:
1030
+ - send
1031
+ - recv
1032
+ - broadcast
1033
+ - all_reduce
1034
+ - reduce
1035
+ - all_gather
1036
+ - gather
1037
+ - isend
1038
+ - irecv
1039
+ - scatter
1040
+ - reduce_scatter
1041
+ - all_to_all_single
1042
+ - all_to_all
1043
+ - all_gather_into_tensor
1044
+ - reduce_scatter_tensor
1045
+ - batch_isend_irecv