mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
  2. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
  3. msprobe/README.md +32 -1
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +120 -21
  6. msprobe/core/common/exceptions.py +2 -2
  7. msprobe/core/common/file_utils.py +279 -50
  8. msprobe/core/common/framework_adapter.py +169 -0
  9. msprobe/core/common/global_lock.py +86 -0
  10. msprobe/core/common/runtime.py +25 -0
  11. msprobe/core/common/utils.py +136 -45
  12. msprobe/core/common_config.py +7 -0
  13. msprobe/core/compare/acc_compare.py +646 -428
  14. msprobe/core/compare/check.py +36 -103
  15. msprobe/core/compare/compare_cli.py +4 -0
  16. msprobe/core/compare/config.py +72 -0
  17. msprobe/core/compare/highlight.py +215 -215
  18. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
  19. msprobe/core/compare/merge_result/merge_result.py +4 -4
  20. msprobe/core/compare/multiprocessing_compute.py +223 -110
  21. msprobe/core/compare/npy_compare.py +2 -4
  22. msprobe/core/compare/utils.py +214 -244
  23. msprobe/core/config_check/__init__.py +17 -0
  24. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  25. msprobe/core/config_check/checkers/base_checker.py +60 -0
  26. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  27. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  28. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  29. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  30. msprobe/core/config_check/checkers/random_checker.py +367 -0
  31. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  32. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  33. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  34. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  35. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  36. msprobe/core/config_check/config_check_cli.py +51 -0
  37. msprobe/core/config_check/config_checker.py +100 -0
  38. msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
  39. msprobe/core/config_check/resource/env.yaml +57 -0
  40. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  41. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  42. msprobe/core/config_check/utils/utils.py +107 -0
  43. msprobe/core/data_dump/api_registry.py +67 -4
  44. msprobe/core/data_dump/data_collector.py +170 -89
  45. msprobe/core/data_dump/data_processor/base.py +72 -51
  46. msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
  47. msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
  48. msprobe/core/data_dump/json_writer.py +143 -27
  49. msprobe/core/debugger/precision_debugger.py +144 -0
  50. msprobe/core/grad_probe/constant.py +1 -1
  51. msprobe/core/grad_probe/grad_compare.py +1 -1
  52. msprobe/core/grad_probe/utils.py +1 -1
  53. msprobe/core/hook_manager.py +242 -0
  54. msprobe/core/monitor/anomaly_processor.py +384 -0
  55. msprobe/core/service.py +357 -0
  56. msprobe/core/single_save/__init__.py +0 -0
  57. msprobe/core/single_save/single_comparator.py +243 -0
  58. msprobe/core/single_save/single_saver.py +146 -0
  59. msprobe/docs/01.installation.md +6 -5
  60. msprobe/docs/02.config_introduction.md +79 -22
  61. msprobe/docs/03.config_examples.md +1 -0
  62. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  63. msprobe/docs/05.data_dump_PyTorch.md +118 -49
  64. msprobe/docs/06.data_dump_MindSpore.md +167 -20
  65. msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
  66. msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
  67. msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
  68. msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
  69. msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
  70. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  71. msprobe/docs/13.overflow_check_MindSpore.md +2 -2
  72. msprobe/docs/14.data_parse_PyTorch.md +3 -3
  73. msprobe/docs/17.grad_probe.md +2 -1
  74. msprobe/docs/18.online_dispatch.md +2 -2
  75. msprobe/docs/19.monitor.md +90 -44
  76. msprobe/docs/21.visualization_PyTorch.md +68 -15
  77. msprobe/docs/22.visualization_MindSpore.md +71 -18
  78. msprobe/docs/25.tool_function_introduction.md +23 -22
  79. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  80. msprobe/docs/27.dump_json_instruction.md +1 -1
  81. msprobe/docs/28.debugger_save_instruction.md +111 -20
  82. msprobe/docs/29.data_dump_MSAdapter.md +2 -2
  83. msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
  84. msprobe/docs/31.config_check.md +95 -0
  85. msprobe/docs/32.ckpt_compare.md +69 -0
  86. msprobe/docs/33.generate_operator_MindSpore.md +181 -0
  87. msprobe/docs/34.RL_collect.md +92 -0
  88. msprobe/docs/35.nan_analyze.md +72 -0
  89. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  90. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  91. msprobe/docs/img/compare_result.png +0 -0
  92. msprobe/docs/img/save_compare_result_sample.png +0 -0
  93. msprobe/docs/img/visualization/proxy.png +0 -0
  94. msprobe/mindspore/__init__.py +1 -2
  95. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
  96. msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
  97. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
  98. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  99. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  100. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
  101. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
  102. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  103. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
  104. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  105. msprobe/mindspore/cell_processor.py +204 -33
  106. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  107. msprobe/mindspore/common/const.py +17 -7
  108. msprobe/mindspore/common/utils.py +128 -11
  109. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  110. msprobe/mindspore/compare/distributed_compare.py +2 -26
  111. msprobe/mindspore/compare/ms_compare.py +17 -405
  112. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  113. msprobe/mindspore/compare/utils.py +37 -0
  114. msprobe/mindspore/debugger/debugger_config.py +53 -3
  115. msprobe/mindspore/debugger/precision_debugger.py +72 -91
  116. msprobe/mindspore/dump/cell_dump_process.py +877 -0
  117. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
  118. msprobe/mindspore/dump/dump_tool_factory.py +13 -5
  119. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  120. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  121. msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
  122. msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
  123. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  124. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  125. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
  126. msprobe/mindspore/dump/jit_dump.py +21 -18
  127. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  128. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  129. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
  130. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
  131. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  132. msprobe/mindspore/grad_probe/global_context.py +7 -2
  133. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  134. msprobe/mindspore/mindspore_service.py +114 -0
  135. msprobe/mindspore/monitor/common_func.py +52 -0
  136. msprobe/mindspore/monitor/data_writers.py +237 -0
  137. msprobe/mindspore/monitor/features.py +20 -7
  138. msprobe/mindspore/monitor/module_hook.py +281 -209
  139. msprobe/mindspore/monitor/optimizer_collect.py +334 -0
  140. msprobe/mindspore/monitor/utils.py +25 -5
  141. msprobe/mindspore/ms_config.py +16 -15
  142. msprobe/mindspore/task_handler_factory.py +5 -2
  143. msprobe/msprobe.py +19 -0
  144. msprobe/nan_analyze/__init__.py +14 -0
  145. msprobe/nan_analyze/analyzer.py +255 -0
  146. msprobe/nan_analyze/graph.py +189 -0
  147. msprobe/nan_analyze/utils.py +211 -0
  148. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  149. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  150. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  151. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
  152. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
  153. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
  161. msprobe/pytorch/attl_manager.py +65 -0
  162. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  163. msprobe/pytorch/common/utils.py +26 -14
  164. msprobe/pytorch/compare/distributed_compare.py +4 -36
  165. msprobe/pytorch/compare/pt_compare.py +13 -84
  166. msprobe/pytorch/compare/utils.py +47 -0
  167. msprobe/pytorch/debugger/debugger_config.py +34 -17
  168. msprobe/pytorch/debugger/precision_debugger.py +66 -118
  169. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  170. msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
  171. msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
  172. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  173. msprobe/pytorch/hook_module/api_register.py +29 -5
  174. msprobe/pytorch/hook_module/hook_module.py +9 -18
  175. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  176. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  177. msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
  178. msprobe/pytorch/hook_module/utils.py +28 -2
  179. msprobe/pytorch/monitor/csv2tb.py +6 -2
  180. msprobe/pytorch/monitor/data_writers.py +259 -0
  181. msprobe/pytorch/monitor/module_hook.py +227 -158
  182. msprobe/pytorch/monitor/module_metric.py +14 -0
  183. msprobe/pytorch/monitor/optimizer_collect.py +242 -270
  184. msprobe/pytorch/monitor/utils.py +16 -3
  185. msprobe/pytorch/online_dispatch/dispatch.py +4 -2
  186. msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
  187. msprobe/pytorch/parse_tool/lib/utils.py +3 -3
  188. msprobe/pytorch/pt_config.py +8 -7
  189. msprobe/pytorch/pytorch_service.py +73 -0
  190. msprobe/visualization/builder/graph_builder.py +33 -13
  191. msprobe/visualization/builder/msprobe_adapter.py +24 -11
  192. msprobe/visualization/compare/graph_comparator.py +53 -45
  193. msprobe/visualization/compare/mode_adapter.py +31 -1
  194. msprobe/visualization/graph/base_node.py +3 -3
  195. msprobe/visualization/graph/graph.py +2 -2
  196. msprobe/visualization/graph_service.py +250 -103
  197. msprobe/visualization/utils.py +27 -11
  198. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
  199. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  200. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  201. msprobe/mindspore/service.py +0 -549
  202. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  203. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  204. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  205. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  206. msprobe/pytorch/service.py +0 -473
  207. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  208. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  209. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  210. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
  211. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  212. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  213. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/pytorch/compare/pt_compare.py

@@ -1,4 +1,4 @@
- # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
  # All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,92 +13,21 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import os.path
+ from msprobe.core.compare.acc_compare import Comparator, ModeConfig, MappingConfig, setup_comparison
+ from msprobe.pytorch.compare.utils import read_pt_data

- import torch

- from msprobe.core.common.const import FileCheckConst
- from msprobe.core.common.exceptions import FileCheckException
- from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml
- from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
-     set_dump_path
- from msprobe.core.compare.acc_compare import Comparator, ModeConfig
- from msprobe.core.compare.utils import set_stack_json_path
- from msprobe.pytorch.common.log import logger
- from msprobe.pytorch.common.utils import load_pt
-
-
- class PTComparator(Comparator):
-     def __init__(self, mode_config, data_mapping=None):
-         super().__init__(mode_config)
-
-         self.stack_mode = mode_config.stack_mode
-         self.auto_analyze = mode_config.auto_analyze
-         self.fuzzy_match = mode_config.fuzzy_match
-         self.dump_mode = mode_config.dump_mode
-
-         self.frame_name = PTComparator.__name__
-         self.data_mapping = data_mapping
-         if isinstance(self.data_mapping, str) or self.data_mapping is None:
-             self.data_mapping_dict = self.load_mapping_file(self.data_mapping)
-         elif isinstance(self.data_mapping, dict):
-             self.data_mapping_dict = self.data_mapping
-         else:
-             raise TypeError(f"The type of parameter `data_mapping` must be dict, str or None, but got "
-                             f"{type(self.data_mapping)}")
-
-     @staticmethod
-     def load_mapping_file(mapping_file):
-         if isinstance(mapping_file, str):
-             mapping_dict = load_yaml(mapping_file)
-         else:
-             mapping_dict = {}
-         return mapping_dict
-
-     def read_npy_data(self, dir_path, file_name):
-         if not file_name:
-             return None
-         data_path = os.path.join(dir_path, file_name)
-         path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
-                                    FileCheckConst.PT_SUFFIX, False)
-         data_path = path_checker.common_check()
-         try:
-             # detach because numpy can not process gradient information
-             data_value = load_pt(data_path, to_cpu=True).detach()
-         except RuntimeError as e:
-             # catch exceptions raised by load_pt here
-             logger.error(f"Failed to load the .pt file at {data_path}.")
-             raise CompareException(CompareException.INVALID_FILE_ERROR) from e
-         except AttributeError as e:
-             # catch exceptions raised by the detach method here
-             logger.error(f"Failed to detach the loaded tensor.")
-             raise CompareException(CompareException.DETACH_ERROR) from e
-         if data_value.dtype == torch.bfloat16:
-             data_value = data_value.to(torch.float32)
-         data_value = data_value.numpy()
-         return data_value
+ def read_real_data(npu_dir, npu_data_name, bench_dir, bench_data_name, _) -> tuple:
+     n_value = read_pt_data(npu_dir, npu_data_name)
+     b_value = read_pt_data(bench_dir, bench_data_name)
+     return n_value, b_value


  def compare(input_param, output_path, **kwargs):
-     try:
-         auto_analyze = kwargs.get('auto_analyze', True)
-         fuzzy_match = kwargs.get('fuzzy_match', False)
-         data_mapping = kwargs.get('data_mapping', None)
-         suffix = kwargs.get('suffix', '')
-
-         set_dump_path(input_param)
-         dump_mode = get_dump_mode(input_param)
-         if "stack_json_path" in input_param:
-             stack_mode = kwargs.get('stack_mode', False)
-         else:
-             stack_mode = set_stack_json_path(input_param)  # set stack_mode and set "stack_json_path" in input_param
-         check_configuration_param(stack_mode, auto_analyze, fuzzy_match, input_param.get('is_print_compare_log', True))
-         create_directory(output_path)
-         check_compare_param(input_param, output_path, dump_mode, stack_mode)
-     except (CompareException, FileCheckException) as error:
-         logger.error('Compare failed. Please check the arguments and do it again!')
-         raise CompareException(error.code) from error
+     config = setup_comparison(input_param, output_path, **kwargs)

-     mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
-     pt_comparator = PTComparator(mode_config, data_mapping)
-     pt_comparator.compare_core(input_param, output_path, suffix=suffix)
+     mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match,
+                              config.dump_mode, config.compared_file_type)
+     mapping_config = MappingConfig(data_mapping=config.data_mapping)
+     pt_comparator = Comparator(read_real_data, mode_config, mapping_config)
+     pt_comparator.compare_core(input_param, output_path, suffix=config.suffix)
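
Note on the refactor above: the per-framework `PTComparator` is gone; the validation that used to live in the try/except block is now handled by `setup_comparison`, and `Comparator` takes a data-reading callable (`read_real_data`) plus `ModeConfig`/`MappingConfig`. A hedged usage sketch of the slimmed-down entry point follows; the JSON key names and paths are illustrative assumptions, not taken from this hunk.

```python
# Illustrative only: the key names and paths are assumptions; adjust to your own dump layout.
from msprobe.pytorch.compare.pt_compare import compare

input_param = {
    "npu_json_path": "./npu_dump/dump.json",      # assumed key name
    "bench_json_path": "./bench_dump/dump.json",  # assumed key name
    "is_print_compare_log": True,                 # key seen in the old validation code
}
compare(input_param, output_path="./compare_output", fuzzy_match=False)
```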
msprobe/pytorch/compare/utils.py (new file)

@@ -0,0 +1,47 @@
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+
+ import torch
+
+ from msprobe.core.common.utils import logger, CompareException
+ from msprobe.core.common.file_utils import FileChecker, FileCheckConst
+ from msprobe.pytorch.common.utils import load_pt
+
+
+ def read_pt_data(dir_path, file_name):
+     if not file_name:
+         return None
+
+     data_path = os.path.join(dir_path, file_name)
+     path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
+                                FileCheckConst.PT_SUFFIX, False)
+     data_path = path_checker.common_check()
+     try:
+         # detach because numpy can not process gradient information
+         data_value = load_pt(data_path, to_cpu=True).detach()
+     except RuntimeError as e:
+         # catch exceptions raised by load_pt here
+         logger.error(f"Failed to load the .pt file at {data_path}.")
+         raise CompareException(CompareException.INVALID_FILE_ERROR) from e
+     except AttributeError as e:
+         # catch exceptions raised by the detach method here
+         logger.error(f"Failed to detach the loaded tensor.")
+         raise CompareException(CompareException.DETACH_ERROR) from e
+     if data_value.dtype == torch.bfloat16:
+         data_value = data_value.to(torch.float32)
+     data_value = data_value.numpy()
+     return data_value
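
A minimal usage sketch of the new `read_pt_data` helper; the directory and file name below are hypothetical. The bfloat16 branch exists because NumPy has no bfloat16 dtype, so such tensors are cast to float32 before `numpy()` is called.

```python
# Hypothetical call; the directory and file name are illustrative only.
from msprobe.pytorch.compare.utils import read_pt_data

arr = read_pt_data("./dump_data/rank0", "Functional.linear.0.forward.input.0.pt")
if arr is not None:
    # bfloat16 dumps come back as float32 numpy arrays
    print(arr.shape, arr.dtype)
```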
msprobe/pytorch/debugger/debugger_config.py

@@ -13,11 +13,10 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import torch
-
  from msprobe.core.common.const import Const
  from msprobe.core.common.exceptions import MsprobeException
  from msprobe.pytorch.common.log import logger
+ from msprobe.pytorch.common.utils import is_torch_nn_module


  class DebuggerConfig:
@@ -60,6 +59,7 @@ class DebuggerConfig:
              if isinstance(task_config.online_run_ut_recompute, bool) else False

          self.check()
+         self._check_statistics_config(task_config)

          if self.level == Const.LEVEL_L2:
              self.is_backward_kernel_dump = False
@@ -78,10 +78,13 @@ class DebuggerConfig:
          if not isinstance(self.async_dump, bool):
              raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                     f"The parameters async_dump should be bool.")
-         if self.async_dump and self.task == Const.TENSOR and not self.list:
-             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                    f"The parameters async_dump is true in tensor task, the parameters list cannot be "
-                                    f"empty.")
+         if self.async_dump and self.task == Const.TENSOR:
+             if self.level == Const.LEVEL_DEBUG:
+                 self.list = []  # async_dump + debug level case ignore list
+             if not self.list and self.level != Const.LEVEL_DEBUG:
+                 raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                        f"The parameters async_dump is true in tensor task, the parameters list cannot be "
+                                        f"empty.")
          if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
              logger.warning_on_rank_0(
                  f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
@@ -93,25 +96,24 @@ class DebuggerConfig:
          self.check_kwargs()
          return True

-     def check_model(self, instance, start_model):
-         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
-             if instance.model is not None or start_model is not None:
-                 logger.info_on_rank_0(
-                     f"The current level is not L0 or mix level, so the model parameters will not be used.")
+     def check_model(self, instance, start_model, token_range=None):
+         instance.model = start_model if start_model is not None else instance.model
+         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
              return
-         if start_model is None and instance.model is None:
+
+         if instance.model is None:
              logger.error_on_rank_0(
-                 f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' parameter.")
+                 f"For level {self.level} or non-empty token_range, "
+                 f"PrecisionDebugger or start interface must receive a 'model' parameter.")
              raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")

-         instance.model = start_model if start_model is not None else instance.model
-         if isinstance(instance.model, torch.nn.Module):
+         if is_torch_nn_module(instance.model):
              return

          error_model = None
          if isinstance(instance.model, (list, tuple)):
              for model in instance.model:
-                 if not isinstance(model, torch.nn.Module):
+                 if not is_torch_nn_module(model):
                      error_model = model
                      break
          else:
@@ -119,7 +121,7 @@ class DebuggerConfig:

          if error_model is not None:
              error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
-                           f"type, currently there is a {type(error_model)} type.")
+                           f"type, currently there is an unsupported {type(error_model)} type.")
              raise MsprobeException(
                  MsprobeException.INVALID_PARAM_ERROR, error_info)

@@ -130,8 +132,23 @@
          if not self.list or len(self.list) != 1:
              raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                     f"When level is set to L2, the list must be configured as a list with one api name.")
+         if self.task != Const.TENSOR:
+             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                    f"When level is set to L2, the task must be set to tensor.")
+
          api_name = self.list[0]
          if api_name.endswith(Const.BACKWARD):
              self.is_backward_kernel_dump = True
              api_forward_name = api_name[:-len(Const.BACKWARD)] + Const.FORWARD
              self.list.append(api_forward_name)
+
+     def _check_statistics_config(self, task_config):
+         if self.task != Const.STATISTICS:
+             return
+         self.tensor_list = []
+         if not hasattr(task_config, "tensor_list"):
+             return
+         if self.level == Const.LEVEL_DEBUG and task_config.tensor_list:
+             logger.warning_on_rank_0("When level is set to debug, the tensor_list will be invalid.")
+             return
+         self.tensor_list = task_config.tensor_list
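
For readers skimming the hunk above, the new `async_dump` rule in `check()` can be paraphrased by the standalone sketch below. It is illustrative only: plain strings stand in for `Const.TENSOR` and `Const.LEVEL_DEBUG`, and `validate_async_dump` is not a function in the package.

```python
def validate_async_dump(task, level, api_list):
    """Paraphrase of the new check; only relevant when async_dump is enabled."""
    if task != "tensor":
        return api_list
    if level == "debug":
        # debug-level async dump ignores the list entirely
        return []
    if not api_list:
        raise ValueError("async_dump=True with the tensor task requires a non-empty list")
    return api_list


# e.g. validate_async_dump("tensor", "debug", ["Tensor.add.0.forward"]) -> []
```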
msprobe/pytorch/debugger/precision_debugger.py

@@ -13,36 +13,22 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from collections import namedtuple
+ from torch.utils.data import dataloader

- import torch
- from msprobe.core.common.const import Const, FileCheckConst, MsgConst
+ from msprobe.core.common.const import Const, MsgConst
  from msprobe.core.common.exceptions import MsprobeException
- from msprobe.core.common.file_utils import FileChecker
- from msprobe.core.common.utils import get_real_step_or_rank, check_init_step
+ from msprobe.core.common.utils import check_token_range
+ from msprobe.core.debugger.precision_debugger import BasePrecisionDebugger
  from msprobe.pytorch.common.log import logger
- from msprobe.pytorch.common.utils import check_save_param
+ from msprobe.pytorch.common.utils import check_save_param, is_torch_nn_module
  from msprobe.pytorch.debugger.debugger_config import DebuggerConfig
  from msprobe.pytorch.dump.module_dump.module_dump import ModuleDumper
  from msprobe.pytorch.grad_probe.grad_monitor import GradientMonitor
- from msprobe.pytorch.pt_config import parse_json_config
- from msprobe.pytorch.service import Service
- from torch.utils.data import dataloader
-
- ConfigParameters = namedtuple("ConfigParameters", ["config_path", "task",
-                                                    "dump_path", "level", "model"])
+ from msprobe.pytorch.pytorch_service import PytorchService
+ from msprobe.pytorch.pt_config import parse_task_config


- class PrecisionDebugger:
-     _instance = None
-     tasks_not_need_debugger = [Const.GRAD_PROBE]
-
-     def __new__(cls, *args, **kwargs):
-         if cls._instance is None:
-             cls._instance = super(PrecisionDebugger, cls).__new__(cls)
-             cls._instance.config = None
-             cls._instance.enable_dataloader = False
-         return cls._instance
+ class PrecisionDebugger(BasePrecisionDebugger):

      def __init__(
          self,
@@ -53,90 +39,65 @@ class PrecisionDebugger:
          model=None,
          step=None
      ):
-         if not hasattr(self, "initialized"):
-             config_params = ConfigParameters(config_path,
-                                              task,
-                                              dump_path,
-                                              level,
-                                              model)
-             self.check_input_params(config_params)
-
-             self.initialized = True
-             self.model = model
-             common_config, task_config = parse_json_config(config_path, task)
-             self.task = task if task else common_config.task
-             if self.task == Const.GRAD_PROBE:
-                 self.gm = GradientMonitor(common_config, task_config)
-                 return
-             if step is not None:
-                 common_config.step = get_real_step_or_rank(step, Const.STEP)
-             self.config = DebuggerConfig(
-                 common_config, task_config, task, dump_path, level
-             )
-             self.service = Service(self.config)
-             self.module_dumper = ModuleDumper(self.service)
-             self.enable_dataloader = self.config.enable_dataloader
-             if self.enable_dataloader:
-                 logger.warning_on_rank_0("The enable_dataloader feature will be deprecated in the future.")
-                 dataloader._BaseDataLoaderIter.__next__ = iter_tracer(dataloader._BaseDataLoaderIter.__next__)
-
-     @property
-     def instance(self):
-         return self._instance
+         if self.initialized:
+             return
+         super().__init__(config_path, task, dump_path, level, step)
+         self.model = model
+         if self.task == Const.GRAD_PROBE:
+             self.gm = GradientMonitor(self.common_config, self.task_config)
+             return
+         self.config = DebuggerConfig(
+             self.common_config, self.task_config, task, dump_path, level
+         )
+         self.service = PytorchService(self.config)
+         self.module_dumper = ModuleDumper(self.service)
+         self.ori_customer_func = {}
+         self.enable_dataloader = self.config.enable_dataloader
+         self._param_warning()

      @staticmethod
-     def check_input_params(args):
-         if args.config_path is not None:
-             if not isinstance(args.config_path, str):
-                 raise MsprobeException(
-                     MsprobeException.INVALID_PARAM_ERROR, f"config_path must be a string")
-             file_checker = FileChecker(
-                 file_path=args.config_path, path_type=FileCheckConst.FILE, file_type=FileCheckConst.JSON_SUFFIX)
-             file_checker.common_check()
+     def _get_task_config(task, json_config):
+         return parse_task_config(task, json_config)

-         if args.task is not None and args.task not in Const.TASK_LIST:
-             raise MsprobeException(
-                 MsprobeException.INVALID_PARAM_ERROR, f"task must be one of {Const.TASK_LIST}")
-
-         if args.dump_path is not None:
-             if not isinstance(args.dump_path, str):
+     @staticmethod
+     def _iter_tracer(func):
+         def func_wrapper(*args, **kwargs):
+             debugger_instance = PrecisionDebugger._instance
+             if not debugger_instance:
                  raise MsprobeException(
-                     MsprobeException.INVALID_PARAM_ERROR, f"dump_path must be a string")
+                     MsprobeException.INTERFACE_USAGE_ERROR,
+                     f"PrecisionDebugger must be instantiated before executing the dataloader iteration"
+                 )

-         if args.level is not None and args.level not in Const.LEVEL_LIST:
-             raise MsprobeException(
-                 MsprobeException.INVALID_PARAM_ERROR, f"level must be one of {Const.LEVEL_LIST}")
+             debugger_instance.enable_dataloader = False
+             if not debugger_instance.service.first_start:
+                 debugger_instance.stop()
+                 debugger_instance.step()
+             result = func(*args, **kwargs)
+             debugger_instance.start()
+             debugger_instance.enable_dataloader = True
+             return result

-         if args.model is not None:
-             logger.warning_on_rank_0(
-                 "The 'model' parameter in the PrecisionDebugger will be deprecated in the future."
-                 "It is recommended to pass the 'model' parameter in the start interface instead."
-             )
+         return func_wrapper

      @classmethod
-     def start(cls, model=None):
-         instance = cls._instance
-         if not instance:
-             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-         if instance.task in PrecisionDebugger.tasks_not_need_debugger:
+     def start(cls, model=None, token_range=None):
+         instance = cls._get_instance()
+         if instance is None:
              return
-         instance.config.check_model(instance, model)
+
+         check_token_range(token_range)
+         instance.config.check_model(instance, model, token_range)
+
          if instance.enable_dataloader:
              logger.warning_on_rank_0("DataLoader is enabled, start() skipped.")
          else:
-             instance.service.start(instance.model)
-
-     @classmethod
-     def forward_backward_dump_end(cls):
-         instance = cls._instance
-         instance.stop()
+             instance.service.start(instance.model, token_range)

      @classmethod
      def stop(cls):
-         instance = cls._instance
-         if not instance:
-             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-         if instance.task in PrecisionDebugger.tasks_not_need_debugger:
+         instance = cls._get_instance()
+         if instance is None:
              return
          if instance.enable_dataloader:
              logger.warning_on_rank_0("DataLoader is enabled, stop() skipped.")
@@ -145,9 +106,8 @@ class PrecisionDebugger:

      @classmethod
      def step(cls):
-         if not cls._instance:
-             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-         if cls._instance.task in PrecisionDebugger.tasks_not_need_debugger:
+         instance = cls._get_instance()
+         if instance is None:
              return
          cls._instance.service.step()

@@ -172,21 +132,23 @@
              return
          instance.service.save(variable, name, save_backward)

-     @classmethod
-     def set_init_step(cls, step):
-         instance = cls._instance
-         if not instance:
-             raise Exception(MsgConst.NOT_CREATED_INSTANCE)
-         check_init_step(step)
-         instance.service.init_step = step
-         instance.service.loop = 0
+     def _param_warning(self):
+         if self.model is not None:
+             logger.warning_on_rank_0(
+                 "The 'model' parameter in the PrecisionDebugger will be deprecated in the future."
+                 "It is recommended to pass the 'model' parameter in the start interface instead."
+             )
+         if self.enable_dataloader:
+             logger.warning_on_rank_0("The enable_dataloader feature will be deprecated in the future.")
+             dataloader._BaseDataLoaderIter.__next__ = self._iter_tracer(dataloader._BaseDataLoaderIter.__next__)


  def module_dump(module, dump_name):
-     if not isinstance(module, torch.nn.Module):
+     if not is_torch_nn_module(module):
          raise MsprobeException(
              MsprobeException.INVALID_PARAM_ERROR,
-             f"the module argument in module_dump must be a torch.nn.Module subclass"
+             f"the module argument in module_dump must be a torch.nn.Module type, "
+             f"but currently there is an unsupported {type(module)} type."
          )
      if not isinstance(dump_name, str):
          raise MsprobeException(
@@ -210,17 +172,3 @@ def module_dump_end():
              f"PrecisionDebugger must be instantiated before using module_dump_end interface"
          )
      instance.module_dumper.stop_module_dump()
-
-
- def iter_tracer(func):
-     def func_wrapper(*args, **kwargs):
-         debugger_instance = PrecisionDebugger.instance
-         debugger_instance.enable_dataloader = False
-         if not debugger_instance.service.first_start:
-             debugger_instance.stop()
-             debugger_instance.step()
-         result = func(*args, **kwargs)
-         debugger_instance.start()
-         debugger_instance.enable_dataloader = True
-         return result
-     return func_wrapper
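
Taken together with the `check_model` changes in `debugger_config.py`, the refactored debugger adds a `token_range` argument to `start()`. A hedged usage sketch follows: the import path mirrors the package's data-dump docs, the config file path is a placeholder, and the `[start, end]` form of `token_range` is an assumption (see `check_token_range` for what is actually accepted).

```python
# Sketch only: the config path is a placeholder and the token_range format is assumed.
import torch
from msprobe.pytorch import PrecisionDebugger  # import path per the data-dump docs

model = torch.nn.Linear(8, 8)
debugger = PrecisionDebugger(config_path="./config.json", dump_path="./dump")

for _ in range(3):  # stand-in for iterating a real dataloader
    debugger.start(model=model, token_range=[0, 1])  # token_range assumed to be [start, end]
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    debugger.stop()
    debugger.step()
```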
msprobe/pytorch/dump/module_dump/hook_wrapper.py (new file)

@@ -0,0 +1,93 @@
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from functools import wraps
+
+ import torch
+ from torch.utils.hooks import BackwardHook
+
+ from msprobe.core.common.const import Const
+ from msprobe.core.common.decorator import recursion_depth_decorator
+ from msprobe.pytorch.common.log import logger
+ from msprobe.pytorch.common.utils import is_float8_tensor
+
+
+ def wrap_setup_backward_hook(func):
+     def requires_clone(tensor):
+         return isinstance(tensor, torch.Tensor) and not is_float8_tensor(tensor) and \
+             tensor.requires_grad and torch.is_grad_enabled()
+
+     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.parse_tensor", max_depth=Const.DUMP_MAX_DEPTH)
+     def parse_tensor(item, tensor_list):
+         if requires_clone(item):
+             tensor_list.append(item)
+         elif isinstance(item, (list, tuple)):
+             for value in item:
+                 parse_tensor(value, tensor_list)
+         elif isinstance(item, dict):
+             for value in item.values():
+                 parse_tensor(value, tensor_list)
+
+     @recursion_depth_decorator("Dump: wrap_setup_backward_hook.rebuild_args", max_depth=Const.DUMP_MAX_DEPTH)
+     def rebuild_args(item, tensor_iter):
+         if requires_clone(item):
+             result = next(tensor_iter)
+             if hasattr(result, "_base") and result._base is not None:
+                 if torch._C._autograd._get_creation_meta(result) != torch._C._autograd.CreationMeta(0):
+                     torch._C._autograd._set_creation_meta(result, torch._C._autograd.CreationMeta(0))
+             return result
+         if isinstance(item, list):
+             for index, value in enumerate(item):
+                 item[index] = rebuild_args(value, tensor_iter)
+             return item
+         if isinstance(item, dict):
+             for key, value in item.items():
+                 item[key] = rebuild_args(value, tensor_iter)
+             return item
+         if isinstance(item, tuple):
+             if hasattr(item, '_fields'):
+                 return type(item)(*[rebuild_args(i, tensor_iter) for i in item])
+             return type(item)([rebuild_args(i, tensor_iter) for i in item])
+         return item
+
+     @wraps(func)
+     def wrap_setup_hook_func(*args, **kwargs):
+         if len(args) < 2:
+             return func(*args, **kwargs)
+
+         actual_args = args[1]
+
+         tensor_list = []
+
+         parse_tensor(actual_args, tensor_list)
+
+         new_args = args[0], tuple(tensor_list)
+         hooked_tensors = func(*new_args, **kwargs)
+
+         tensor_iter = iter(hooked_tensors)
+         try:
+             new_data = rebuild_args(actual_args, tensor_iter)
+         except Exception as e:
+             logger.debug(f"Unsupported data in setup input/output hook. The detail info: {e}")
+             new_data = actual_args
+
+         return new_data
+
+     return wrap_setup_hook_func
+
+
+ def wrap_setup_input_output_hook():
+     BackwardHook.setup_input_hook = wrap_setup_backward_hook(BackwardHook.setup_input_hook)
+     BackwardHook.setup_output_hook = wrap_setup_backward_hook(BackwardHook.setup_output_hook)
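
The heart of `wrap_setup_backward_hook` is a flatten-process-rebuild round trip: `parse_tensor` pulls the hookable tensors out of an arbitrarily nested args structure, the original `setup_*_hook` is called on that flat tuple, and `rebuild_args` threads the hooked tensors back into the original nesting. The standalone sketch below shows the same pattern on plain Python values; it deliberately omits the float8/requires_grad filtering and the creation-meta fixup, and the helper names are illustrative.

```python
def flatten(item, out):
    """Collect leaf values depth-first, mirroring parse_tensor."""
    if isinstance(item, (list, tuple)):
        for value in item:
            flatten(value, out)
    elif isinstance(item, dict):
        for value in item.values():
            flatten(value, out)
    else:
        out.append(item)


def rebuild(item, leaf_iter):
    """Reinsert processed leaves into the original structure, mirroring rebuild_args."""
    if isinstance(item, list):
        return [rebuild(v, leaf_iter) for v in item]
    if isinstance(item, tuple):
        return type(item)(rebuild(v, leaf_iter) for v in item)
    if isinstance(item, dict):
        return {k: rebuild(v, leaf_iter) for k, v in item.items()}
    return next(leaf_iter)


args = {"x": [1, 2], "y": (3, {"z": 4})}
leaves = []
flatten(args, leaves)
processed = iter(v * 10 for v in leaves)   # stand-in for the hooked tensors
print(rebuild(args, processed))            # {'x': [10, 20], 'y': (30, {'z': 40})}
```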