PyPI - mindstudio-probe - Versions diffs - 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (261)

{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
msprobe/README.md +57 -21
msprobe/core/__init__.py +17 -0
msprobe/core/common/const.py +224 -82
msprobe/core/common/decorator.py +50 -0
msprobe/core/common/exceptions.py +5 -3
msprobe/core/common/file_utils.py +274 -40
msprobe/core/common/framework_adapter.py +169 -0
msprobe/core/common/global_lock.py +86 -0
msprobe/core/common/runtime.py +25 -0
msprobe/core/common/utils.py +148 -72
msprobe/core/common_config.py +7 -0
msprobe/core/compare/acc_compare.py +640 -462
msprobe/core/compare/check.py +36 -107
msprobe/core/compare/compare_cli.py +4 -0
msprobe/core/compare/config.py +72 -0
msprobe/core/compare/highlight.py +217 -215
msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
msprobe/core/compare/merge_result/merge_result.py +12 -6
msprobe/core/compare/multiprocessing_compute.py +227 -107
msprobe/core/compare/npy_compare.py +32 -16
msprobe/core/compare/utils.py +218 -244
msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
msprobe/core/config_check/checkers/base_checker.py +60 -0
msprobe/core/config_check/checkers/dataset_checker.py +138 -0
msprobe/core/config_check/checkers/env_args_checker.py +96 -0
msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
msprobe/core/config_check/checkers/pip_checker.py +90 -0
msprobe/core/config_check/checkers/random_checker.py +367 -0
msprobe/core/config_check/checkers/weights_checker.py +147 -0
msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
msprobe/core/config_check/config_check_cli.py +51 -0
msprobe/core/config_check/config_checker.py +100 -0
msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
msprobe/core/config_check/resource/env.yaml +57 -0
msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
msprobe/core/config_check/utils/utils.py +107 -0
msprobe/core/data_dump/api_registry.py +239 -0
msprobe/core/data_dump/data_collector.py +36 -9
msprobe/core/data_dump/data_processor/base.py +74 -53
msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
msprobe/core/data_dump/json_writer.py +146 -57
msprobe/core/debugger/precision_debugger.py +143 -0
msprobe/core/grad_probe/constant.py +2 -1
msprobe/core/grad_probe/grad_compare.py +2 -2
msprobe/core/grad_probe/utils.py +1 -1
msprobe/core/hook_manager.py +242 -0
msprobe/core/monitor/anomaly_processor.py +384 -0
msprobe/core/overflow_check/abnormal_scene.py +2 -0
msprobe/core/service.py +356 -0
msprobe/core/single_save/__init__.py +0 -0
msprobe/core/single_save/single_comparator.py +243 -0
msprobe/core/single_save/single_saver.py +157 -0
msprobe/docs/01.installation.md +6 -5
msprobe/docs/02.config_introduction.md +89 -30
msprobe/docs/03.config_examples.md +1 -0
msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
msprobe/docs/05.data_dump_PyTorch.md +184 -50
msprobe/docs/06.data_dump_MindSpore.md +193 -28
msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
msprobe/docs/12.overflow_check_PyTorch.md +5 -3
msprobe/docs/13.overflow_check_MindSpore.md +6 -4
msprobe/docs/14.data_parse_PyTorch.md +4 -10
msprobe/docs/17.grad_probe.md +2 -1
msprobe/docs/18.online_dispatch.md +3 -3
msprobe/docs/19.monitor.md +211 -103
msprobe/docs/21.visualization_PyTorch.md +100 -28
msprobe/docs/22.visualization_MindSpore.md +103 -31
msprobe/docs/23.generate_operator_PyTorch.md +9 -9
msprobe/docs/25.tool_function_introduction.md +23 -22
msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
msprobe/docs/27.dump_json_instruction.md +278 -8
msprobe/docs/28.debugger_save_instruction.md +111 -20
msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
msprobe/docs/29.data_dump_MSAdapter.md +229 -0
msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
msprobe/docs/31.config_check.md +95 -0
msprobe/docs/32.ckpt_compare.md +69 -0
msprobe/docs/33.generate_operator_MindSpore.md +190 -0
msprobe/docs/34.RL_collect.md +92 -0
msprobe/docs/35.nan_analyze.md +72 -0
msprobe/docs/FAQ.md +3 -11
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/save_compare_result_sample.png +0 -0
msprobe/docs/img/visualization/proxy.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_match_info.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/mindspore/__init__.py +3 -3
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
msprobe/mindspore/cell_processor.py +204 -33
msprobe/mindspore/code_mapping/graph_parser.py +4 -21
msprobe/mindspore/common/const.py +73 -2
msprobe/mindspore/common/utils.py +157 -29
msprobe/mindspore/compare/common_dir_compare.py +382 -0
msprobe/mindspore/compare/distributed_compare.py +2 -26
msprobe/mindspore/compare/ms_compare.py +18 -398
msprobe/mindspore/compare/ms_graph_compare.py +20 -10
msprobe/mindspore/compare/utils.py +37 -0
msprobe/mindspore/debugger/debugger_config.py +59 -7
msprobe/mindspore/debugger/precision_debugger.py +83 -90
msprobe/mindspore/dump/cell_dump_process.py +902 -0
msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
msprobe/mindspore/dump/dump_tool_factory.py +18 -8
msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
msprobe/mindspore/dump/jit_dump.py +35 -27
msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
msprobe/mindspore/free_benchmark/common/utils.py +1 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
msprobe/mindspore/grad_probe/global_context.py +9 -2
msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
msprobe/mindspore/grad_probe/hook.py +2 -4
msprobe/mindspore/mindspore_service.py +111 -0
msprobe/mindspore/monitor/common_func.py +52 -0
msprobe/mindspore/monitor/data_writers.py +237 -0
msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
msprobe/mindspore/monitor/features.py +13 -1
msprobe/mindspore/monitor/module_hook.py +568 -444
msprobe/mindspore/monitor/optimizer_collect.py +331 -0
msprobe/mindspore/monitor/utils.py +71 -9
msprobe/mindspore/ms_config.py +16 -15
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
msprobe/mindspore/task_handler_factory.py +5 -2
msprobe/msprobe.py +19 -0
msprobe/nan_analyze/__init__.py +14 -0
msprobe/nan_analyze/analyzer.py +255 -0
msprobe/nan_analyze/graph.py +189 -0
msprobe/nan_analyze/utils.py +211 -0
msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
msprobe/pytorch/attl_manager.py +65 -0
msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
msprobe/pytorch/common/utils.py +53 -19
msprobe/pytorch/compare/distributed_compare.py +4 -36
msprobe/pytorch/compare/pt_compare.py +13 -84
msprobe/pytorch/compare/utils.py +47 -0
msprobe/pytorch/debugger/debugger_config.py +34 -17
msprobe/pytorch/debugger/precision_debugger.py +50 -96
msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
msprobe/pytorch/free_benchmark/common/utils.py +1 -1
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
msprobe/pytorch/function_factory.py +1 -1
msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
msprobe/pytorch/hook_module/api_register.py +155 -0
msprobe/pytorch/hook_module/hook_module.py +18 -22
msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
msprobe/pytorch/hook_module/utils.py +28 -2
msprobe/pytorch/monitor/csv2tb.py +14 -4
msprobe/pytorch/monitor/data_writers.py +259 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
msprobe/pytorch/monitor/module_hook.py +336 -241
msprobe/pytorch/monitor/module_metric.py +17 -0
msprobe/pytorch/monitor/optimizer_collect.py +244 -224
msprobe/pytorch/monitor/utils.py +84 -4
msprobe/pytorch/online_dispatch/compare.py +0 -2
msprobe/pytorch/online_dispatch/dispatch.py +13 -2
msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
msprobe/pytorch/online_dispatch/utils.py +3 -0
msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
msprobe/pytorch/parse_tool/lib/utils.py +5 -4
msprobe/pytorch/pt_config.py +16 -11
msprobe/pytorch/pytorch_service.py +70 -0
msprobe/visualization/builder/graph_builder.py +69 -10
msprobe/visualization/builder/msprobe_adapter.py +24 -12
msprobe/visualization/compare/graph_comparator.py +63 -51
msprobe/visualization/compare/mode_adapter.py +22 -20
msprobe/visualization/graph/base_node.py +11 -4
msprobe/visualization/graph/distributed_analyzer.py +1 -10
msprobe/visualization/graph/graph.py +2 -13
msprobe/visualization/graph/node_op.py +1 -2
msprobe/visualization/graph_service.py +251 -104
msprobe/visualization/utils.py +26 -44
msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
msprobe/mindspore/monitor/anomaly_detect.py +0 -404
msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
msprobe/mindspore/service.py +0 -543
msprobe/pytorch/hook_module/api_registry.py +0 -166
msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
msprobe/pytorch/hook_module/wrap_functional.py +0 -66
msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
msprobe/pytorch/hook_module/wrap_torch.py +0 -84
msprobe/pytorch/hook_module/wrap_vf.py +0 -60
msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
msprobe/pytorch/monitor/anomaly_detect.py +0 -410
msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
msprobe/pytorch/service.py +0 -470
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
{mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
/msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
/msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0

msprobe/core/data_dump/data_processor/base.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,17 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import inspect
 import os
 from dataclasses import dataclass, is_dataclass
-from typing import Tuple, Dict, Optional, Any
 from functools import partial
-import copy
-from typing import Union
+from typing import Tuple, Dict, Optional, Any, Union
 import numpy as np
 from msprobe.core.common.const import Const
+from msprobe.core.common.file_utils import save_npy
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import convert_tuple, CompareException
@@ -79,21 +79,17 @@ class ModuleBackwardOutputs:
 class TensorStatInfo:
-    def __init__(self, max_val=None, min_val=None, mean_val=None, norm_val=None, stack_tensor_stat=None):
+    def __init__(self, max_val=None, min_val=None, mean_val=None, norm_val=None):
         self.max = max_val
         self.min = min_val
         self.mean = mean_val
         self.norm = norm_val
-        self.stack_tensor_stat = stack_tensor_stat
 class BaseDataProcessor:
     _recursive_key_stack = []
-    special_type = (
-        np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, np.ndarray,
-        bool, int, float, str, slice,
-        type(Ellipsis)
-    )
+    builtin_type = (bool, int, float, str, slice, type(Ellipsis))
+    np_type = (np.integer, np.floating, np.bool_, np.complexfloating, np.str_, np.byte, np.unicode_, np.ndarray)
     def __init__(self, config, data_writer):
         self.data_writer = data_writer
@@ -120,7 +116,10 @@ class BaseDataProcessor:
     @staticmethod
     def analyze_api_call_stack(name):
         try:
-            api_stack = inspect.stack()[5:]
+            if name.startswith("Primitive"):
+                api_stack = inspect.stack()[4:]
+            else:
+                api_stack = inspect.stack()[5:]
         except Exception as e:
             logger.warning(f"The call stack of <{name}> failed to retrieve, {e}.")
             api_stack = None
@@ -129,12 +128,14 @@ class BaseDataProcessor:
             for (_, path, line, func, code, _) in api_stack:
                 if not code:
                     continue
+                if any(filter_path in path for filter_path in Const.STACK_FILTER_KEYWORDS) and \
+                        Const.CALL_STACK_FLAG not in path:
+                    continue
                 stack_line = f"File {path}, line {str(line)}, in {func}, \n {code[0].strip()}"
                 stack_str.append(stack_line)
         else:
             stack_str.append(Const.WITHOUT_CALL_STACK)
-        stack_info_struct = {name: stack_str}
-        return stack_info_struct
+        return tuple(stack_str)
     @staticmethod
     def transfer_type(data):
@@ -178,20 +179,8 @@ class BaseDataProcessor:
                                  "invalid data_structure type or invalid index")
     @staticmethod
-    def _convert_numpy_to_builtin(arg):
-        type_mapping = {
-            np.integer: int,
-            np.floating: float,
-            np.bool_: bool,
-            np.complexfloating: complex,
-            np.str_: str,
-            np.byte: bytes,
-            np.unicode_: str
-        }
-        for numpy_type, builtin_type in type_mapping.items():
-            if isinstance(arg, numpy_type):
-                return builtin_type(arg), type(arg).__name__
-        return arg, ''
+    def is_distributed_op(module):
+        return getattr(module, "op_is_distributed", False)
     @staticmethod
     def _analyze_builtin(arg):
@@ -217,21 +206,40 @@ class BaseDataProcessor:
         return single_arg
     @staticmethod
-    def _analyze_numpy(ndarray, numpy_type):
+    def _analyze_numpy(arg):
+        return {"type": type(arg).__name__, "value": arg.item()}
+    @staticmethod
+    def _analyze_ndarray(ndarray, _):
         ndarray_json = {}
         ndarray_json.update({'type': 'numpy.ndarray'})
         ndarray_json.update({'dtype': str(ndarray.dtype)})
         ndarray_json.update({'shape': ndarray.shape})
-        if ndarray.size > 0:
-            ndarray_json.update({"Max": np.max(ndarray).item()})
-            ndarray_json.update({"Min": np.min(ndarray).item()})
-            ndarray_json.update({"Mean": np.mean(ndarray).item()})
-            ndarray_json.update({"Norm": np.linalg.norm(ndarray).item()})
-        else:
-            ndarray_json.update({"Max": None})
-            ndarray_json.update({"Min": None})
-            ndarray_json.update({"Mean": None})
-            ndarray_json.update({"Norm": None})
+        # 先初始化默认值
+        stats = {
+            "Max": None,
+            "Min": None,
+            "Mean": None,
+            "Norm": None
+        }
+        try:
+            # 只有非空时才尝试计算
+            if ndarray.size > 0:
+                stats = {
+                    "Max": np.max(ndarray).item(),
+                    "Min": np.min(ndarray).item(),
+                    "Mean": np.mean(ndarray).item(),
+                    "Norm": np.linalg.norm(ndarray).item()
+                }
+        except Exception as e:
+            # 决定打印内容或切片
+            logger.warning(f"Error analyzing ndarray stats: {e}")
+        # 最后一次性更新
+        ndarray_json.update(stats)
         return ndarray_json
     @staticmethod
@@ -248,12 +256,12 @@ class BaseDataProcessor:
     @classmethod
     def get_special_types(cls):
-        return cls.special_type
+        return cls.builtin_type + cls.np_type
     @classmethod
     def recursive_apply_transform(cls, args, transform, depth=0) -> Union[dict, list, None]:
-        if depth > Const.MAX_DEPTH:
-            logger.error(f"The maximum depth of recursive transform, {Const.MAX_DEPTH} is reached.")
+        if depth > Const.DUMP_MAX_DEPTH:
+            logger.error(f"The maximum depth of recursive transform, {Const.DUMP_MAX_DEPTH} is reached.")
             raise CompareException(CompareException.RECURSION_LIMIT_ERROR)
         if isinstance(args, cls.get_special_types()):
             arg_transform = transform(args, cls._recursive_key_stack)
@@ -303,6 +311,7 @@ class BaseDataProcessor:
             def real_hook_fn(grad):
                 return wrap_hook_fn(grad)
             element.register_hook(real_hook_fn)
     def if_return_forward_new_output(self):
@@ -350,6 +359,8 @@ class BaseDataProcessor:
         return api_info_struct
     def analyze_forward_output(self, name, module, module_input_output: ModuleForwardInputsOutputs):
+        if self.is_distributed_op(module):
+            module_input_output.update_output_with_args_and_kwargs()
         api_info_struct = {}
         # check whether data_mode contains forward or input
         if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT):
@@ -427,6 +438,7 @@ class BaseDataProcessor:
         api_info_struct = {}
         self.save_name = name + Const.SEP + param_name
         data_info = self.analyze_element(grad)
+        self.save_name = None
         grad_info_dict = {param_name: [data_info]}
         api_info_struct[name] = grad_info_dict
         return api_info_struct
@@ -435,10 +447,10 @@ class BaseDataProcessor:
         file_format = Const.PT_SUFFIX if self.config.framework == Const.PT_FRAMEWORK else Const.NUMPY_SUFFIX
         if self.save_name is not None:
             dump_data_name = (self.save_name + file_format)
-            self.save_name = None
         else:
-            dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP +
-                              suffix + file_format)
+            suffix_with_seq = (Const.SEP + suffix) if suffix else ""
+            dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + suffix_with_seq +
+                              file_format)
         file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name)
         return dump_data_name, file_path
@@ -447,23 +459,32 @@ class BaseDataProcessor:
     def analyze_debug_forward(self, variable, name_with_count):
         self.current_api_or_module_name = name_with_count
-        self.api_data_category = Const.TENSOR
-        # these two attributes are used to construct tensor file name {name_with_count}.tensor.{indexes}.npy/pt
+        self.api_data_category = Const.DEBUG
+        # these two attributes are used to construct tensor file name {name_with_count}.debug.{indexes}.npy/pt
         data_info = self.analyze_element(variable)
         return data_info
-    def analyze_debug_backward(self, variable, grad_name_with_count, nested_data_structure):
+    def analyze_debug_backward(self, variable, grad_name_with_count_category, nested_data_structure):
         def hook_fn(grad, indexes):
             suffix = Const.SEP.join([str(index) for index in indexes])
-            self.save_name = grad_name_with_count + Const.SEP + Const.TENSOR + Const.SEP + suffix
+            suffix_with_sep = (Const.SEP + suffix) if suffix else ""
+            self.save_name = grad_name_with_count_category + suffix_with_sep
             grad_data_info = self.analyze_element(grad)
             self.save_name = None
-            full_index = [grad_name_with_count] + indexes
+            full_index = [grad_name_with_count_category] + indexes
             try:
                 self.set_value_into_nested_structure(nested_data_structure, full_index, grad_data_info)
             except (ValueError, IndexError) as e:
-                logger.warning(f"error occured while recording statistics of {grad_name_with_count} variable, "
-                               f"skip current recording, detailed infomation: {e}")
+                logger.warning(f"error occurred while recording statistics of {grad_name_with_count_category} variable,"
+                               f"skip current recording, detailed information: {e}")
             return grad
         wrap_register_hook_single_element = partial(self.register_hook_single_element, hook_fn=hook_fn)
-        self.recursive_apply_transform(variable, wrap_register_hook_single_element)
+        self.recursive_apply_transform(variable, wrap_register_hook_single_element)
+    def _analyze_and_save_ndarray(self, ndarray, suffix):
+        dump_data_name, file_path = self.get_save_file_path(suffix)
+        save_npy(ndarray, file_path)
+        ndarray_json = BaseDataProcessor._analyze_ndarray(ndarray, suffix)
+        ndarray_json.update({"data_name": dump_data_name})
+        return ndarray_json

msprobe/core/data_dump/data_processor/mindspore_processor.py CHANGED Viewed

@@ -17,16 +17,17 @@ import zlib
 import mindspore as ms
 from mindspore import mint, ops, hal
+from mindspore.mint import distributed
 from mindspore._c_expression.typing import Number
 import numpy as np
 from msprobe.core.common.const import Const
 from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo,
                                                         ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs)
-from msprobe.core.common.file_utils import path_len_exceeds_limit, save_npy
+from msprobe.core.common.file_utils import path_len_exceeds_limit
 from msprobe.mindspore.common.utils import convert_bf16_to_fp32, save_tensor_as_npy
 from msprobe.mindspore.common.log import logger
-from msprobe.mindspore.dump.hook_cell.api_registry import api_register
+from msprobe.mindspore.dump.hook_cell.api_register import get_api_register
 has_adump = True
 try:
@@ -36,7 +37,7 @@ except ImportError:
 class MindsporeDataProcessor(BaseDataProcessor):
-    mindspore_special_type = tuple([ms.Tensor, Number])
+    mindspore_special_type = tuple([ms.Tensor, Number, distributed.P2POp])
     def __init__(self, config, data_writer):
         super().__init__(config, data_writer)
@@ -44,6 +45,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             "dtype": self.analyze_dtype_in_kwargs
         }
         self._async_dump_cache = {}
+        self.api_register = get_api_register()
     @staticmethod
     def get_md5_for_tensor(x):
@@ -64,7 +66,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = np.max(data_np).item()
             tensor_stat.min = np.min(data_np).item()
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.item()
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             data_abs = np.abs(data.asnumpy())
             tensor_stat.max = np.max(data_abs).item()
@@ -74,83 +76,98 @@ class MindsporeDataProcessor(BaseDataProcessor):
         else:
             if not ops.is_floating_point(data) or data.dtype == ms.float64:
                 data = data.to(ms.float32)
-            api_register.norm_inner_op_set_ori_func()
-            get_max_value = api_register.mint_ops_ori_attr.get("max", mint.max)
-            get_min_value = api_register.mint_ops_ori_attr.get("min", mint.min)
-            get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
-            if hasattr(mint, "norm"):
-                get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
-            else:
-                get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
-            tensor_stat.max = get_max_value(data).item()
-            tensor_stat.min = get_min_value(data).item()
-            tensor_stat.mean = get_mean_value(data).item()
-            tensor_stat.norm = get_norm_value(data).item()
-            api_register.norm_inner_op_set_hook_func()
+            get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
+            tensor_stat.max = mint.max(data)
+            tensor_stat.min = mint.min(data)
+            tensor_stat.mean = mint.mean(data)
+            tensor_stat.norm = get_norm_value(data)
         return tensor_stat
     @staticmethod
     def get_stat_info_async(data):
         tensor_stat = TensorStatInfo()
-        stack_method = api_register.functional_ori_attr.get("stack", ms.ops.stack)
-        if data.dtype == ms.complex64 or data.dtype == ms.complex128:
+        if data.dtype == ms.bool_:
+            tensor_stat.max = mint.any(data)
+            tensor_stat.min = mint.all(data)
+        elif not data.shape:
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+        elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
-        elif data.dtype == ms.bool_:
-            tensor_stat.stack_tensor_stat = (["Max", "Min"], stack_method([data.any(), data.all()]))
-        elif not data.shape:
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], stack_method([data, data, data, data]))
         else:
             if not ops.is_floating_point(data) or data.dtype == ms.float64:
                 data = data.to(ms.float32)
-            api_register.norm_inner_op_set_ori_func()
-            get_max_value = api_register.mint_ops_ori_attr.get("max", mint.max)
-            get_min_value = api_register.mint_ops_ori_attr.get("min", mint.min)
-            get_mean_value = api_register.mint_ops_ori_attr.get("mean", mint.mean)
-            if hasattr(mint, "norm"):
-                get_norm_value = api_register.mint_ops_ori_attr.get("norm", mint.norm)
-            else:
-                get_norm_value = api_register.functional_ori_attr.get("norm", ops.norm)
-            tensor_stat.stack_tensor_stat = (["Max", "Min", "Mean", "Norm"], stack_method(
-                [get_max_value(data), get_min_value(data), get_mean_value(data), get_norm_value(data)]))
-            api_register.norm_inner_op_set_hook_func()
+            get_norm_value = mint.norm if hasattr(mint, "norm") else ops.norm
+            tensor_stat.max = mint.max(data)
+            tensor_stat.min = mint.min(data)
+            tensor_stat.mean = mint.mean(data)
+            tensor_stat.norm = get_norm_value(data)
         return tensor_stat
     @staticmethod
     def is_hookable_element(element):
         return hasattr(element, "register_hook") and callable(element.register_hook)
+    @staticmethod
+    def process_group_hash(arg):
+        group_ranks = distributed.get_process_group_ranks(arg)
+        group_ranks_hash = zlib.crc32(str(group_ranks).encode('utf-8'))
+        return f"{group_ranks_hash:08x}"
     @classmethod
     def get_special_types(cls):
         return super().get_special_types() + cls.mindspore_special_type
+    def dump_async_data(self):
+        for file_path, tensor in self._async_dump_cache.items():
+            save_tensor_as_npy(tensor, file_path)
+        self._async_dump_cache.clear()
     def get_stat_info(self, data):
+        self.api_register.restore_inner_used_api()
         tensor_stat = TensorStatInfo()
         if data.numel() == 0:
-            return tensor_stat
+            stat_info = tensor_stat
         else:
             if self.config.async_dump:
-                return MindsporeDataProcessor.get_stat_info_async(data)
+                stat_info = MindsporeDataProcessor.get_stat_info_async(data)
             else:
-                return MindsporeDataProcessor.get_stat_info_sync(data)
+                stat_info = MindsporeDataProcessor.get_stat_info_sync(data)
+        self.api_register.register_inner_used_api()
+        return stat_info
     def analyze_single_element(self, element, suffix_stack):
         if suffix_stack and suffix_stack[-1] in self.mindspore_object_key:
             return self.mindspore_object_key[suffix_stack[-1]](element)
-        converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
-        if converted_numpy is not element:
-            return {"type": numpy_type, "value": converted_numpy}
-        if isinstance(element, Number):
-            return self.analyze_dtype_in_kwargs(element)
-        if isinstance(element, ms.Tensor):
-            return self._analyze_tensor(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, np.ndarray):
-            return self._analyze_numpy(element, Const.SEP.join([str(suffix) for suffix in suffix_stack]))
-        if isinstance(element, (bool, int, float, str, slice, type(Ellipsis))):
-            return self._analyze_builtin(element)
+        suffix_str = Const.SEP.join(str(s) for s in suffix_stack)
+        type_analyzer = [
+            (MindsporeDataProcessor.builtin_type, self._analyze_builtin),
+            (ms.Tensor, lambda e: self._analyze_tensor(e, suffix_str)),
+            (Number, self.analyze_dtype_in_kwargs),
+            (MindsporeDataProcessor.np_type[:-1], self._analyze_numpy),
+            (np.ndarray, lambda e: self._analyze_ndarray(e, suffix_str)),
+            (distributed.P2POp, lambda e: self._analyze_p2pop(e, suffix_str))
+        ]
+        for type_key, analyze_fn in type_analyzer:
+            if isinstance(element, type_key):
+                return analyze_fn(element)
         return {}
+    def _analyze_p2pop(self, arg, suffix):
+        p2pop_info = {"class_type": "mindspore.mint.distributed.P2POp"}
+        try:
+            tensor_info = self._analyze_tensor(arg.tensor, suffix)
+            p2pop_info.update({"tensor": tensor_info})
+            p2pop_info.update({"op": arg.op})
+            p2pop_info.update({"peer": arg.peer})
+            p2pop_info.update({"tag": arg.tag})
+            group_id = self.process_group_hash(arg.group) if arg.group else None
+            p2pop_info.update({"group_id": group_id})
+        except Exception as e:
+            logger.warning(f"Failed to parse the P2POp content with error info: {e}.")
+        return p2pop_info
     def _analyze_tensor(self, tensor, suffix):
         tensor_stat = self.get_stat_info(tensor)
         tensor_json = {
@@ -159,45 +176,54 @@ class MindsporeDataProcessor(BaseDataProcessor):
             'shape': tensor.shape
         }
-        if tensor_stat.stack_tensor_stat is None:
-            tensor_json.update({'Max': self.transfer_type(tensor_stat.max)})
-            tensor_json.update({'Min': self.transfer_type(tensor_stat.min)})
-            tensor_json.update({'Mean': self.transfer_type(tensor_stat.mean)})
-            tensor_json.update({'Norm': self.transfer_type(tensor_stat.norm)})
-        else:
-            tensor_json.update({'tensor_stat': tensor_stat.stack_tensor_stat})
+        # Store the statistics in the global buffer and return a placeholder index
+        stat_values = [
+            tensor_stat.max,
+            tensor_stat.min,
+            tensor_stat.mean,
+            tensor_stat.norm
+        ]
+        placeholder_index = self.data_writer.append_stat_to_buffer(stat_values)
+        tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index})
         if self.config.summary_mode == Const.MD5 and not self.config.async_dump:
             tensor_md5 = self.get_md5_for_tensor(tensor)
             tensor_json.update({Const.MD5: tensor_md5})
         return tensor_json
-class StatisticsDataProcessor(MindsporeDataProcessor):
-    pass
-class TensorDataProcessor(MindsporeDataProcessor):
-    def dump_async_data(self):
-        for file_path, tensor in self._async_dump_cache.items():
-            save_tensor_as_npy(tensor, file_path)
-        self._async_dump_cache.clear()
-    def _analyze_tensor(self, tensor, suffix):
+    def _analyze_and_save_tensor(self, tensor, suffix):
+        """Build the summary entry for ``tensor`` and also persist the tensor data.
+        The save location comes from ``self.get_save_file_path(suffix)``; the
+        returned dict is the base tensor summary extended with ``"data_name"``.
+        """
         dump_data_name, file_path = self.get_save_file_path(suffix)
-        single_arg = super()._analyze_tensor(tensor, suffix)
+        # Call the base implementation explicitly: the subclasses override
+        # ``_analyze_tensor`` to delegate back to this method, so dispatching
+        # through ``self`` would recurse.
+        single_arg = MindsporeDataProcessor._analyze_tensor(self, tensor, suffix)
         single_arg.update({"data_name": dump_data_name})
         if self.config.async_dump:
+            # Keep a copy keyed by its target path instead of writing now
+            # (presumably flushed later by the async dump path; confirm).
             self._async_dump_cache[file_path] = tensor.copy()
         else:
             save_tensor_as_npy(tensor, file_path)
         return single_arg
-    def _analyze_numpy(self, ndarray, suffix):
-        dump_data_name, file_path = self.get_save_file_path(suffix)
-        save_npy(ndarray, file_path)
-        ndarray_json = super()._analyze_numpy(ndarray, suffix)
-        ndarray_json.update({"data_name": dump_data_name})
-        return ndarray_json
+class StatisticsDataProcessor(MindsporeDataProcessor):
+    """Processor that saves data only when the current API/module name is selected.
+    An input goes through the ``_analyze_and_save_*`` path when any entry of
+    ``config.tensor_list`` passes an ``in`` test against
+    ``current_api_or_module_name``; otherwise the parent analysis is used.
+    """
+    def _analyze_tensor(self, tensor, suffix):
+        """Analyze ``tensor``, saving it as well if the current name is selected."""
+        for keyword in self.config.tensor_list:
+            if keyword in self.current_api_or_module_name:
+                return self._analyze_and_save_tensor(tensor, suffix)
+        return super()._analyze_tensor(tensor, suffix)
+    def _analyze_ndarray(self, ndarray, suffix):
+        """Analyze ``ndarray``, saving it as well if the current name is selected."""
+        for keyword in self.config.tensor_list:
+            if keyword in self.current_api_or_module_name:
+                return self._analyze_and_save_ndarray(ndarray, suffix)
+        return super()._analyze_ndarray(ndarray, suffix)
+class TensorDataProcessor(MindsporeDataProcessor):
+    """Processor that always routes tensors and ndarrays through the save path."""
+    def _analyze_tensor(self, tensor, suffix):
+        """Analyze ``tensor`` via ``_analyze_and_save_tensor`` and return its entry."""
+        tensor_entry = self._analyze_and_save_tensor(tensor, suffix)
+        return tensor_entry
+    def _analyze_ndarray(self, ndarray, suffix):
+        """Analyze ``ndarray`` via ``_analyze_and_save_ndarray`` and return its entry."""
+        ndarray_entry = self._analyze_and_save_ndarray(ndarray, suffix)
+        return ndarray_entry
 class OverflowCheckDataProcessor(MindsporeDataProcessor):
@@ -262,11 +288,26 @@ class OverflowCheckDataProcessor(MindsporeDataProcessor):
         self.cached_tensors_and_file_paths = {}
     def _analyze_maybe_overflow_tensor(self, tensor_json):
+        """Set ``self.has_overflow`` when the tensor's buffered max or min is inf/NaN.
+        The stat index is read from ``tensor_json`` under
+        ``Const.TENSOR_STAT_INDEX`` and the max/min values are fetched from
+        ``self.data_writer``. Returns without changing state when the index
+        or either value is missing.
+        """
-        if tensor_json['Max'] is None:
+        tensor_stat_index = tensor_json.get(Const.TENSOR_STAT_INDEX)
+        if tensor_stat_index is None:
+            logger.warning("tensor_stat_index does not exist in tensor_json.")
+            return
+        max_tensor = self.data_writer.get_buffer_values_max(tensor_stat_index)
+        min_tensor = self.data_writer.get_buffer_values_min(tensor_stat_index)
+        if max_tensor is None or min_tensor is None:
             return
-        if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']):
+        def check_inf_nan(value):
+            # Use .item() if it's a tensor-like structure
+            if hasattr(value, "item"):
+                value = value.item()
+            return np.isinf(value) or np.isnan(value)
+        # Max is checked first; an overflow there makes the min check unnecessary.
+        if check_inf_nan(max_tensor):
             self.has_overflow = True
-        if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']):
+            return
+        if check_inf_nan(min_tensor):
             self.has_overflow = True
     def _analyze_tensor(self, tensor, suffix):

mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl

mindstudio-probe 1.2.2py3-none-any.whl → 8.1.0py3-none-any.whl