PyPI - mindstudio-probe - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

mindstudio-probe 1.1.0 (py3-none-any.whl) → 1.2.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299)

{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
msprobe/CMakeLists.txt +5 -0
msprobe/README.md +51 -20
msprobe/config.json +2 -3
msprobe/core/advisor/advisor.py +8 -3
msprobe/core/common/const.py +264 -15
msprobe/core/common/exceptions.py +27 -3
msprobe/core/common/file_utils.py +176 -26
msprobe/core/common/inplace_op_checker.py +15 -0
msprobe/core/common/inplace_ops.yaml +3 -0
msprobe/core/common/log.py +27 -9
msprobe/core/common/utils.py +204 -77
msprobe/core/common_config.py +49 -14
msprobe/core/compare/acc_compare.py +274 -198
msprobe/core/compare/check.py +32 -33
msprobe/core/compare/compare_cli.py +32 -14
msprobe/core/compare/highlight.py +283 -127
msprobe/core/compare/layer_mapping/__init__.py +19 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
msprobe/core/compare/merge_result/merge_result.py +380 -0
msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
msprobe/core/compare/multiprocessing_compute.py +2 -2
msprobe/core/compare/npy_compare.py +135 -144
msprobe/core/compare/utils.py +419 -274
msprobe/core/data_dump/data_collector.py +60 -28
msprobe/core/data_dump/data_processor/base.py +84 -36
msprobe/core/data_dump/data_processor/factory.py +5 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
msprobe/core/data_dump/json_writer.py +29 -1
msprobe/core/data_dump/scope.py +119 -39
msprobe/core/grad_probe/constant.py +27 -13
msprobe/core/grad_probe/grad_compare.py +18 -1
msprobe/core/grad_probe/utils.py +30 -2
msprobe/core/overflow_check/abnormal_scene.py +189 -0
msprobe/core/overflow_check/api_info.py +55 -0
msprobe/core/overflow_check/checker.py +138 -0
msprobe/core/overflow_check/filter.py +157 -0
msprobe/core/overflow_check/ignore_rules.yaml +55 -0
msprobe/core/overflow_check/level.py +22 -0
msprobe/core/overflow_check/utils.py +28 -0
msprobe/docs/01.installation.md +96 -7
msprobe/docs/02.config_introduction.md +50 -23
msprobe/docs/03.config_examples.md +2 -9
msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
msprobe/docs/05.data_dump_PyTorch.md +93 -61
msprobe/docs/06.data_dump_MindSpore.md +200 -95
msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
msprobe/docs/12.overflow_check_PyTorch.md +2 -2
msprobe/docs/13.overflow_check_MindSpore.md +6 -6
msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
msprobe/docs/17.grad_probe.md +5 -6
msprobe/docs/19.monitor.md +561 -0
msprobe/docs/20.monitor_performance_baseline.md +52 -0
msprobe/docs/21.visualization_PyTorch.md +466 -0
msprobe/docs/22.visualization_MindSpore.md +481 -0
msprobe/docs/23.generate_operator_PyTorch.md +107 -0
msprobe/docs/24.code_mapping_Mindspore.md +28 -0
msprobe/docs/25.tool_function_introduction.md +29 -0
msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
msprobe/docs/27.dump_json_instruction.md +521 -0
msprobe/docs/FAQ.md +29 -2
msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/merge_result.png +0 -0
msprobe/docs/img/monitor/cpu_info.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
msprobe/docs/img/visualization/tensorboard_1.png +0 -0
msprobe/docs/img/visualization/tensorboard_2.png +0 -0
msprobe/docs/img/visualization/vis_browser_1.png +0 -0
msprobe/docs/img/visualization/vis_browser_2.png +0 -0
msprobe/docs/img/visualization/vis_precision_info.png +0 -0
msprobe/docs/img/visualization/vis_search_info.png +0 -0
msprobe/docs/img/visualization/vis_show_info.png +0 -0
msprobe/docs/img/visualization/vis_showcase.png +0 -0
msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
msprobe/docs/visualization/GPTModel.png +0 -0
msprobe/docs/visualization/ParallelMLP.png +0 -0
msprobe/docs/visualization/layer_mapping_example.md +132 -0
msprobe/docs/visualization/mapping.png +0 -0
msprobe/docs/visualization/mapping1.png +0 -0
msprobe/docs/visualization/module_name.png +0 -0
msprobe/docs/visualization/module_name1.png +0 -0
msprobe/docs/visualization/no_mapping.png +0 -0
msprobe/docs/visualization/no_mapping1.png +0 -0
msprobe/docs/visualization/no_mapping_analyze.png +0 -0
msprobe/docs/visualization/top_layer.png +0 -0
msprobe/mindspore/__init__.py +25 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
msprobe/mindspore/api_accuracy_checker/main.py +28 -3
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
msprobe/mindspore/cell_processor.py +33 -12
msprobe/mindspore/code_mapping/bind.py +264 -0
msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
msprobe/mindspore/code_mapping/graph.py +49 -0
msprobe/mindspore/code_mapping/graph_parser.py +226 -0
msprobe/mindspore/code_mapping/main.py +24 -0
msprobe/mindspore/code_mapping/processor.py +34 -0
msprobe/mindspore/common/const.py +35 -13
msprobe/mindspore/common/log.py +5 -9
msprobe/mindspore/common/utils.py +88 -4
msprobe/mindspore/compare/distributed_compare.py +22 -24
msprobe/mindspore/compare/ms_compare.py +333 -268
msprobe/mindspore/compare/ms_graph_compare.py +95 -52
msprobe/mindspore/debugger/debugger_config.py +7 -1
msprobe/mindspore/debugger/precision_debugger.py +87 -12
msprobe/mindspore/dump/dump_tool_factory.py +3 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
msprobe/mindspore/dump/jit_dump.py +17 -5
msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
msprobe/mindspore/free_benchmark/common/utils.py +19 -4
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
msprobe/mindspore/grad_probe/global_context.py +28 -8
msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
msprobe/mindspore/grad_probe/hook.py +35 -12
msprobe/mindspore/grad_probe/utils.py +18 -5
msprobe/mindspore/mindtorch/__init__.py +18 -0
msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
msprobe/mindspore/ms_config.py +27 -16
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
msprobe/mindspore/runtime.py +15 -0
msprobe/mindspore/service.py +285 -113
msprobe/mindspore/task_handler_factory.py +15 -0
msprobe/msprobe.py +48 -10
msprobe/pytorch/__init__.py +8 -6
msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
msprobe/pytorch/bench_functions/swiglu.py +10 -2
msprobe/pytorch/common/parse_json.py +7 -6
msprobe/pytorch/common/utils.py +101 -7
msprobe/pytorch/compare/distributed_compare.py +17 -30
msprobe/pytorch/compare/pt_compare.py +44 -22
msprobe/pytorch/debugger/debugger_config.py +46 -27
msprobe/pytorch/debugger/precision_debugger.py +42 -12
msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
msprobe/pytorch/free_benchmark/common/constant.py +15 -0
msprobe/pytorch/free_benchmark/common/counter.py +15 -0
msprobe/pytorch/free_benchmark/common/enums.py +15 -0
msprobe/pytorch/free_benchmark/common/params.py +10 -2
msprobe/pytorch/free_benchmark/common/utils.py +29 -4
msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
msprobe/pytorch/hook_module/__init__.py +1 -1
msprobe/pytorch/hook_module/hook_module.py +14 -11
msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
msprobe/pytorch/hook_module/wrap_functional.py +0 -38
msprobe/pytorch/monitor/__init__.py +0 -0
msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
msprobe/pytorch/monitor/anomaly_detect.py +425 -0
msprobe/pytorch/monitor/csv2tb.py +166 -0
msprobe/pytorch/monitor/distributed/__init__.py +0 -0
msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
msprobe/pytorch/monitor/features.py +108 -0
msprobe/pytorch/monitor/module_hook.py +1076 -0
msprobe/pytorch/monitor/module_metric.py +172 -0
msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
msprobe/pytorch/monitor/optimizer_collect.py +333 -0
msprobe/pytorch/monitor/unittest/__init__.py +0 -0
msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
msprobe/pytorch/monitor/utils.py +321 -0
msprobe/pytorch/monitor/visualizer.py +59 -0
msprobe/pytorch/online_dispatch/__init__.py +2 -3
msprobe/pytorch/online_dispatch/compare.py +29 -38
msprobe/pytorch/online_dispatch/dispatch.py +58 -27
msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
msprobe/pytorch/online_dispatch/single_compare.py +53 -32
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
msprobe/pytorch/online_dispatch/utils.py +49 -21
msprobe/pytorch/parse_tool/lib/compare.py +21 -27
msprobe/pytorch/parse_tool/lib/config.py +6 -8
msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
msprobe/pytorch/parse_tool/lib/utils.py +33 -53
msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
msprobe/pytorch/pt_config.py +31 -8
msprobe/pytorch/service.py +188 -108
msprobe/visualization/__init__.py +14 -0
msprobe/visualization/builder/__init__.py +14 -0
msprobe/visualization/builder/graph_builder.py +222 -0
msprobe/visualization/builder/msprobe_adapter.py +227 -0
msprobe/visualization/compare/__init__.py +14 -0
msprobe/visualization/compare/graph_comparator.py +180 -0
msprobe/visualization/compare/mode_adapter.py +197 -0
msprobe/visualization/graph/__init__.py +14 -0
msprobe/visualization/graph/base_node.py +119 -0
msprobe/visualization/graph/distributed_analyzer.py +318 -0
msprobe/visualization/graph/graph.py +209 -0
msprobe/visualization/graph/node_colors.py +95 -0
msprobe/visualization/graph/node_op.py +39 -0
msprobe/visualization/graph_service.py +288 -0
msprobe/visualization/utils.py +217 -0
mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
msprobe/docs/04.acl_config_examples.md +0 -78
msprobe/mindspore/compare/layer_mapping.py +0 -146
msprobe/mindspore/compare/modify_mapping.py +0 -107
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
msprobe/pytorch/functional/module_dump.py +0 -84
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
/msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
/msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0

msprobe/pytorch/common/utils.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import io
 import os
+import pickle
 import random
 import stat
 from functools import wraps
@@ -24,7 +25,7 @@ import torch
 import torch.distributed as dist
 from msprobe.core.common.exceptions import DistributedNotInitializedError
 from msprobe.core.common.file_utils import (FileCheckConst, change_mode,
-                                            check_file_or_directory_path, check_path_before_create)
+                                            check_file_or_directory_path, check_path_before_create, FileOpen)
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import check_seed_all
 from packaging import version
@@ -75,7 +76,7 @@ def parameter_adapter(func):
                 else:
                     res = [input_tensor[tensor_index] for tensor_index in indices]
                     return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0)
-        if self.op_name_ == "__eq__" and args[1] is None:
+        if self.op_name_ == "__eq__" and len(args) > 1 and args[1] is None:
             return False
         return func(self, *args, **kwargs)
@@ -104,8 +105,49 @@ def get_rank_if_initialized():
         raise DistributedNotInitializedError("torch distributed environment is not initialized")
-def seed_all(seed=1234, mode=False):
-    check_seed_all(seed, mode)
+def remove_dropout():
+    if torch.__version__ > "1.8":
+        logger.info_on_rank_0("For precision comparison, the probability p in the dropout method is set to 0.")
+        import torch.nn.functional as F
+        from torch import _VF
+        from torch.overrides import has_torch_function_unary, handle_torch_function
+        def function_dropout(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
+                             inplace: bool = False) -> torch.Tensor:
+            if has_torch_function_unary(input_tensor):
+                return handle_torch_function(
+                    function_dropout, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
+            if p < 0.0 or p > 1.0:
+                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
+            return _VF.dropout_(input_tensor, 0., training) if inplace else _VF.dropout(input_tensor, 0., training)
+        def function_dropout2d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
+                               inplace: bool = False) -> torch.Tensor:
+            if has_torch_function_unary(input_tensor):
+                return handle_torch_function(
+                    function_dropout2d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
+            if p < 0.0 or p > 1.0:
+                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
+            return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
+                                                                                                        0., training)
+        def function_dropout3d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
+                               inplace: bool = False) -> torch.Tensor:
+            if has_torch_function_unary(input_tensor):
+                return handle_torch_function(
+                    function_dropout3d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
+            if p < 0.0 or p > 1.0:
+                raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
+            return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
+                                                                                                        0., training)
+        F.dropout = function_dropout
+        F.dropout2d = function_dropout2d
+        F.dropout3d = function_dropout3d
+def seed_all(seed=1234, mode=False, rm_dropout=True):
+    check_seed_all(seed, mode, rm_dropout)
     try:
         random.seed(seed)
         os.environ['PYTHONHASHSEED'] = str(seed)
@@ -125,6 +167,8 @@ def seed_all(seed=1234, mode=False):
         else:
             torch_npu.npu.manual_seed_all(seed)
             torch_npu.npu.manual_seed(seed)
+        if rm_dropout:
+            remove_dropout()
     except Exception as e:
         logger.error(f"There is an unexpected error while determinating randomness. {e}")
@@ -269,17 +313,17 @@ def load_pt(pt_path, to_cpu=False):
     check_file_or_directory_path(pt_path)
     try:
         if to_cpu:
-            pt = torch.load(pt_path, map_location=torch.device("cpu"))
+            pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True)
         else:
-            pt = torch.load(pt_path)
+            pt = torch.load(pt_path, weights_only=True)
     except Exception as e:
         raise RuntimeError(f"load pt file {pt_path} failed") from e
     return pt
 def save_pt(tensor, filepath):
-    filepath = os.path.realpath(filepath)
     check_path_before_create(filepath)
+    filepath = os.path.realpath(filepath)
     try:
         torch.save(tensor, filepath)
     except Exception as e:
@@ -290,6 +334,56 @@ def save_pt(tensor, filepath):
     change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
+class TypeCheckingUnpickler(pickle.Unpickler):
+    """
+    This class is a subclass of pickle.Unpickler, which is used to unpickle pickled objects.
+    It overrides the find_class method to add type checking functionality.
+    """
+    allowed_types = [
+        "str",
+        "ApiData",
+        "OrderedDict",
+        "_rebuild_tensor_v2",  # from torch.utils
+        "_load_from_bytes"  # from torch.storage
+    ]
+    def find_class(self, module, name):
+        """
+        Method to find the class of the object to be unpickled.
+        Throws pickle.UnpicklingError If the object type is not in the allowed types list.
+        """
+        if name in self.allowed_types:
+            return super().find_class(module, name)
+        raise pickle.UnpicklingError("Unsupported object type: {}.{}".format(module, name))
+def save_pkl(tensor, filepath):
+    """Save ApiData or str objection by pickle"""
+    check_path_before_create(filepath)
+    filepath = os.path.realpath(filepath)
+    try:
+        with FileOpen(filepath, 'wb') as f:
+            pickle.dump(tensor, f)
+    except Exception as e:
+        logger.error("Save pt file failed, please check according possible error causes: "
+                     "1. out of disk space or disk error, "
+                     "2. no permission to write files, etc.")
+        raise RuntimeError(f"save pt file {filepath} failed") from e
+    change_mode(filepath, FileCheckConst.DATA_FILE_AUTHORITY)
+def load_pkl(pt_path):
+    """Load ApiData or str objection by pickle for accuracy_checker_online"""
+    check_file_or_directory_path(pt_path)
+    pt_path = os.path.realpath(pt_path)
+    try:
+        with FileOpen(pt_path, 'rb') as f:
+            pt = TypeCheckingUnpickler(f).load()
+    except Exception as e:
+        raise RuntimeError(f"load pt file {pt_path} failed: {e}") from e
+    return pt
 def save_api_data(api_data):
     """Save data to io stream"""
     try:

msprobe/pytorch/compare/distributed_compare.py CHANGED Viewed

@@ -14,53 +14,40 @@
 # limitations under the License.
 import os
-from msprobe.core.common.utils import CompareException, check_compare_param, \
-    check_configuration_param, task_dumppath_get
-from msprobe.core.common.file_utils import create_directory
 from msprobe.core.common.exceptions import FileCheckException
+from msprobe.core.common.file_utils import create_directory
+from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
+    set_dump_path
+from msprobe.core.compare.acc_compare import ModeConfig
+from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json, set_stack_json_path
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.compare.pt_compare import PTComparator
-from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json
+from msprobe.pytorch.compare.pt_compare import PTComparator, compare
 def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
-    if kwargs.get('suffix'):
+    if kwargs.get("suffix"):
         logger.error("Argument 'suffix' is not supported for compare_distributed.")
         raise CompareException(CompareException.INVALID_PARAM_ERROR)
-    stack_mode = kwargs.get('stack_mode', False)
-    auto_analyze = kwargs.get('auto_analyze', True)
-    fuzzy_match = kwargs.get('fuzzy_match', False)
+    is_print_compare_log = kwargs.get("is_print_compare_log", True)
     # get the ranks and match by order
     npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank'))
     bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank'))
     if len(npu_ranks) != len(bench_ranks):
-        logger.error('The number of ranks in the two runs are different. '
-                        'Unable to match the ranks. Please use another folder to compare '
-                        'or use compare() api and manually match the ranks.')
+        logger.error(
+            "The number of ranks in the two runs are different. "
+            "Unable to match the ranks. "
+            "Please use another folder to compare or use compare() api and manually match the ranks.")
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     for nr, br in zip(npu_ranks, bench_ranks):
         npu_data_dir = os.path.join(npu_dump_dir, nr)
         bench_data_dir = os.path.join(bench_dump_dir, br)
         npu_path = extract_json(npu_data_dir, stack_json=False)
         bench_path = extract_json(bench_data_dir, stack_json=False)
-        stack_path = extract_json(npu_data_dir, stack_json=True)
         dump_result_param = {
-            'npu_json_path': npu_path,
-            'bench_json_path': bench_path,
-            'stack_json_path': stack_path,
-            'is_print_compare_log': True
+            "npu_json_path": npu_path,
+            "bench_json_path": bench_path,
+            "is_print_compare_log": is_print_compare_log
         }
-        try:
-            summary_compare, md5_compare = task_dumppath_get(dump_result_param)
-            check_configuration_param(stack_mode, auto_analyze, fuzzy_match,
-                                      dump_result_param.get('is_print_compare_log', True))
-            create_directory(output_path)
-            check_compare_param(dump_result_param, output_path,
-                                summary_compare=summary_compare, md5_compare=md5_compare)
-        except (CompareException, FileCheckException) as error:
-            logger.error('Compare failed. Please check the arguments and do it again!')
-            raise CompareException(error.code) from error
-        pt_comparator = PTComparator()
-        pt_comparator.compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}',
-                                   summary_compare=summary_compare, md5_compare=md5_compare, **kwargs)
+        compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}-{br}', **kwargs)

msprobe/pytorch/compare/pt_compare.py CHANGED Viewed

@@ -14,19 +14,29 @@
 # limitations under the License.
 import os.path
 import torch
 from msprobe.core.common.const import FileCheckConst
-from msprobe.pytorch.common.log import logger
 from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.compare.acc_compare import Comparator
-from msprobe.core.common.utils import check_configuration_param, task_dumppath_get, check_compare_param, \
-    CompareException
 from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml
+from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
+    set_dump_path
+from msprobe.core.compare.acc_compare import Comparator, ModeConfig
+from msprobe.core.compare.utils import set_stack_json_path
+from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.common.utils import load_pt
-class PTComparator (Comparator):
-    def __init__(self, data_mapping=None):
+class PTComparator(Comparator):
+    def __init__(self, mode_config, data_mapping=None):
+        super().__init__(mode_config)
+        self.stack_mode = mode_config.stack_mode
+        self.auto_analyze = mode_config.auto_analyze
+        self.fuzzy_match = mode_config.fuzzy_match
+        self.dump_mode = mode_config.dump_mode
         self.frame_name = PTComparator.__name__
         self.data_mapping = data_mapping
         if isinstance(self.data_mapping, str) or self.data_mapping is None:
@@ -37,21 +47,24 @@ class PTComparator (Comparator):
             raise TypeError(f"The type of parameter `data_mapping` must be dict, str or None, but got "
                             f"{type(self.data_mapping)}")
-    def load_mapping_file(self, mapping_file):
+    @staticmethod
+    def load_mapping_file(mapping_file):
         if isinstance(mapping_file, str):
             mapping_dict = load_yaml(mapping_file)
         else:
             mapping_dict = {}
         return mapping_dict
     def read_npy_data(self, dir_path, file_name):
+        if not file_name:
+            return None
         data_path = os.path.join(dir_path, file_name)
         path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
-                                FileCheckConst.PT_SUFFIX, False)
+                                   FileCheckConst.PT_SUFFIX, False)
         data_path = path_checker.common_check()
         try:
-            data_value = load_pt(data_path,
-                                 to_cpu=True).detach()  # detach because numpy can not process gradient information
+            # detach because numpy can not process gradient information
+            data_value = load_pt(data_path, to_cpu=True).detach()
         except RuntimeError as e:
             # 这里捕获 load_pt 中抛出的异常
             logger.error(f"Failed to load the .pt file at {data_path}.")
@@ -63,20 +76,29 @@ class PTComparator (Comparator):
         if data_value.dtype == torch.bfloat16:
             data_value = data_value.to(torch.float32)
         data_value = data_value.numpy()
-        return data_value
-def compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False, **kwargs):
+        return data_value
+def compare(input_param, output_path, **kwargs):
     try:
-        summary_compare, md5_compare = task_dumppath_get(input_param)
+        auto_analyze = kwargs.get('auto_analyze', True)
+        fuzzy_match = kwargs.get('fuzzy_match', False)
+        data_mapping = kwargs.get('data_mapping', None)
+        suffix = kwargs.get('suffix', '')
+        set_dump_path(input_param)
+        dump_mode = get_dump_mode(input_param)
+        if "stack_json_path" in input_param:
+            stack_mode = kwargs.get('stack_mode', False)
+        else:
+            stack_mode = set_stack_json_path(input_param)  # set stack_mode and set "stack_json_path" in input_param
         check_configuration_param(stack_mode, auto_analyze, fuzzy_match, input_param.get('is_print_compare_log', True))
         create_directory(output_path)
-        check_compare_param(input_param, output_path, summary_compare, md5_compare)
-        data_mapping = kwargs.get('data_mapping', None)
+        check_compare_param(input_param, output_path, dump_mode, stack_mode)
     except (CompareException, FileCheckException) as error:
         logger.error('Compare failed. Please check the arguments and do it again!')
         raise CompareException(error.code) from error
-    pt_comparator = PTComparator(data_mapping)
-    pt_comparator.compare_core(input_param, output_path, stack_mode=stack_mode,
-                 auto_analyze=auto_analyze, fuzzy_match=fuzzy_match, summary_compare=summary_compare,
-                 md5_compare=md5_compare)
+    mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+    pt_comparator = PTComparator(mode_config, data_mapping)
+    pt_comparator.compare_core(input_param, output_path, suffix=suffix)

msprobe/pytorch/debugger/debugger_config.py CHANGED Viewed

@@ -31,13 +31,14 @@ class DebuggerConfig:
         self.scope = task_config.scope if task_config.scope else []
         self.list = task_config.list if task_config.list else []
         self.data_mode = task_config.data_mode if task_config.data_mode else ["all"]
-        self.backward_input_list = task_config.backward_input if task_config.backward_input else []
-        self.backward_input = {}
-        self.acl_config = common_config.acl_config if common_config.acl_config else ""
-        self.is_forward_acl_dump = True
         self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS
         self.overflow_nums = task_config.overflow_nums if task_config.overflow_nums else 1
         self.framework = Const.PT_FRAMEWORK
+        self.async_dump = common_config.async_dump if common_config.async_dump else False
+        if self.level == Const.LEVEL_L2:
+            self.is_backward_kernel_dump = False
+            self._check_and_adjust_config_with_l2()
         if self.task == Const.FREE_BENCHMARK:
             self.fuzz_device = task_config.fuzz_device
@@ -59,20 +60,11 @@ class DebuggerConfig:
             self.tls_path = task_config.tls_path if task_config.tls_path else ""
             self.host = task_config.host if task_config.host else ""
             self.port = task_config.port if task_config.port else -1
+            self.online_run_ut_recompute = task_config.online_run_ut_recompute \
+                if isinstance(task_config.online_run_ut_recompute, bool) else False
         self.check()
-        if self.level == "L2":
-            if not self.scope or not isinstance(self.scope, list) or len(self.scope) != 1:
-                raise ValueError("scope must be configured as a list with one api name")
-            if isinstance(self.scope[0], str) and Const.BACKWARD in self.scope[0] and not self.backward_input_list:
-                raise ValueError("backward_input must be configured when scope contains 'backward'")
-            if Const.BACKWARD in self.scope[0]:
-                self.is_forward_acl_dump = False
-                for index, scope_spec in enumerate(self.scope):
-                    self.scope[index] = scope_spec.replace(Const.BACKWARD, Const.FORWARD)
-                    self.backward_input[self.scope[index]] = self.backward_input_list[index]
     def check_kwargs(self):
         if self.task and self.task not in Const.TASK_LIST:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
@@ -83,26 +75,53 @@ class DebuggerConfig:
         if not self.dump_path:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The dump_path not found.")
+        if not isinstance(self.async_dump, bool):
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                   f"The parameters async_dump should be bool.")
     def check(self):
         self.check_kwargs()
         return True
     def check_model(self, instance, start_model):
-        if self.level not in ["L0", "mix"]:
+        if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             if instance.model is not None or start_model is not None:
-                logger.warning_on_rank_0(
+                logger.info_on_rank_0(
                     f"The current level is not L0 or mix level, so the model parameters will not be used.")
             return
-        if start_model is None:
-            if instance.model is None:
-                logger.error_on_rank_0(
-                    f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' argument.")
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")
-            return
-        if isinstance(start_model, torch.nn.Module):
-            instance.model = start_model
+        if start_model is None and instance.model is None:
+            logger.error_on_rank_0(
+                f"For level {self.level}, PrecisionDebugger or start interface must receive a 'model' parameter.")
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, f"missing the parameter 'model'")
+        instance.model = start_model if start_model is not None else instance.model
+        if isinstance(instance.model, torch.nn.Module):
+            return
+        error_model = None
+        if isinstance(instance.model, (list, tuple)):
+            for model in instance.model:
+                if not isinstance(model, torch.nn.Module):
+                    error_model = model
+                    break
         else:
-            logger.error_on_rank_0(f"The 'model' parameter of start must be a torch.nn.Module type.")
+            error_model = instance.model
+        if error_model is not None:
+            error_info = (f"The 'model' parameter must be a torch.nn.Moudle or list[torch.nn.Moudle] "
+                          f"type, currently there is a {type(error_model)} type.")
             raise MsprobeException(
-                MsprobeException.INVALID_PARAM_ERROR, f"model must be a torch.nn.Module")
+                MsprobeException.INVALID_PARAM_ERROR, error_info)
+    def _check_and_adjust_config_with_l2(self):
+        if self.scope:
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                   f"When level is set to L2, the scope cannot be configured.")
+        if not self.list or len(self.list) != 1:
+            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
+                                   f"When level is set to L2, the list must be configured as a list with one api name.")
+        api_name = self.list[0]
+        if api_name.endswith(Const.BACKWARD):
+            self.is_backward_kernel_dump = True
+            api_forward_name = api_name[:-len(Const.BACKWARD)] + Const.FORWARD
+            self.list.append(api_forward_name)

msprobe/pytorch/debugger/precision_debugger.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0  (the "License");
@@ -22,6 +22,7 @@ from msprobe.core.common.file_utils import FileChecker
 from msprobe.core.common.utils import get_real_step_or_rank
 from msprobe.pytorch.common.log import logger
 from msprobe.pytorch.debugger.debugger_config import DebuggerConfig
+from msprobe.pytorch.dump.module_dump.module_dump import ModuleDumper
 from msprobe.pytorch.grad_probe.grad_monitor import GradientMonitor
 from msprobe.pytorch.pt_config import parse_json_config
 from msprobe.pytorch.service import Service
@@ -49,7 +50,7 @@ class PrecisionDebugger:
         dump_path=None,
         level=None,
         model=None,
-        step=None,
+        step=None
     ):
         if not hasattr(self, "initialized"):
             config_params = ConfigParameters(config_path,
@@ -59,7 +60,6 @@ class PrecisionDebugger:
                                              model)
             self.check_input_params(config_params)
-            self.api_origin = False
             self.initialized = True
             self.model = model
             common_config, task_config = parse_json_config(config_path, task)
@@ -67,12 +67,13 @@ class PrecisionDebugger:
             if self.task == Const.GRAD_PROBE:
                 self.gm = GradientMonitor(common_config, task_config)
                 return
-            if step:
+            if step is not None:
                 common_config.step = get_real_step_or_rank(step, Const.STEP)
             self.config = DebuggerConfig(
                 common_config, task_config, task, dump_path, level
             )
             self.service = Service(self.config)
+            self.module_dumper = ModuleDumper(self.service)
             self.enable_dataloader = self.config.enable_dataloader
             if self.enable_dataloader:
                 logger.warning_on_rank_0("The enable_dataloader feature will be deprecated in the future.")
@@ -105,9 +106,11 @@ class PrecisionDebugger:
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, f"level must be one of {Const.LEVEL_LIST}")
-        if args.model is not None and not isinstance(args.model, torch.nn.Module):
-            raise MsprobeException(
-                MsprobeException.INVALID_PARAM_ERROR, f"model must be a torch.nn.Module")
+        if args.model is not None:
+            logger.warning_on_rank_0(
+                "The 'model' parameter in the PrecisionDebugger will be deprecated in the future."
+                "It is recommended to pass the 'model' parameter in the start interface instead."
+            )
     @classmethod
     def start(cls, model=None):
@@ -120,15 +123,12 @@ class PrecisionDebugger:
         if instance.enable_dataloader:
             logger.warning_on_rank_0("DataLoader is enabled, start() skipped.")
         else:
-            instance.service.start(instance.model, instance.api_origin)
-            instance.api_origin = False
+            instance.service.start(instance.model)
-    # 指定代码段dump前反向结束符，之后的计算过程数据将被忽略，无法被dump
     @classmethod
     def forward_backward_dump_end(cls):
         instance = cls._instance
-        instance.service.forward_backward_dump_end()
-        instance.api_origin = True
+        instance.stop()
     @classmethod
     def stop(cls):
@@ -159,6 +159,36 @@ class PrecisionDebugger:
         cls._instance.gm.monitor(model)
+def module_dump(module, dump_name):
+    if not isinstance(module, torch.nn.Module):
+        raise MsprobeException(
+            MsprobeException.INVALID_PARAM_ERROR,
+            f"the module argument in module_dump must be a torch.nn.Module subclass"
+        )
+    if not isinstance(dump_name, str):
+        raise MsprobeException(
+            MsprobeException.INVALID_PARAM_ERROR,
+            f"the dump_name argument in module_dump must be a str type"
+        )
+    instance = PrecisionDebugger._instance
+    if not instance:
+        raise MsprobeException(
+            MsprobeException.INTERFACE_USAGE_ERROR,
+            f"PrecisionDebugger must be instantiated before using module_dump interface"
+        )
+    instance.module_dumper.start_module_dump(module, dump_name)
+def module_dump_end():
+    instance = PrecisionDebugger._instance
+    if not instance:
+        raise MsprobeException(
+            MsprobeException.INTERFACE_USAGE_ERROR,
+            f"PrecisionDebugger must be instantiated before using module_dump_end interface"
+        )
+    instance.module_dumper.stop_module_dump()
 def iter_tracer(func):
     def func_wrapper(*args, **kwargs):
         debugger_instance = PrecisionDebugger.instance

msprobe/pytorch/dump/kernel_dump/kernel_config.py ADDED Viewed

@@ -0,0 +1,33 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from msprobe.core.common.file_utils import save_json
+def create_kernel_config_json(dump_path, cur_rank):
+    kernel_config_name = "kernel_config.json" if cur_rank == '' else f"kernel_config_{cur_rank}.json"
+    kernel_config_path = os.path.join(dump_path, kernel_config_name)
+    config_info = {
+        "dump": {
+            "dump_list": [],
+            "dump_path": dump_path,
+            "dump_mode": "all",
+            "dump_op_switch": "on"
+        }
+    }
+    save_json(kernel_config_path, config_info, indent=4)
+    return kernel_config_path

mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

mindstudio-probe 1.1.0py3-none-any.whl → 1.2.1py3-none-any.whl