PyPI - mindstudio-probe - Versions diffs - 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (278) hide show

{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
msprobe/README.md +84 -18
msprobe/__init__.py +16 -1
msprobe/config.json +1 -5
msprobe/core/advisor/advisor.py +16 -11
msprobe/core/advisor/advisor_const.py +6 -7
msprobe/core/advisor/advisor_result.py +12 -12
msprobe/core/common/const.py +164 -3
msprobe/core/common/exceptions.py +26 -4
msprobe/core/common/file_utils.py +196 -27
msprobe/core/common/inplace_op_checker.py +53 -0
msprobe/core/common/inplace_ops.yaml +251 -0
msprobe/core/common/log.py +46 -18
msprobe/core/common/utils.py +308 -209
msprobe/core/common_config.py +60 -38
msprobe/core/compare/acc_compare.py +332 -94
msprobe/core/compare/check.py +104 -22
msprobe/core/compare/compare_cli.py +42 -5
msprobe/core/compare/highlight.py +162 -57
msprobe/core/compare/layer_mapping/__init__.py +19 -0
msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
msprobe/core/compare/multiprocessing_compute.py +33 -8
msprobe/core/compare/npy_compare.py +73 -29
msprobe/core/compare/utils.py +306 -247
msprobe/core/data_dump/data_collector.py +44 -43
msprobe/core/data_dump/data_processor/base.py +88 -35
msprobe/core/data_dump/data_processor/factory.py +20 -3
msprobe/core/data_dump/data_processor/mindspore_processor.py +14 -8
msprobe/core/data_dump/data_processor/pytorch_processor.py +180 -66
msprobe/core/data_dump/json_writer.py +63 -42
msprobe/core/data_dump/scope.py +143 -48
msprobe/core/grad_probe/constant.py +31 -13
msprobe/core/grad_probe/grad_compare.py +20 -4
msprobe/core/grad_probe/utils.py +44 -3
msprobe/core/overflow_check/abnormal_scene.py +185 -0
msprobe/core/overflow_check/api_info.py +55 -0
msprobe/core/overflow_check/checker.py +138 -0
msprobe/core/overflow_check/filter.py +157 -0
msprobe/core/overflow_check/ignore_rules.yaml +55 -0
msprobe/core/overflow_check/level.py +22 -0
msprobe/core/overflow_check/utils.py +28 -0
msprobe/docs/01.installation.md +29 -9
msprobe/docs/02.config_introduction.md +83 -84
msprobe/docs/03.config_examples.md +3 -20
msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
msprobe/docs/05.data_dump_PyTorch.md +143 -13
msprobe/docs/06.data_dump_MindSpore.md +197 -88
msprobe/docs/07.accuracy_checker_PyTorch.md +69 -46
msprobe/docs/08.accuracy_checker_online_PyTorch.md +52 -17
msprobe/docs/09.accuracy_checker_MindSpore.md +51 -15
msprobe/docs/10.accuracy_compare_PyTorch.md +187 -99
msprobe/docs/11.accuracy_compare_MindSpore.md +253 -31
msprobe/docs/12.overflow_check_PyTorch.md +1 -1
msprobe/docs/13.overflow_check_MindSpore.md +6 -6
msprobe/docs/15.free_benchmarking_PyTorch.md +60 -55
msprobe/docs/16.free_benchmarking_MindSpore.md +159 -0
msprobe/docs/17.grad_probe.md +19 -22
msprobe/docs/18.online_dispatch.md +89 -0
msprobe/docs/19.monitor.md +468 -0
msprobe/docs/20.monitor_performance_baseline.md +52 -0
msprobe/docs/21.visualization_PyTorch.md +386 -0
msprobe/docs/22.visualization_MindSpore.md +384 -0
msprobe/docs/23.tool_function_introduction.md +28 -0
msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +25 -10
msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
msprobe/docs/img/compare_result.png +0 -0
msprobe/docs/img/monitor/cpu_info.png +0 -0
msprobe/docs/img/ms_dump.png +0 -0
msprobe/docs/img/ms_layer.png +0 -0
msprobe/docs/img/pt_dump.png +0 -0
msprobe/mindspore/__init__.py +16 -0
msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +130 -138
msprobe/mindspore/api_accuracy_checker/api_info.py +27 -5
msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
msprobe/mindspore/api_accuracy_checker/main.py +27 -3
msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
msprobe/mindspore/cell_processor.py +58 -13
msprobe/mindspore/common/const.py +35 -13
msprobe/mindspore/common/log.py +5 -9
msprobe/mindspore/common/utils.py +60 -5
msprobe/mindspore/compare/distributed_compare.py +15 -28
msprobe/mindspore/compare/ms_compare.py +319 -158
msprobe/mindspore/compare/ms_graph_compare.py +99 -49
msprobe/mindspore/debugger/debugger_config.py +20 -14
msprobe/mindspore/debugger/precision_debugger.py +43 -13
msprobe/mindspore/dump/dump_tool_factory.py +18 -1
msprobe/mindspore/dump/hook_cell/api_registry.py +23 -3
msprobe/mindspore/dump/hook_cell/primitive_hooks.py +203 -0
msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +107 -10
msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
msprobe/mindspore/dump/jit_dump.py +56 -20
msprobe/mindspore/dump/kernel_graph_dump.py +19 -5
msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -6
msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
msprobe/mindspore/free_benchmark/api_pynative_self_check.py +162 -41
msprobe/mindspore/free_benchmark/common/config.py +15 -0
msprobe/mindspore/free_benchmark/common/handler_params.py +15 -1
msprobe/mindspore/free_benchmark/common/utils.py +37 -8
msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
msprobe/mindspore/free_benchmark/handler/base_handler.py +20 -5
msprobe/mindspore/free_benchmark/handler/check_handler.py +21 -7
msprobe/mindspore/free_benchmark/handler/fix_handler.py +18 -3
msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -6
msprobe/mindspore/free_benchmark/perturbation/add_noise.py +23 -8
msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +29 -5
msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +25 -10
msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +45 -19
msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +29 -8
msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +22 -7
msprobe/mindspore/free_benchmark/self_check_tool_factory.py +17 -2
msprobe/mindspore/grad_probe/global_context.py +44 -14
msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
msprobe/mindspore/grad_probe/hook.py +24 -10
msprobe/mindspore/grad_probe/utils.py +18 -5
msprobe/mindspore/ms_config.py +22 -15
msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -6
msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
msprobe/mindspore/runtime.py +15 -0
msprobe/mindspore/service.py +75 -150
msprobe/mindspore/task_handler_factory.py +15 -0
msprobe/msprobe.py +24 -7
msprobe/pytorch/__init__.py +23 -3
msprobe/pytorch/api_accuracy_checker/common/config.py +81 -2
msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +50 -25
msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +73 -33
msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +44 -18
msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +32 -11
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +122 -172
msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +158 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +30 -24
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +68 -31
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +26 -9
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
msprobe/pytorch/bench_functions/__init__.py +18 -3
msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
msprobe/pytorch/bench_functions/confusion_transpose.py +20 -1
msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
msprobe/pytorch/bench_functions/linear.py +15 -0
msprobe/pytorch/bench_functions/matmul_backward.py +33 -6
msprobe/pytorch/bench_functions/npu_fusion_attention.py +280 -157
msprobe/pytorch/bench_functions/rms_norm.py +15 -0
msprobe/pytorch/bench_functions/rotary_mul.py +32 -9
msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
msprobe/pytorch/bench_functions/swiglu.py +29 -6
msprobe/pytorch/common/__init__.py +15 -0
msprobe/pytorch/common/log.py +18 -6
msprobe/pytorch/common/parse_json.py +31 -16
msprobe/pytorch/common/utils.py +96 -40
msprobe/pytorch/compare/distributed_compare.py +13 -14
msprobe/pytorch/compare/match.py +15 -0
msprobe/pytorch/compare/pt_compare.py +44 -10
msprobe/pytorch/debugger/debugger_config.py +69 -52
msprobe/pytorch/debugger/precision_debugger.py +72 -24
msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
msprobe/pytorch/free_benchmark/__init__.py +20 -5
msprobe/pytorch/free_benchmark/common/constant.py +15 -0
msprobe/pytorch/free_benchmark/common/counter.py +15 -0
msprobe/pytorch/free_benchmark/common/enums.py +43 -0
msprobe/pytorch/free_benchmark/common/params.py +23 -1
msprobe/pytorch/free_benchmark/common/utils.py +43 -5
msprobe/pytorch/free_benchmark/compare/grad_saver.py +47 -9
msprobe/pytorch/free_benchmark/compare/single_benchmark.py +17 -0
msprobe/pytorch/free_benchmark/main.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +18 -1
msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +21 -4
msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +28 -2
msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +19 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +65 -16
msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +21 -5
msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
msprobe/pytorch/function_factory.py +17 -2
msprobe/pytorch/functional/module_dump.py +84 -0
msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
msprobe/pytorch/hook_module/__init__.py +16 -1
msprobe/pytorch/hook_module/api_registry.py +13 -8
msprobe/pytorch/hook_module/hook_module.py +17 -19
msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
msprobe/pytorch/hook_module/utils.py +4 -6
msprobe/pytorch/hook_module/wrap_aten.py +12 -11
msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
msprobe/pytorch/hook_module/wrap_functional.py +21 -20
msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
msprobe/pytorch/hook_module/wrap_torch.py +4 -6
msprobe/pytorch/hook_module/wrap_vf.py +4 -6
msprobe/pytorch/module_processer.py +18 -6
msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
msprobe/pytorch/monitor/anomaly_detect.py +340 -0
msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
msprobe/pytorch/monitor/features.py +108 -0
msprobe/pytorch/monitor/module_hook.py +870 -0
msprobe/pytorch/monitor/module_metric.py +193 -0
msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
msprobe/pytorch/monitor/optimizer_collect.py +295 -0
msprobe/pytorch/monitor/unittest/__init__.py +0 -0
msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
msprobe/pytorch/monitor/utils.py +250 -0
msprobe/pytorch/monitor/visualizer.py +59 -0
msprobe/pytorch/online_dispatch/__init__.py +2 -3
msprobe/pytorch/online_dispatch/compare.py +38 -48
msprobe/pytorch/online_dispatch/dispatch.py +50 -25
msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
msprobe/pytorch/online_dispatch/single_compare.py +60 -39
msprobe/pytorch/online_dispatch/torch_ops_config.yaml +9 -1
msprobe/pytorch/online_dispatch/utils.py +48 -23
msprobe/pytorch/parse.py +15 -0
msprobe/pytorch/parse_tool/cli.py +5 -6
msprobe/pytorch/parse_tool/lib/compare.py +19 -26
msprobe/pytorch/parse_tool/lib/config.py +1 -1
msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -2
msprobe/pytorch/parse_tool/lib/utils.py +40 -55
msprobe/pytorch/parse_tool/lib/visualization.py +3 -1
msprobe/pytorch/pt_config.py +192 -40
msprobe/pytorch/service.py +110 -35
msprobe/visualization/__init__.py +14 -0
msprobe/visualization/builder/__init__.py +14 -0
msprobe/visualization/builder/graph_builder.py +165 -0
msprobe/visualization/builder/msprobe_adapter.py +205 -0
msprobe/visualization/compare/__init__.py +14 -0
msprobe/visualization/compare/graph_comparator.py +130 -0
msprobe/visualization/compare/mode_adapter.py +211 -0
msprobe/visualization/graph/__init__.py +14 -0
msprobe/visualization/graph/base_node.py +124 -0
msprobe/visualization/graph/graph.py +200 -0
msprobe/visualization/graph/node_colors.py +95 -0
msprobe/visualization/graph/node_op.py +39 -0
msprobe/visualization/graph_service.py +214 -0
msprobe/visualization/utils.py +232 -0
mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
msprobe/docs/04.acl_config_examples.md +0 -76
msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -43
msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -107
msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
msprobe/pytorch/functional/dump_module.py +0 -39
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
{mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
/msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
/msprobe/pytorch/{functional/data_processor.py → monitor/distributed/__init__.py} +0 -0

msprobe/mindspore/service.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,37 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ============================================================================
-import os
 import copy
 import functools
+import os
 from collections import defaultdict
 import mindspore as ms
-from mindspore.common.tensor import Tensor
-from mindspore import ops
 from mindspore import nn
 try:
     from mindspore.common._pijit_context import PIJitCaptureContext
-    pijit_label = True
 except ImportError:
     pijit_label = False
+else:
+    pijit_label = True
+from msprobe.core.common.exceptions import DistributedNotInitializedError, MsprobeException
+from msprobe.core.common.file_utils import create_directory
+from msprobe.core.common.utils import Const, print_tools_ends_info
 from msprobe.core.data_dump.data_collector import build_data_collector
+from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs
 from msprobe.core.data_dump.scope import BaseScope
-from msprobe.mindspore.common.utils import get_rank_if_initialized
-from msprobe.core.common.file_utils import create_directory
+from msprobe.mindspore.cell_processor import CellProcessor
 from msprobe.mindspore.common.log import logger
-from msprobe.core.common.utils import Const
-from msprobe.core.common.exceptions import DistributedNotInitializedError
+from msprobe.mindspore.common.utils import get_rank_if_initialized
 from msprobe.mindspore.dump.hook_cell.api_registry import api_register
-from msprobe.core.data_dump.data_processor.base import ModuleBackwardInputsOutputs, ModuleForwardInputsOutputs, \
-    ModuleBackwardInputs, ModuleBackwardOutputs
-from msprobe.core.common.exceptions import MsprobeException
-from msprobe.mindspore.dump.hook_cell.hook_cell import HOOKCell
-from msprobe.mindspore.cell_processor import CellProcessor
+from msprobe.mindspore.dump.hook_cell.primitive_hooks import PrimitiveHookService
 from msprobe.mindspore.dump.jit_dump import JitDump
@@ -52,11 +49,12 @@ class Service:
         self.config.level = self.config.level_ori
         self.data_collector = build_data_collector(self.config)
         self.cell_processor = CellProcessor(self.data_collector.scope)
+        self.primitive_hook_service = PrimitiveHookService(self)
         self.switch = False
+        self.primitive_switch = False
         self.current_iter = 0
         self.first_start = True
         self.current_rank = None
-        self.primitive_counters = {}
         self.dump_iter_dir = None
         self.start_call = False
         self.check_level_valid()
@@ -71,28 +69,30 @@ class Service:
         )
     def check_level_valid(self):
-        if self.config.level == "L2":
+        if self.config.level == Const.LEVEL_L2:
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, "L2 level dump function is currently not supported."
             )
     def build_hook(self, target_type, name):
-        def forward_hook(api_or_cell_name, cell, input, output):
+        def forward_hook(api_or_cell_name, cell, input_data, output):
             if not self.should_excute_hook():
+                if hasattr(cell, 'input_kwargs'):
+                    del cell.input_kwargs
                 return None
             if target_type == BaseScope.Module_Type_Module:
-                api_or_cell_name = cell.mindstudio_reserved_name
-                module_input_output = ModuleForwardInputsOutputs(args=input, kwargs={}, output=output)
+                api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
+                module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs={}, output=output)
             else:
-                module_input_output = ModuleForwardInputsOutputs(args=input, kwargs=cell.input_kwargs,
+                module_input_output = ModuleForwardInputsOutputs(args=input_data, kwargs=cell.input_kwargs,
                                                                  output=output)
             self.data_collector.update_api_or_module_name(api_or_cell_name)
             self.data_collector.forward_data_collect(api_or_cell_name, cell, pid, module_input_output)
             if self.data_collector.if_return_forward_new_output():
                 return self.data_collector.get_forward_new_output()
-            if target_type == BaseScope.Module_Type_API:
+            if hasattr(cell, 'input_kwargs'):
                 del cell.input_kwargs
             return output
@@ -100,12 +100,19 @@ class Service:
             if not self.should_excute_hook():
                 return
+            need_exchange = True
             if target_type == BaseScope.Module_Type_Module:
-                api_or_cell_name = cell.mindstudio_reserved_name
+                if not hasattr(cell, 'has_pre_hook_called') or not cell.has_pre_hook_called:
+                    need_exchange = False
+                api_or_cell_name = self.cell_processor.set_and_get_reserved_name(cell, api_or_cell_name)
             self.data_collector.update_api_or_module_name(api_or_cell_name)
             if self.data_collector:
                 # 框架最新接口变更，grad_input和grad_output的含义发生了变化，与torch含义保持一致，因此此处调换顺序传入
-                module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
+                if need_exchange:
+                    module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_output, grad_output=grad_input)
+                else:
+                    module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output)
                 self.data_collector.backward_data_collect(api_or_cell_name, cell, pid, module_input_output)
         pid = os.getpid()
@@ -114,145 +121,40 @@ class Service:
         forward_hook = functools.partial(forward_hook, forward_name_template)
         backward_hook = functools.partial(backward_hook, backward_name_template)
-        def wrap_forward_hook(cell, input, output):
-            return forward_hook(cell, input, output)
+        def wrap_forward_hook(cell, input_data, output_data):
+            return forward_hook(cell, input_data, output_data)
         def wrap_backward_hook(cell, grad_input, grad_output):
             return backward_hook(cell, grad_input, grad_output)
         return wrap_forward_hook, wrap_backward_hook
-    def wrap_primitive(self, origin_func, primitive_name):
-        service_instance = self
-        def create_backward_hook(captured_grads, num_tensors, updated_primitive_name, hook_type):
-            def backward_hook(grad):
-                captured_grads.append(grad)
-                backward_primitive_name = f"{updated_primitive_name}.{Const.BACKWARD}"
-                try:
-                    if len(captured_grads) == num_tensors and hook_type == Const.INPUT:
-                        service_instance.data_collector.update_api_or_module_name(backward_primitive_name)
-                        new_module_input_output = ModuleBackwardOutputs(grad_output=tuple(captured_grads))
-                        service_instance.data_collector.backward_output_data_collect(
-                            backward_primitive_name, service_instance, os.getpid(), new_module_input_output
-                        )
-                        captured_grads.clear()
-                    elif len(captured_grads) == num_tensors and hook_type == Const.OUTPUT:
-                        service_instance.data_collector.update_api_or_module_name(backward_primitive_name)
-                        new_module_input_output = ModuleBackwardInputs(grad_input=tuple(captured_grads))
-                        service_instance.data_collector.backward_input_data_collect(
-                            backward_primitive_name, service_instance, os.getpid(), new_module_input_output
-                        )
-                        captured_grads.clear()
-                except Exception as exception:
-                    raise Exception(f"This is a primitive op {hook_type}_backward dump error: {exception},"
-                                    f" updated_primitive_name: {updated_primitive_name}") from exception
-            return backward_hook
-        def hook_primitive_inputs(args, captured_grads_input, updated_primitive_name):
-            hooked_inputs = []
-            num_tensors = sum(isinstance(arg, Tensor) for arg in args)
-            input_backward_hook = create_backward_hook(captured_grads_input, num_tensors, updated_primitive_name,
-                                                       Const.INPUT)
-            for _, arg in enumerate(args):
-                if isinstance(arg, Tensor):
-                    arg_hooked = ops.HookBackward(input_backward_hook)(arg)
-                    hooked_inputs.append(arg_hooked)
-                else:
-                    hooked_inputs.append(arg)
-            return hooked_inputs
-        def hook_primitive_outputs(out, captured_grads_output, updated_primitive_name):
-            if isinstance(out, tuple):
-                num_output_tensors = sum(isinstance(tensor, Tensor) for tensor in out)
-            else:
-                num_output_tensors = 1
-            output_backward_hook = create_backward_hook(captured_grads_output, num_output_tensors,
-                                                        updated_primitive_name, Const.OUTPUT)
-            if isinstance(out, Tensor):
-                return ops.HookBackward(output_backward_hook)(out)
-            elif isinstance(out, tuple):
-                hooked_outputs = []
-                for tensor in out:
-                    if isinstance(tensor, Tensor):
-                        hooked_outputs.append(ops.HookBackward(output_backward_hook)(tensor))
-                    else:
-                        hooked_outputs.append(tensor)
-                return tuple(hooked_outputs)
-            return out
-        def wrapped_primitive_call(instance_self, *args, **kwargs):
-            service_instance.update_primitive_counters(primitive_name)
-            current_count = service_instance.primitive_counters.get(primitive_name, 0)
-            updated_primitive_name = f"{Const.PRIMITIVE_PREFIX}.{primitive_name}.{current_count}"
-            if not service_instance.switch:
-                return origin_func(*args, **kwargs)
-            captured_grads_input, captured_grads_output = [], []
-            try:
-                hooked_inputs = hook_primitive_inputs(args, captured_grads_input, updated_primitive_name)
-            except Exception as exception:
-                raise Exception("This is a primitive op dump error during input hooking: {},"
-                                " primitive_name: {}".format(exception, primitive_name)) from exception
-            try:
-                out = origin_func(*hooked_inputs, **kwargs)
-            except Exception as exception:
-                raise Exception("This is a primitive op dump error during function call: {},"
-                                " primitive_name: {}".format(exception, primitive_name)) from exception
-            forward_primitive_name = f"{updated_primitive_name}.{Const.FORWARD}"
-            service_instance.data_collector.update_api_or_module_name(forward_primitive_name)
-            if service_instance.data_collector:
-                module_input_output = ModuleForwardInputsOutputs(args=hooked_inputs, kwargs=kwargs, output=out)
-                try:
-                    service_instance.data_collector.forward_data_collect(forward_primitive_name, instance_self,
-                                                                         os.getpid(), module_input_output)
-                except Exception as exception:
-                    raise Exception("This is a primitive op dump error during forward data collection: {},"
-                                    " primitive_name: {}".format(exception, primitive_name)) from exception
-                if service_instance.data_collector.if_return_forward_new_output():
-                    out = service_instance.data_collector.get_forward_new_output()
-            try:
-                out = hook_primitive_outputs(out, captured_grads_output, updated_primitive_name)
-            except Exception as exception:
-                raise Exception("This is a primitive op dump error during output hooking: {},"
-                                " primitive_name: {}".format(exception, primitive_name)) from exception
-            return out
-        return wrapped_primitive_call
     def update_primitive_counters(self, primitive_name):
         if primitive_name not in self.primitive_counters:
             self.primitive_counters[primitive_name] = 0
         else:
             self.primitive_counters[primitive_name] += 1
-    def register_hooks(self):
+    def register_primitive_hooks(self):
         primitive_set = set()
         for _, cell in self.model.cells_and_names():
             for pname, primitive in cell._primitives.items():
                 primitive_set.add((pname, primitive))
         for pname, primitive in primitive_set:
-            NewPrimitive = type('NewPrimitive', (primitive.__class__,),
-                                {'__call__': self.wrap_primitive(primitive.__call__, pname)})
-            primitive.__class__ = NewPrimitive
+            primitive_class_name = primitive.__class__.__name__
+            primitive_combined_name = pname + Const.SEP + primitive_class_name
+            new_primitive = type('NewPrimitive', (primitive.__class__,),
+                                 {'__call__': self.primitive_hook_service.wrap_primitive(primitive.__call__,
+                                                                                         primitive_combined_name)})
+            primitive.__class__ = new_primitive
     def step(self):
         self.current_iter += 1
         self.data_collector.update_iter(self.current_iter)
-        HOOKCell.cell_count = defaultdict(int)
-        CellProcessor.cell_count = {}
-        self.primitive_counters.clear()
+        self.primitive_hook_service.primitive_counters.clear()
+        self.data_collector.data_writer.reset_cache()
+        JitDump.jit_count = defaultdict(int)
     def start(self, model=None):
         self.start_call = True
@@ -262,9 +164,8 @@ class Service:
             api_register.api_set_ori_func()
             self.should_stop_service = True
             self.switch = False
-            logger.info("************************************************")
-            logger.info(f"*          {Const.TOOL_NAME} ends successfully.          *")
-            logger.info("************************************************")
+            self.primitive_switch = False
+            print_tools_ends_info()
             return
         if self.config.step and self.current_iter not in self.config.step:
             return
@@ -281,7 +182,7 @@ class Service:
             if self.config.rank and self.current_rank not in self.config.rank:
                 return
             self.register_hook_new()
-            if self.config.level == "L1":
+            if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
                 JitDump.set_config(self.config)
                 JitDump.set_data_collector(self.data_collector)
                 ms.common.api._MindsporeFunctionExecutor = JitDump
@@ -291,10 +192,32 @@ class Service:
                     PIJitCaptureContext.__exit__ = self.empty
             self.first_start = False
+        api_register.api_set_hook_func()
         self.switch = True
+        self.primitive_switch = True
         logger.info(f"Dump switch is turned on at step {self.current_iter}. ")
         self.create_dirs()
         logger.info(f"Dump data will be saved in {self.dump_iter_dir}.")
+        JitDump.jit_dump_switch = True
+    def forward_backward_dump_end(self):
+        if self.should_stop_service:
+            return
+        logger.info(f"{Const.TOOL_NAME}: debugger.forward_backward_dump_end() is set successfully. ")
+        if not self.start_call:
+            logger.error(f"{Const.TOOL_NAME}: debugger.start() is not set in the current scope.")
+            raise Exception("debugger.start() is not set in the current scope.")
+        if not self.switch:
+            logger.error(f"{Const.TOOL_NAME}: debugger.forward_backward_dump_end() should be called between "
+                         "debugger.start() and debugger.stop() ")
+            raise Exception("debugger.stop() is already called. ")
+        if self.config.step and self.current_iter not in self.config.step:
+            return
+        if self.config.rank and self.current_rank not in self.config.rank:
+            return
+        self.primitive_switch = False
+        api_register.api_set_ori_func()
+        JitDump.jit_dump_switch = False
     def stop(self):
         if self.should_stop_service:
@@ -309,8 +232,10 @@ class Service:
         if self.config.rank and self.current_rank not in self.config.rank:
             return
         self.switch = False
+        self.primitive_switch = False
         self.start_call = False
         self.data_collector.write_json()
+        JitDump.jit_dump_switch = False
     def need_end_service(self):
         if self.config.step and self.current_iter > max(self.config.step):
@@ -349,16 +274,16 @@ class Service:
     def register_hook_new(self):
         logger.info("The {} hook function is successfully mounted to the model.".format(self.config.task))
-        if self.config.level == "L1":
+        if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L1]:
             api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API))
             api_register.api_set_hook_func()
-            if self.model:
-                self.register_hooks()
+            if self.model and self.config.task in Const.DUMP_DATA_COLLECTION_LIST:
+                self.register_primitive_hooks()
-        if self.config.level == "L0":
+        if self.config.level in [Const.LEVEL_MIX, Const.LEVEL_L0]:
             if not self.model:
                 raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                       "The current level is L0, the model cannot be None")
+                                       f"The current level is {self.config.level}, the model cannot be None")
             for name, cell in self.model.cells_and_names():
                 if cell == self.model:
                     continue

msprobe/mindspore/task_handler_factory.py CHANGED Viewed

@@ -1,3 +1,18 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from msprobe.core.common.const import Const
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory

msprobe/msprobe.py CHANGED Viewed

@@ -45,10 +45,15 @@ def main():
     multi_run_ut_cmd_parser = subparsers.add_parser('multi_run_ut')
     api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
     run_overflow_check_cmd_parser = subparsers.add_parser('run_overflow_check')
+    graph_service_cmd_parser = subparsers.add_parser('graph')
     _compare_parser(compare_cmd_parser)
-    is_torch_available=is_module_available("torch")
+    is_torch_available = is_module_available("torch")
     is_mindspore_available = is_module_available("mindspore")
-    if is_torch_available:
+    if len(sys.argv) < 4:
+        parser.print_help()
+        sys.exit(0)
+    framework_args = parser.parse_args(sys.argv[1:3])
+    if framework_args.framework == Const.PT_FRAMEWORK:
         from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command
         from msprobe.pytorch.parse_tool.cli import parse as cli_parse
         from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut
@@ -56,20 +61,24 @@ def main():
             _api_precision_compare_command
         from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \
             _run_overflow_check_command
+        from msprobe.visualization.graph_service import _pt_graph_service_parser, _pt_graph_service_command
         _run_ut_parser(run_ut_cmd_parser)
         _run_ut_parser(multi_run_ut_cmd_parser)
         multi_run_ut_cmd_parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8,
-                                        help='Number of splits for parallel processing. Range: 1-64')
+                                             help='Number of splits for parallel processing. Range: 1-64')
         _api_precision_compare_parser(api_precision_compare_cmd_parser)
         _run_overflow_check_parser(run_overflow_check_cmd_parser)
-    elif is_mindspore_available:
+        _pt_graph_service_parser(graph_service_cmd_parser)
+    elif framework_args.framework == Const.MS_FRAMEWORK:
         from msprobe.mindspore.api_accuracy_checker.cmd_parser import add_api_accuracy_checker_argument
+        from msprobe.visualization.graph_service import _ms_graph_service_parser, _ms_graph_service_command
         add_api_accuracy_checker_argument(run_ut_cmd_parser)
+        from msprobe.mindspore.api_accuracy_checker.cmd_parser import multi_add_api_accuracy_checker_argument
+        multi_add_api_accuracy_checker_argument(multi_run_ut_cmd_parser)
+        _ms_graph_service_parser(graph_service_cmd_parser)
-    if len(sys.argv) == 1:
-        parser.print_help()
-        sys.exit(0)
     args = parser.parse_args(sys.argv[1:])
     if sys.argv[2] == Const.PT_FRAMEWORK:
         if not is_torch_available:
@@ -86,6 +95,8 @@ def main():
             _api_precision_compare_command(args)
         elif sys.argv[3] == "run_overflow_check":
             _run_overflow_check_command(args)
+        elif sys.argv[3] == "graph":
+            _pt_graph_service_command(args)
         elif sys.argv[3] == "compare":
             if args.cell_mapping is not None or args.api_mapping is not None:
                 logger.error("Argument -cm or -am is not supported in PyTorch framework")
@@ -100,6 +111,12 @@ def main():
         elif sys.argv[3] == "run_ut":
             from msprobe.mindspore.api_accuracy_checker.main import api_checker_main
             api_checker_main(args)
+        elif sys.argv[3] == "multi_run_ut":
+            from msprobe.mindspore.api_accuracy_checker.main import mul_api_checker_main
+            mul_api_checker_main(args)
+        elif sys.argv[3] == "graph":
+            _ms_graph_service_command(args)
 if __name__ == "__main__":
     main()

msprobe/pytorch/__init__.py CHANGED Viewed

@@ -1,4 +1,24 @@
-from .debugger.precision_debugger import PrecisionDebugger
-from .common.utils import seed_all
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from msprobe.pytorch.monitor.module_hook import TrainerMon
 from .compare.distributed_compare import compare_distributed
-from .compare.pt_compare import compare
+from .compare.pt_compare import compare
+from .common.utils import seed_all
+from .debugger.precision_debugger import PrecisionDebugger
+from .functional.module_dump import module_dump, module_dump_end

msprobe/pytorch/api_accuracy_checker/common/config.py CHANGED Viewed

@@ -1,8 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
+from collections import namedtuple
 from msprobe.core.common.file_utils import load_yaml, check_file_or_directory_path
+from msprobe.core.common.utils import is_int
 from msprobe.pytorch.pt_config import RunUTConfig
+RunUtConfig = namedtuple('RunUtConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path',
+                                         'save_error_data', 'is_continue_run_ut', 'real_data_path', 'white_list',
+                                         'black_list', 'error_data_path', 'online_config'])
+OnlineConfig = namedtuple('OnlineConfig', ['is_online', 'nfs_path', 'host', 'port', 'rank_list', 'tls_path'])
 class Config:
     def __init__(self, yaml_file):
         check_file_or_directory_path(yaml_file, False)
@@ -33,8 +58,10 @@ class Config:
             raise ValueError(f"{key} must be one of {validators.keys()}")
         if not isinstance(value, validators.get(key)):
             raise ValueError(f"{key} must be {validators[key].__name__} type")
-        if key == 'precision' and value < 0:
-            raise ValueError("precision must be greater than 0")
+        if key == 'precision' and not is_int(value):
+            raise ValueError("precision must be an integer")
+        if key == 'precision' and (value < 0 or value > 20):
+            raise ValueError("precision must be greater than or equal to 0 and less than 21")
         if key == 'white_list':
             RunUTConfig.check_filter_list_config(key, value)
         if key == 'black_list':
@@ -51,3 +78,55 @@ class Config:
 cur_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 yaml_path = os.path.join(cur_path, "config.yaml")
 msCheckerConfig = Config(yaml_path)
+class CheckerConfig:
+    def __init__(self, task_config=None):
+        self.white_list = msCheckerConfig.white_list
+        self.black_list = msCheckerConfig.black_list
+        self.error_data_path = msCheckerConfig.error_data_path
+        self.is_online = msCheckerConfig.is_online
+        self.nfs_path = msCheckerConfig.nfs_path
+        self.host = msCheckerConfig.host
+        self.port = msCheckerConfig.port
+        self.rank_list = msCheckerConfig.rank_list
+        self.tls_path = msCheckerConfig.tls_path
+        if task_config:
+            self.load_config(task_config)
+    def load_config(self, task_config):
+        self.white_list = task_config.white_list
+        self.black_list = task_config.black_list
+        self.error_data_path = task_config.error_data_path
+        self.is_online = task_config.is_online
+        self.nfs_path = task_config.nfs_path
+        self.host = task_config.host
+        self.port = task_config.port
+        self.rank_list = task_config.rank_list
+        self.tls_path = task_config.tls_path
+    def get_online_config(self):
+        return OnlineConfig(
+            is_online=self.is_online,
+            nfs_path=self.nfs_path,
+            host=self.host,
+            port=self.port,
+            rank_list=self.rank_list,
+            tls_path=self.tls_path
+        )
+    def get_run_ut_config(self, **config_params):
+        return RunUtConfig(
+            forward_content=config_params.get('forward_content'),
+            backward_content=config_params.get('backward_content'),
+            result_csv_path=config_params.get('result_csv_path'),
+            details_csv_path=config_params.get('details_csv_path'),
+            save_error_data=config_params.get('save_error_data'),
+            is_continue_run_ut=config_params.get('is_continue_run_ut'),
+            real_data_path=config_params.get('real_data_path'),
+            white_list=self.white_list,
+            black_list=self.black_list,
+            error_data_path=config_params.get('error_data_path'),
+            online_config=self.get_online_config()
+        )

mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl

mindstudio-probe 1.0.4py3-none-any.whl → 1.1.1py3-none-any.whl