mindstudio-probe 1.0.4__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
- mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
- {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
- {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
- msprobe/README.md +84 -18
- msprobe/__init__.py +16 -1
- msprobe/config.json +1 -5
- msprobe/core/advisor/advisor.py +16 -11
- msprobe/core/advisor/advisor_const.py +6 -7
- msprobe/core/advisor/advisor_result.py +12 -12
- msprobe/core/common/const.py +164 -3
- msprobe/core/common/exceptions.py +26 -4
- msprobe/core/common/file_utils.py +196 -27
- msprobe/core/common/inplace_op_checker.py +53 -0
- msprobe/core/common/inplace_ops.yaml +251 -0
- msprobe/core/common/log.py +46 -18
- msprobe/core/common/utils.py +308 -209
- msprobe/core/common_config.py +60 -38
- msprobe/core/compare/acc_compare.py +332 -94
- msprobe/core/compare/check.py +104 -22
- msprobe/core/compare/compare_cli.py +42 -5
- msprobe/core/compare/highlight.py +162 -57
- msprobe/core/compare/layer_mapping/__init__.py +19 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
- msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
- msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
- msprobe/core/compare/multiprocessing_compute.py +33 -8
- msprobe/core/compare/npy_compare.py +73 -29
- msprobe/core/compare/utils.py +306 -247
- msprobe/core/data_dump/data_collector.py +44 -43
- msprobe/core/data_dump/data_processor/base.py +88 -35
- msprobe/core/data_dump/data_processor/factory.py +20 -3
- msprobe/core/data_dump/data_processor/mindspore_processor.py +14 -8
- msprobe/core/data_dump/data_processor/pytorch_processor.py +180 -66
- msprobe/core/data_dump/json_writer.py +63 -42
- msprobe/core/data_dump/scope.py +143 -48
- msprobe/core/grad_probe/constant.py +31 -13
- msprobe/core/grad_probe/grad_compare.py +20 -4
- msprobe/core/grad_probe/utils.py +44 -3
- msprobe/core/overflow_check/abnormal_scene.py +185 -0
- msprobe/core/overflow_check/api_info.py +55 -0
- msprobe/core/overflow_check/checker.py +138 -0
- msprobe/core/overflow_check/filter.py +157 -0
- msprobe/core/overflow_check/ignore_rules.yaml +55 -0
- msprobe/core/overflow_check/level.py +22 -0
- msprobe/core/overflow_check/utils.py +28 -0
- msprobe/docs/01.installation.md +29 -9
- msprobe/docs/02.config_introduction.md +83 -84
- msprobe/docs/03.config_examples.md +3 -20
- msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
- msprobe/docs/05.data_dump_PyTorch.md +143 -13
- msprobe/docs/06.data_dump_MindSpore.md +197 -88
- msprobe/docs/07.accuracy_checker_PyTorch.md +69 -46
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +52 -17
- msprobe/docs/09.accuracy_checker_MindSpore.md +51 -15
- msprobe/docs/10.accuracy_compare_PyTorch.md +187 -99
- msprobe/docs/11.accuracy_compare_MindSpore.md +253 -31
- msprobe/docs/12.overflow_check_PyTorch.md +1 -1
- msprobe/docs/13.overflow_check_MindSpore.md +6 -6
- msprobe/docs/15.free_benchmarking_PyTorch.md +60 -55
- msprobe/docs/16.free_benchmarking_MindSpore.md +159 -0
- msprobe/docs/17.grad_probe.md +19 -22
- msprobe/docs/18.online_dispatch.md +89 -0
- msprobe/docs/19.monitor.md +468 -0
- msprobe/docs/20.monitor_performance_baseline.md +52 -0
- msprobe/docs/21.visualization_PyTorch.md +386 -0
- msprobe/docs/22.visualization_MindSpore.md +384 -0
- msprobe/docs/23.tool_function_introduction.md +28 -0
- msprobe/docs/{FAQ_PyTorch.md → FAQ.md} +25 -10
- msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/docs/img/ms_dump.png +0 -0
- msprobe/docs/img/ms_layer.png +0 -0
- msprobe/docs/img/pt_dump.png +0 -0
- msprobe/mindspore/__init__.py +16 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +130 -138
- msprobe/mindspore/api_accuracy_checker/api_info.py +27 -5
- msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
- msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
- msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
- msprobe/mindspore/api_accuracy_checker/main.py +27 -3
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
- msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
- msprobe/mindspore/cell_processor.py +58 -13
- msprobe/mindspore/common/const.py +35 -13
- msprobe/mindspore/common/log.py +5 -9
- msprobe/mindspore/common/utils.py +60 -5
- msprobe/mindspore/compare/distributed_compare.py +15 -28
- msprobe/mindspore/compare/ms_compare.py +319 -158
- msprobe/mindspore/compare/ms_graph_compare.py +99 -49
- msprobe/mindspore/debugger/debugger_config.py +20 -14
- msprobe/mindspore/debugger/precision_debugger.py +43 -13
- msprobe/mindspore/dump/dump_tool_factory.py +18 -1
- msprobe/mindspore/dump/hook_cell/api_registry.py +23 -3
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +203 -0
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +107 -10
- msprobe/mindspore/dump/hook_cell/wrap_api.py +21 -13
- msprobe/mindspore/dump/jit_dump.py +56 -20
- msprobe/mindspore/dump/kernel_graph_dump.py +19 -5
- msprobe/mindspore/dump/kernel_kbyk_dump.py +19 -6
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +162 -41
- msprobe/mindspore/free_benchmark/common/config.py +15 -0
- msprobe/mindspore/free_benchmark/common/handler_params.py +15 -1
- msprobe/mindspore/free_benchmark/common/utils.py +37 -8
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
- msprobe/mindspore/free_benchmark/handler/base_handler.py +20 -5
- msprobe/mindspore/free_benchmark/handler/check_handler.py +21 -7
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +18 -3
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +21 -6
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +23 -8
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +29 -5
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +25 -10
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +45 -19
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +29 -8
- msprobe/mindspore/free_benchmark/perturbation/no_change.py +16 -1
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +22 -7
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +17 -2
- msprobe/mindspore/grad_probe/global_context.py +44 -14
- msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
- msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
- msprobe/mindspore/grad_probe/hook.py +24 -10
- msprobe/mindspore/grad_probe/utils.py +18 -5
- msprobe/mindspore/ms_config.py +22 -15
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +20 -6
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +15 -0
- msprobe/mindspore/runtime.py +15 -0
- msprobe/mindspore/service.py +75 -150
- msprobe/mindspore/task_handler_factory.py +15 -0
- msprobe/msprobe.py +24 -7
- msprobe/pytorch/__init__.py +23 -3
- msprobe/pytorch/api_accuracy_checker/common/config.py +81 -2
- msprobe/pytorch/api_accuracy_checker/common/utils.py +53 -21
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +19 -2
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +50 -25
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +51 -21
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +23 -6
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +28 -8
- msprobe/pytorch/api_accuracy_checker/config.yaml +1 -1
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +73 -33
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +44 -18
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +32 -11
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +122 -172
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +158 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +30 -24
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +68 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +27 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +115 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +26 -9
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/torch_ops_config.yaml +63 -0
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +44 -0
- msprobe/pytorch/bench_functions/__init__.py +18 -3
- msprobe/pytorch/bench_functions/apply_adam_w.py +15 -0
- msprobe/pytorch/bench_functions/confusion_transpose.py +20 -1
- msprobe/pytorch/bench_functions/fast_gelu.py +15 -0
- msprobe/pytorch/bench_functions/layer_norm_eval.py +15 -0
- msprobe/pytorch/bench_functions/linear.py +15 -0
- msprobe/pytorch/bench_functions/matmul_backward.py +33 -6
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +280 -157
- msprobe/pytorch/bench_functions/rms_norm.py +15 -0
- msprobe/pytorch/bench_functions/rotary_mul.py +32 -9
- msprobe/pytorch/bench_functions/scaled_mask_softmax.py +15 -0
- msprobe/pytorch/bench_functions/swiglu.py +29 -6
- msprobe/pytorch/common/__init__.py +15 -0
- msprobe/pytorch/common/log.py +18 -6
- msprobe/pytorch/common/parse_json.py +31 -16
- msprobe/pytorch/common/utils.py +96 -40
- msprobe/pytorch/compare/distributed_compare.py +13 -14
- msprobe/pytorch/compare/match.py +15 -0
- msprobe/pytorch/compare/pt_compare.py +44 -10
- msprobe/pytorch/debugger/debugger_config.py +69 -52
- msprobe/pytorch/debugger/precision_debugger.py +72 -24
- msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/pytorch/free_benchmark/__init__.py +20 -5
- msprobe/pytorch/free_benchmark/common/constant.py +15 -0
- msprobe/pytorch/free_benchmark/common/counter.py +15 -0
- msprobe/pytorch/free_benchmark/common/enums.py +43 -0
- msprobe/pytorch/free_benchmark/common/params.py +23 -1
- msprobe/pytorch/free_benchmark/common/utils.py +43 -5
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +47 -9
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +17 -0
- msprobe/pytorch/free_benchmark/main.py +19 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +15 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +19 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +18 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +21 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +28 -2
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +19 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +15 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +15 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +15 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +65 -16
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +15 -0
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +21 -5
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +15 -0
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +19 -4
- msprobe/pytorch/function_factory.py +17 -2
- msprobe/pytorch/functional/module_dump.py +84 -0
- msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
- msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
- msprobe/pytorch/hook_module/__init__.py +16 -1
- msprobe/pytorch/hook_module/api_registry.py +13 -8
- msprobe/pytorch/hook_module/hook_module.py +17 -19
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
- msprobe/pytorch/hook_module/utils.py +4 -6
- msprobe/pytorch/hook_module/wrap_aten.py +12 -11
- msprobe/pytorch/hook_module/wrap_distributed.py +6 -7
- msprobe/pytorch/hook_module/wrap_functional.py +21 -20
- msprobe/pytorch/hook_module/wrap_npu_custom.py +9 -17
- msprobe/pytorch/hook_module/wrap_tensor.py +4 -6
- msprobe/pytorch/hook_module/wrap_torch.py +4 -6
- msprobe/pytorch/hook_module/wrap_vf.py +4 -6
- msprobe/pytorch/module_processer.py +18 -6
- msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
- msprobe/pytorch/monitor/anomaly_detect.py +340 -0
- msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
- msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
- msprobe/pytorch/monitor/features.py +108 -0
- msprobe/pytorch/monitor/module_hook.py +870 -0
- msprobe/pytorch/monitor/module_metric.py +193 -0
- msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
- msprobe/pytorch/monitor/optimizer_collect.py +295 -0
- msprobe/pytorch/monitor/unittest/__init__.py +0 -0
- msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
- msprobe/pytorch/monitor/utils.py +250 -0
- msprobe/pytorch/monitor/visualizer.py +59 -0
- msprobe/pytorch/online_dispatch/__init__.py +2 -3
- msprobe/pytorch/online_dispatch/compare.py +38 -48
- msprobe/pytorch/online_dispatch/dispatch.py +50 -25
- msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
- msprobe/pytorch/online_dispatch/single_compare.py +60 -39
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +9 -1
- msprobe/pytorch/online_dispatch/utils.py +48 -23
- msprobe/pytorch/parse.py +15 -0
- msprobe/pytorch/parse_tool/cli.py +5 -6
- msprobe/pytorch/parse_tool/lib/compare.py +19 -26
- msprobe/pytorch/parse_tool/lib/config.py +1 -1
- msprobe/pytorch/parse_tool/lib/parse_tool.py +4 -2
- msprobe/pytorch/parse_tool/lib/utils.py +40 -55
- msprobe/pytorch/parse_tool/lib/visualization.py +3 -1
- msprobe/pytorch/pt_config.py +192 -40
- msprobe/pytorch/service.py +110 -35
- msprobe/visualization/__init__.py +14 -0
- msprobe/visualization/builder/__init__.py +14 -0
- msprobe/visualization/builder/graph_builder.py +165 -0
- msprobe/visualization/builder/msprobe_adapter.py +205 -0
- msprobe/visualization/compare/__init__.py +14 -0
- msprobe/visualization/compare/graph_comparator.py +130 -0
- msprobe/visualization/compare/mode_adapter.py +211 -0
- msprobe/visualization/graph/__init__.py +14 -0
- msprobe/visualization/graph/base_node.py +124 -0
- msprobe/visualization/graph/graph.py +200 -0
- msprobe/visualization/graph/node_colors.py +95 -0
- msprobe/visualization/graph/node_op.py +39 -0
- msprobe/visualization/graph_service.py +214 -0
- msprobe/visualization/utils.py +232 -0
- mindstudio_probe-1.0.4.dist-info/RECORD +0 -276
- msprobe/docs/04.acl_config_examples.md +0 -76
- msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -43
- msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -107
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/ssl_config.py +0 -10
- msprobe/pytorch/functional/dump_module.py +0 -39
- {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.0.4.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
- /msprobe/pytorch/{functional/data_processor.py → monitor/distributed/__init__.py} +0 -0
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
from functools import wraps
|
|
@@ -23,6 +21,7 @@ from msprobe.pytorch.hook_module.hook_module import HOOKModule
|
|
|
23
21
|
from msprobe.pytorch.common.utils import torch_device_guard
|
|
24
22
|
from msprobe.core.common.const import Const
|
|
25
23
|
from msprobe.core.common.file_utils import load_yaml
|
|
24
|
+
from msprobe.core.common.inplace_op_checker import InplaceOpChecker
|
|
26
25
|
|
|
27
26
|
|
|
28
27
|
cur_path = os.path.dirname(os.path.realpath(__file__))
|
|
@@ -50,7 +49,7 @@ class DistributedOPTemplate(HOOKModule):
|
|
|
50
49
|
self.op_name_ = op_name
|
|
51
50
|
self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP
|
|
52
51
|
super().__init__(build_hook)
|
|
53
|
-
if not self.stop_hook and self.op_name_
|
|
52
|
+
if not self.stop_hook and InplaceOpChecker.check(self.op_name_, InplaceOpChecker.OP_DISTRIBUTED):
|
|
54
53
|
self.op_is_inplace = True
|
|
55
54
|
|
|
56
55
|
@torch_device_guard
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
import torch
|
|
@@ -32,31 +30,34 @@ def remove_dropout():
|
|
|
32
30
|
from torch import _VF
|
|
33
31
|
from torch.overrides import has_torch_function_unary, handle_torch_function
|
|
34
32
|
|
|
35
|
-
def function_dropout(
|
|
33
|
+
def function_dropout(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
36
34
|
inplace: bool = False) -> torch.Tensor:
|
|
37
|
-
if has_torch_function_unary(
|
|
38
|
-
return handle_torch_function(
|
|
35
|
+
if has_torch_function_unary(input_tensor):
|
|
36
|
+
return handle_torch_function(
|
|
37
|
+
function_dropout, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
39
38
|
if p < 0.0 or p > 1.0:
|
|
40
39
|
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
41
|
-
return _VF.dropout_(
|
|
42
|
-
|
|
40
|
+
return _VF.dropout_(input_tensor, 0., training) if inplace else _VF.dropout(input_tensor, 0., training)
|
|
43
41
|
|
|
44
|
-
def function_dropout2d(
|
|
42
|
+
def function_dropout2d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
45
43
|
inplace: bool = False) -> torch.Tensor:
|
|
46
|
-
if has_torch_function_unary(
|
|
47
|
-
return handle_torch_function(
|
|
44
|
+
if has_torch_function_unary(input_tensor):
|
|
45
|
+
return handle_torch_function(
|
|
46
|
+
function_dropout2d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
48
47
|
if p < 0.0 or p > 1.0:
|
|
49
48
|
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
50
|
-
return _VF.feature_dropout_(
|
|
51
|
-
|
|
49
|
+
return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
|
|
50
|
+
0., training)
|
|
52
51
|
|
|
53
|
-
def function_dropout3d(
|
|
52
|
+
def function_dropout3d(input_tensor: torch.Tensor, p: float = 0.5, training: bool = True,
|
|
54
53
|
inplace: bool = False) -> torch.Tensor:
|
|
55
|
-
if has_torch_function_unary(
|
|
56
|
-
return handle_torch_function(
|
|
54
|
+
if has_torch_function_unary(input_tensor):
|
|
55
|
+
return handle_torch_function(
|
|
56
|
+
function_dropout3d, (input_tensor,), input_tensor, p=0., training=training, inplace=inplace)
|
|
57
57
|
if p < 0.0 or p > 1.0:
|
|
58
58
|
raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
|
|
59
|
-
return _VF.feature_dropout_(
|
|
59
|
+
return _VF.feature_dropout_(input_tensor, 0., training) if inplace else _VF.feature_dropout(input_tensor,
|
|
60
|
+
0., training)
|
|
60
61
|
|
|
61
62
|
F.dropout = function_dropout
|
|
62
63
|
F.dropout2d = function_dropout2d
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
import torch
|
|
@@ -21,24 +19,19 @@ import torch
|
|
|
21
19
|
from msprobe.pytorch.hook_module.hook_module import HOOKModule
|
|
22
20
|
from msprobe.pytorch.common.utils import torch_device_guard, torch_without_guard_version
|
|
23
21
|
from msprobe.core.common.const import Const
|
|
22
|
+
from msprobe.core.common.log import logger
|
|
24
23
|
from msprobe.core.common.file_utils import load_yaml
|
|
25
24
|
from msprobe.pytorch.function_factory import npu_custom_functions
|
|
26
25
|
|
|
27
|
-
cur_path = os.path.dirname(os.path.realpath(__file__))
|
|
28
|
-
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
|
|
29
|
-
|
|
30
|
-
|
|
31
26
|
try:
|
|
32
27
|
import torch_npu
|
|
33
28
|
except ImportError:
|
|
34
|
-
|
|
35
|
-
else:
|
|
36
|
-
is_gpu = False
|
|
29
|
+
logger.info("Failing to import torch_npu.")
|
|
37
30
|
|
|
38
31
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
}
|
|
32
|
+
cur_path = os.path.dirname(os.path.realpath(__file__))
|
|
33
|
+
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
|
|
34
|
+
cuda_func_mapping = {"npu_fusion_attention" : "gpu_fusion_attention"}
|
|
42
35
|
|
|
43
36
|
|
|
44
37
|
def get_npu_ops():
|
|
@@ -83,7 +76,6 @@ class NpuOPTemplate(HOOKModule):
|
|
|
83
76
|
def wrap_npu_op(op_name, hook):
|
|
84
77
|
def npu_op_template(*args, **kwargs):
|
|
85
78
|
return NpuOPTemplate(op_name, hook)(*args, **kwargs)
|
|
86
|
-
|
|
87
79
|
return npu_op_template
|
|
88
80
|
|
|
89
81
|
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
import torch
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
5
|
# you may not use this file except in compliance with the License.
|
|
7
6
|
# You may obtain a copy of the License at
|
|
8
7
|
#
|
|
@@ -13,7 +12,6 @@
|
|
|
13
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
13
|
# See the License for the specific language governing permissions and
|
|
15
14
|
# limitations under the License.
|
|
16
|
-
"""
|
|
17
15
|
|
|
18
16
|
import os
|
|
19
17
|
import torch
|
|
@@ -1,10 +1,25 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
1
16
|
from functools import wraps
|
|
2
17
|
|
|
3
18
|
import torch
|
|
19
|
+
from msprobe.core.common.const import Const
|
|
20
|
+
from msprobe.core.data_dump.scope import ModuleRangeScope, MixRangeScope
|
|
4
21
|
from torch.utils.hooks import BackwardHook
|
|
5
22
|
|
|
6
|
-
from msprobe.core.common.const import Const
|
|
7
|
-
from msprobe.core.data_dump.scope import ModuleRangeScope
|
|
8
23
|
torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
|
|
9
24
|
|
|
10
25
|
|
|
@@ -15,10 +30,7 @@ class ModuleProcesser:
|
|
|
15
30
|
module_node = {}
|
|
16
31
|
|
|
17
32
|
def __init__(self, scope):
|
|
18
|
-
if isinstance(scope, ModuleRangeScope)
|
|
19
|
-
self.scope = scope
|
|
20
|
-
else:
|
|
21
|
-
self.scope = None
|
|
33
|
+
self.scope = scope if isinstance(scope, (ModuleRangeScope, MixRangeScope)) else None
|
|
22
34
|
BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook)
|
|
23
35
|
BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook)
|
|
24
36
|
BackwardHook.setup_output_hook = ModuleProcesser.filter_tensor_and_tuple(BackwardHook.setup_output_hook)
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
import argparse
|
|
19
|
+
import ast
|
|
20
|
+
import heapq
|
|
21
|
+
|
|
22
|
+
from msprobe.core.common.log import logger
|
|
23
|
+
from msprobe.core.common.const import MonitorConst
|
|
24
|
+
from msprobe.core.common.file_utils import check_path_before_create, save_json, create_directory, remove_path, \
|
|
25
|
+
check_file_or_directory_path, load_json
|
|
26
|
+
from msprobe.pytorch.monitor.anomaly_detect import GradAnomalyData
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AnomalyDataWriter:
|
|
30
|
+
"""
|
|
31
|
+
异常数据写入类,负责将异常数据写入到JSON文件中。
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, dump_path, rank) -> None:
|
|
35
|
+
self.dump_path = dump_path
|
|
36
|
+
self.dump_rank_dir = os.path.join(self.dump_path, f"rank{rank}")
|
|
37
|
+
self.json_path = os.path.join(self.dump_rank_dir, MonitorConst.ANOMALY_JSON)
|
|
38
|
+
|
|
39
|
+
@staticmethod
|
|
40
|
+
def get_anomaly_dict(anomalies):
|
|
41
|
+
"""将GradAnomalyData列表转换为json"""
|
|
42
|
+
anomalies_json = {}
|
|
43
|
+
for anomaly in anomalies:
|
|
44
|
+
anomalies_json.update({anomaly.get_key(): anomaly.to_dict()})
|
|
45
|
+
return anomalies_json
|
|
46
|
+
|
|
47
|
+
def init_detected_json(self):
|
|
48
|
+
"""初始化落盘文件"""
|
|
49
|
+
check_path_before_create(self.dump_path)
|
|
50
|
+
if not os.path.exists(self.dump_path):
|
|
51
|
+
create_directory(self.dump_path)
|
|
52
|
+
|
|
53
|
+
if not os.path.exists(self.dump_rank_dir):
|
|
54
|
+
create_directory(self.dump_rank_dir)
|
|
55
|
+
|
|
56
|
+
if os.path.exists(self.json_path):
|
|
57
|
+
check_file_or_directory_path(self.json_path, isdir=False)
|
|
58
|
+
logger.warning(f"The existing file will be deleted: {self.json_path}.")
|
|
59
|
+
remove_path(self.json_path)
|
|
60
|
+
save_json(self.json_path, {}, indent=1)
|
|
61
|
+
|
|
62
|
+
def write_detected_json(self, anomalies):
|
|
63
|
+
"""
|
|
64
|
+
落盘异常数据
|
|
65
|
+
Args:
|
|
66
|
+
anomalies: GradAnomalyData对象列表
|
|
67
|
+
"""
|
|
68
|
+
anomalies_json = self.get_anomaly_dict(anomalies)
|
|
69
|
+
logger.info(f"{MonitorConst.ANOMALY_JSON} is at {self.dump_rank_dir}.")
|
|
70
|
+
|
|
71
|
+
data_to_write = load_json(self.json_path) if os.path.exists(self.json_path) else {}
|
|
72
|
+
data_to_write.update(anomalies_json)
|
|
73
|
+
save_json(self.json_path, data_to_write, indent=1)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class AnomalyDataLoader:
    """Loads anomaly records from the JSON files found below a data directory."""

    def __init__(self, data_path) -> None:
        # Directory whose sub-directories are scanned for anomaly JSON files.
        self.data_path = data_path

    @staticmethod
    def create_instances_from_dict(anomalies_dict: dict):
        """Build GradAnomalyData objects from the values of ``anomalies_dict``.

        Each value is unpacked as keyword arguments for ``GradAnomalyData``.
        Values that fail to produce an instance are skipped with a warning.

        Returns:
            list of the GradAnomalyData objects that could be created.
        """
        created = []
        for fields in anomalies_dict.values():
            try:
                record = GradAnomalyData(**fields)
            except KeyError as e:
                logger.warning(f"Missing key in anomaly data: {e}.")
            except Exception as e:
                logger.warning(f"Value error when creating a GradAnomalyData instance: {e}.")
            else:
                created.append(record)
        return created

    def get_anomalies_from_jsons(self):
        """Walk ``self.data_path`` and read anomalies from each sub-directory's anomaly JSON.

        Entries that are not directories, and directories that do not contain
        ``MonitorConst.ANOMALY_JSON``, are ignored.

        Returns:
            anomalies: list of GradAnomalyData objects.
        """
        check_file_or_directory_path(self.data_path, isdir=True)

        collected = []
        for entry in os.listdir(self.data_path):
            sub_dir = os.path.join(self.data_path, entry)
            if os.path.isdir(sub_dir):
                candidate = os.path.join(sub_dir, MonitorConst.ANOMALY_JSON)
                if os.path.exists(candidate):
                    loaded = load_json(candidate)
                    collected.extend(self.create_instances_from_dict(loaded))
        return collected
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class AnomalyAnalyse:
    """Selects the smallest (by the anomalies' own ordering) anomalies and writes them out."""

    def __init__(self) -> None:
        # Result of the most recent get_range_top_k call.
        self.sorted_anomalies = []

    def get_range_top_k(self, topk, step_list, anomalies):
        """Return the first ``topk`` anomalies, in sorted order, whose step is in ``step_list``.

        An empty ``step_list`` disables the step filter. The result is also
        stored in ``self.sorted_anomalies``.
        """
        candidates = (
            [item for item in anomalies if item.step in step_list]
            if step_list
            else anomalies
        )
        # A full sort is used when everything is kept; otherwise a heap selection.
        ranked = (
            sorted(candidates)
            if topk >= len(candidates)
            else list(heapq.nsmallest(topk, candidates))
        )
        self.sorted_anomalies = ranked
        return self.sorted_anomalies

    def rewrite_sorted_anomalies(self, output_path):
        """Write the sorted anomalies to ``MonitorConst.ANALYSE_JSON`` inside ``output_path``.

        An already existing result file is removed first (a warning is logged).
        """
        check_file_or_directory_path(output_path, isdir=True)

        sorted_data = AnomalyDataWriter.get_anomaly_dict(self.sorted_anomalies)
        logger.info(f"{MonitorConst.ANALYSE_JSON} is at {output_path}.")

        json_path = os.path.join(output_path, MonitorConst.ANALYSE_JSON)
        stale_file_present = os.path.exists(json_path)
        if stale_file_present:
            logger.warning(f"The existing file will be deleted: {json_path}.")
            remove_path(json_path)
        save_json(json_path, sorted_data, indent=1)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _get_parse_args():
|
|
149
|
+
parser = argparse.ArgumentParser()
|
|
150
|
+
parser.add_argument("-d", "--data_path", dest="data_path_dir", default="./", type=str,
|
|
151
|
+
help="<Required> The anomaly detect result dictionary: generate from monitor tool.",
|
|
152
|
+
required=True,
|
|
153
|
+
)
|
|
154
|
+
parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str,
|
|
155
|
+
help="<optional> The analyse task result out path.",
|
|
156
|
+
required=False,
|
|
157
|
+
)
|
|
158
|
+
parser.add_argument("-k", "--topk", dest="top_k_number", default=8, type=int,
|
|
159
|
+
help="<optional> Top K number of earliest anomalies.",
|
|
160
|
+
required=False,
|
|
161
|
+
)
|
|
162
|
+
parser.add_argument("-s", "--step", dest="step_list", default="[]", type=str,
|
|
163
|
+
help="<optional> Analyse which steps.",
|
|
164
|
+
required=False,
|
|
165
|
+
)
|
|
166
|
+
return parser.parse_args(sys.argv[1:])
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _get_step_and_stop(args):
|
|
170
|
+
try:
|
|
171
|
+
step_list = ast.literal_eval(args.step_list)
|
|
172
|
+
if not isinstance(step_list, list):
|
|
173
|
+
raise ValueError(f"{args.step_list} is not a list.")
|
|
174
|
+
except (ValueError, SyntaxError, RecursionError) as e:
|
|
175
|
+
raise Exception(f"The step list must be a resolvable list type.") from e
|
|
176
|
+
if args.top_k_number <= 0:
|
|
177
|
+
raise Exception("The top k number must be greater than 0.")
|
|
178
|
+
return step_list, args.top_k_number
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _anomaly_analyse():
    """Command-line workflow: load the anomalies, keep the top-k, write them out and log them."""
    cli_args = _get_parse_args()
    wanted_steps, top_k_number = _get_step_and_stop(cli_args)

    collected = AnomalyDataLoader(cli_args.data_path_dir).get_anomalies_from_jsons()

    analyser = AnomalyAnalyse()
    top_anomalies = analyser.get_range_top_k(top_k_number, wanted_steps, collected)

    # The result is written into the input directory unless an output path was given.
    analyser.rewrite_sorted_anomalies(cli_args.out_path or cli_args.data_path_dir)

    logger.info(f"Top {top_k_number} anomalies are listed as follows:")
    for index, anomaly in enumerate(top_anomalies):
        logger.info(f"{index}: {anomaly.message}")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# Script entry point: run the analysis only when this file is executed directly.
if __name__ == "__main__":
    _anomaly_analyse()
    # Reached only when the analysis finished without raising.
    logger.info("Analyse task completed.")
|