mindstudio-probe 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindstudio_probe-1.0.1.dist-info/LICENSE +201 -0
- mindstudio_probe-1.0.1.dist-info/METADATA +30 -0
- mindstudio_probe-1.0.1.dist-info/RECORD +228 -0
- mindstudio_probe-1.0.1.dist-info/WHEEL +5 -0
- mindstudio_probe-1.0.1.dist-info/entry_points.txt +2 -0
- mindstudio_probe-1.0.1.dist-info/top_level.txt +1 -0
- msprobe/README.md +182 -0
- msprobe/__init__.py +0 -0
- msprobe/config/README.md +397 -0
- msprobe/config/config.json +28 -0
- msprobe/config/img/free_benchmark.png +0 -0
- msprobe/core/common/const.py +241 -0
- msprobe/core/common/exceptions.py +88 -0
- msprobe/core/common/file_check.py +265 -0
- msprobe/core/common/log.py +55 -0
- msprobe/core/common/utils.py +516 -0
- msprobe/core/common_config.py +58 -0
- msprobe/core/data_dump/data_collector.py +140 -0
- msprobe/core/data_dump/data_processor/base.py +245 -0
- msprobe/core/data_dump/data_processor/factory.py +61 -0
- msprobe/core/data_dump/data_processor/pytorch_processor.py +346 -0
- msprobe/core/data_dump/json_writer.py +116 -0
- msprobe/core/data_dump/scope.py +178 -0
- msprobe/mindspore/__init__.py +1 -0
- msprobe/mindspore/debugger/__init__.py +0 -0
- msprobe/mindspore/debugger/debugger_config.py +51 -0
- msprobe/mindspore/debugger/precision_debugger.py +32 -0
- msprobe/mindspore/doc/dump.md +65 -0
- msprobe/mindspore/dump/__init__.py +0 -0
- msprobe/mindspore/dump/api_kbk_dump.py +55 -0
- msprobe/mindspore/dump/dump_tool_factory.py +38 -0
- msprobe/mindspore/dump/kernel_graph_dump.py +60 -0
- msprobe/mindspore/ms_config.py +78 -0
- msprobe/mindspore/overflow_check/__init__.py +0 -0
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +45 -0
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +32 -0
- msprobe/mindspore/task_handler_factory.py +21 -0
- msprobe/msprobe.py +67 -0
- msprobe/pytorch/__init__.py +4 -0
- msprobe/pytorch/advisor/advisor.py +124 -0
- msprobe/pytorch/advisor/advisor_const.py +59 -0
- msprobe/pytorch/advisor/advisor_result.py +58 -0
- msprobe/pytorch/api_accuracy_checker/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/common/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/common/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +50 -0
- msprobe/pytorch/api_accuracy_checker/common/utils.py +224 -0
- msprobe/pytorch/api_accuracy_checker/compare/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +216 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +545 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +133 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml +390 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +345 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +74 -0
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +249 -0
- msprobe/pytorch/api_accuracy_checker/config.yaml +4 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/.keep +0 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/__init__.py +0 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +328 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +203 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +127 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +493 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +7 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json +5 -0
- msprobe/pytorch/common/__init__.py +2 -0
- msprobe/pytorch/common/compare_script.template +14 -0
- msprobe/pytorch/common/log.py +32 -0
- msprobe/pytorch/common/parse_json.py +37 -0
- msprobe/pytorch/common/utils.py +224 -0
- msprobe/pytorch/compare/acc_compare.py +1024 -0
- msprobe/pytorch/compare/distributed_compare.py +111 -0
- msprobe/pytorch/compare/highlight.py +100 -0
- msprobe/pytorch/compare/mapping.yaml +607 -0
- msprobe/pytorch/compare/match.py +36 -0
- msprobe/pytorch/compare/npy_compare.py +244 -0
- msprobe/pytorch/debugger/__init__.py +0 -0
- msprobe/pytorch/debugger/debugger_config.py +86 -0
- msprobe/pytorch/debugger/precision_debugger.py +95 -0
- msprobe/pytorch/doc/FAQ.md +193 -0
- msprobe/pytorch/doc/api_accuracy_checker.md +269 -0
- msprobe/pytorch/doc/atat/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/205/342/225/226/320/265/321/205/320/225/342/225/226/321/206/320/245/342/226/221/321/206/320/235/320/276dump/321/206/320/260/320/227/321/205/320/227/320/226/321/206/320/220/320/267/321/210/320/223/342/225/234/321/205/320/257/342/225/221/321/207/342/225/221/342/224/220/321/206/320/232/320/265/321/205/320/241/320/232.md +182 -0
- msprobe/pytorch/doc/dump.md +207 -0
- msprobe/pytorch/doc/img/BLOOM-7B_1.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_2.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_3.png +0 -0
- msprobe/pytorch/doc/img/BLOOM-7B_4.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_1.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_2.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_3.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_4.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_5.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_6.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_7.png +0 -0
- msprobe/pytorch/doc/img/GPT-3_8.png +0 -0
- msprobe/pytorch/doc/img/YOLOV5S_1.png +0 -0
- msprobe/pytorch/doc/img/YOLOV5S_2.png +0 -0
- msprobe/pytorch/doc/img/accuracy_checking_details.png +0 -0
- msprobe/pytorch/doc/img/accuracy_checking_result.png +0 -0
- msprobe/pytorch/doc/img/api_precision_compare_details.png +0 -0
- msprobe/pytorch/doc/img/api_precision_compare_result.png +0 -0
- msprobe/pytorch/doc/img/auto_analyze_log.png +0 -0
- msprobe/pytorch/doc/img/compare_result_pkl.png +0 -0
- msprobe/pytorch/doc/img/compare_result_pkl_md5.png.png +0 -0
- msprobe/pytorch/doc/img/cpu_info.png +0 -0
- msprobe/pytorch/doc/img/module_compare.png +0 -0
- msprobe/pytorch/doc/parse_tool.md +286 -0
- msprobe/pytorch/doc/ptdbg_ascend_compare.md +176 -0
- msprobe/pytorch/doc/ptdbg_ascend_overview.md +68 -0
- msprobe/pytorch/doc/ptdbg_ascend_quickstart.md +381 -0
- msprobe/pytorch/doc/run_overflow_check.md +25 -0
- msprobe/pytorch/doc//321/205/320/254/320/270/321/207/342/225/221/342/224/220/321/207/342/226/223/342/225/233/321/205/342/225/221/320/266/321/206/320/277/320/244/321/205/320/277/342/225/243.md +90 -0
- msprobe/pytorch/free_benchmark/__init__.py +8 -0
- msprobe/pytorch/free_benchmark/common/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/common/constant.py +67 -0
- msprobe/pytorch/free_benchmark/common/counter.py +72 -0
- msprobe/pytorch/free_benchmark/common/enums.py +37 -0
- msprobe/pytorch/free_benchmark/common/params.py +129 -0
- msprobe/pytorch/free_benchmark/common/utils.py +98 -0
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +183 -0
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +104 -0
- msprobe/pytorch/free_benchmark/main.py +102 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +13 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +41 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +90 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +104 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +63 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +68 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +28 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +45 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +19 -0
- msprobe/pytorch/free_benchmark/result_handlers/__init__.py +0 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +203 -0
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +39 -0
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +24 -0
- msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +31 -0
- msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +170 -0
- msprobe/pytorch/functional/__init__.py +0 -0
- msprobe/pytorch/functional/data_processor.py +0 -0
- msprobe/pytorch/functional/dump_module.py +39 -0
- msprobe/pytorch/hook_module/__init__.py +1 -0
- msprobe/pytorch/hook_module/api_registry.py +161 -0
- msprobe/pytorch/hook_module/hook_module.py +109 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +1876 -0
- msprobe/pytorch/hook_module/utils.py +29 -0
- msprobe/pytorch/hook_module/wrap_aten.py +100 -0
- msprobe/pytorch/hook_module/wrap_distributed.py +75 -0
- msprobe/pytorch/hook_module/wrap_functional.py +108 -0
- msprobe/pytorch/hook_module/wrap_npu_custom.py +73 -0
- msprobe/pytorch/hook_module/wrap_tensor.py +72 -0
- msprobe/pytorch/hook_module/wrap_torch.py +88 -0
- msprobe/pytorch/hook_module/wrap_vf.py +64 -0
- msprobe/pytorch/module_processer.py +98 -0
- msprobe/pytorch/online_dispatch/__init__.py +20 -0
- msprobe/pytorch/online_dispatch/compare.py +236 -0
- msprobe/pytorch/online_dispatch/dispatch.py +274 -0
- msprobe/pytorch/online_dispatch/dump_compare.py +186 -0
- msprobe/pytorch/online_dispatch/single_compare.py +391 -0
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +50 -0
- msprobe/pytorch/online_dispatch/utils.py +187 -0
- msprobe/pytorch/parse.py +4 -0
- msprobe/pytorch/parse_tool/__init__.py +0 -0
- msprobe/pytorch/parse_tool/cli.py +32 -0
- msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
- msprobe/pytorch/parse_tool/lib/compare.py +259 -0
- msprobe/pytorch/parse_tool/lib/config.py +51 -0
- msprobe/pytorch/parse_tool/lib/file_desc.py +31 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +102 -0
- msprobe/pytorch/parse_tool/lib/parse_exception.py +54 -0
- msprobe/pytorch/parse_tool/lib/parse_tool.py +158 -0
- msprobe/pytorch/parse_tool/lib/utils.py +367 -0
- msprobe/pytorch/parse_tool/lib/visualization.py +90 -0
- msprobe/pytorch/pt_config.py +93 -0
- msprobe/pytorch/service.py +167 -0
- msprobe/test/core_ut/common/test_utils.py +345 -0
- msprobe/test/core_ut/data_dump/test_data_collector.py +47 -0
- msprobe/test/core_ut/data_dump/test_json_writer.py +183 -0
- msprobe/test/core_ut/data_dump/test_scope.py +151 -0
- msprobe/test/core_ut/test_common_config.py +152 -0
- msprobe/test/core_ut/test_file_check.py +218 -0
- msprobe/test/core_ut/test_log.py +109 -0
- msprobe/test/mindspore_ut/test_api_kbk_dump.py +51 -0
- msprobe/test/mindspore_ut/test_debugger_config.py +42 -0
- msprobe/test/mindspore_ut/test_dump_tool_factory.py +51 -0
- msprobe/test/mindspore_ut/test_kernel_graph_dump.py +66 -0
- msprobe/test/mindspore_ut/test_kernel_graph_overflow_check.py +63 -0
- msprobe/test/mindspore_ut/test_ms_config.py +69 -0
- msprobe/test/mindspore_ut/test_overflow_check_tool_factory.py +51 -0
- msprobe/test/mindspore_ut/test_precision_debugger.py +56 -0
- msprobe/test/mindspore_ut/test_task_handler_factory.py +58 -0
- msprobe/test/pytorch_ut/advisor/test_advisor.py +83 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/common/test_common_utils.py +108 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/common/test_config.py +39 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_algorithm.py +112 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_api_precision_compare.py +77 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare.py +125 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_column.py +10 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/compare/test_compare_utils.py +43 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/dump.json +179 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/forward.json +63 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_data_generate.py +99 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_multi_run_ut.py +115 -0
- msprobe/test/pytorch_ut/api_accuracy_checker/run_ut/test_run_ut.py +72 -0
- msprobe/test/pytorch_ut/compare/test_acc_compare.py +17 -0
- msprobe/test/pytorch_ut/free_benchmark/perturbed_layers/test_perturbed_layser.py +105 -0
- msprobe/test/pytorch_ut/free_benchmark/result_handlers/test_result_handler.py +121 -0
- msprobe/test/pytorch_ut/free_benchmark/test_main.py +101 -0
- msprobe/test/pytorch_ut/functional/test_dump_module.py +15 -0
- msprobe/test/pytorch_ut/hook_module/test_api_registry.py +130 -0
- msprobe/test/pytorch_ut/hook_module/test_hook_module.py +42 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_aten.py +65 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_distributed.py +35 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_functional.py +20 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_tensor.py +35 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_torch.py +43 -0
- msprobe/test/pytorch_ut/hook_module/test_wrap_vf.py +11 -0
- msprobe/test/pytorch_ut/test_pt_config.py +69 -0
- msprobe/test/pytorch_ut/test_service.py +59 -0
- msprobe/test/resources/advisor.txt +3 -0
- msprobe/test/resources/compare_result_20230703104808.csv +9 -0
- msprobe/test/resources/compare_result_without_accuracy.csv +9 -0
- msprobe/test/resources/config.yaml +3 -0
- msprobe/test/resources/npu_test.pkl +8 -0
- msprobe/test/run_test.sh +30 -0
- msprobe/test/run_ut.py +58 -0
- msprobe/test/test_module_processer.py +64 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import numpy as np
|
|
3
|
+
from msprobe.core.common.utils import format_value
|
|
4
|
+
from msprobe.core.common.const import Const, CompareConst
|
|
5
|
+
from msprobe.pytorch.common.log import logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def handle_inf_nan(n_value, b_value):
    """Neutralise matching inf/nan entries so the remaining data can be compared.

    Args:
        n_value: NPU data as a numpy array.
        b_value: Bench data as a numpy array.

    Returns:
        ``(n_value, b_value)``. When neither array holds inf/nan the inputs are
        returned untouched. When both hold inf/nan at exactly the same
        positions and the infinities carry the same sign, those entries are
        overwritten with 0 (the arrays are modified in place) and returned.
        Otherwise ``(CompareConst.NAN, CompareConst.NAN)`` is returned and the
        arrays are left unchanged.
    """
    n_inf = np.isinf(n_value)
    b_inf = np.isinf(b_value)
    n_nan = np.isnan(n_value)
    b_nan = np.isnan(b_value)
    if not (np.any(n_inf) or np.any(n_nan) or np.any(b_inf) or np.any(b_nan)):
        return n_value, b_value

    same_positions = np.array_equal(n_inf, b_inf) and np.array_equal(n_nan, b_nan)
    # Positions alone are not enough: +inf on one side and -inf on the other is
    # a genuine mismatch, so the infinite values themselves must agree as well.
    if not (same_positions and np.array_equal(n_value[n_inf], b_value[b_inf])):
        return CompareConst.NAN, CompareConst.NAN

    # The masks are identical for both arrays at this point.
    invalid = n_inf | n_nan
    n_value[invalid] = 0
    b_value[invalid] = 0
    return n_value, b_value
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_error_type(n_value, b_value, error_flag):
    """Check a NPU/bench pair for conditions that make it uncomparable.

    Args:
        n_value: NPU data (numpy array); not inspected when error_flag is set.
        b_value: Bench data (numpy array).
        error_flag: True when the caller already failed to read the data.

    Returns:
        ``(n_value, b_value, error_flag)``. On failure both values are replaced
        by the same CompareConst marker (READ_NONE, NONE, SHAPE_UNMATCH or NAN)
        and the flag is True; otherwise the (possibly inf/nan-cleaned) arrays
        are returned with the flag False.
    """
    def _failed(marker):
        return marker, marker, True

    if error_flag:
        return _failed(CompareConst.READ_NONE)
    # Nothing was read for this tensor.
    if n_value.size == 0:
        return _failed(CompareConst.NONE)
    # NPU and bench layouts differ.
    if n_value.shape != b_value.shape:
        return _failed(CompareConst.SHAPE_UNMATCH)
    # Scalars (empty shape) skip the inf/nan handling.
    if not n_value.shape:
        return n_value, b_value, False

    cleaned_n, cleaned_b = handle_inf_nan(n_value, b_value)
    # Identity check on purpose: the values may be arrays, where == is elementwise.
    if cleaned_n is CompareConst.NAN or cleaned_b is CompareConst.NAN:
        return _failed(CompareConst.NAN)
    return cleaned_n, cleaned_b, False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def reshape_value(n_value, b_value):
    """Return the pair in the form used by the comparison operators.

    Arrays with a non-empty shape are flattened to 1-D and cast to float.
    Scalars (empty shape) are cast to float only when the NPU value is
    boolean; any other scalar pair is returned unchanged.
    """
    if n_value.shape:
        flat_n = n_value.reshape(-1).astype(float)
        flat_b = b_value.reshape(-1).astype(float)
        return flat_n, flat_b

    if n_value.dtype == bool:
        return n_value.astype(float), b_value.astype(float)
    return n_value, b_value
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_error_message(n_value, b_value, op_name, error_flag, error_file=None):
    """Build the human-readable message for a NPU/bench pair.

    Args:
        n_value: NPU data, or a CompareConst marker when error_flag is True.
        b_value: Bench data; only its dtype is read, and only without error.
        op_name: Operator name, used in the dtype-mismatch warning.
        error_flag: Whether an earlier step flagged this pair as uncomparable.
        error_file: Optional name of the dump file that could not be found.

    Returns:
        The message string; empty when there is nothing to report.
    """
    if not error_flag:
        if not n_value.shape:
            return "This is type of scalar data, can not compare."
        if n_value.dtype != b_value.dtype:
            logger.warning("Dtype of NPU and bench Tensor do not match: {}".format(op_name))
            return "Dtype of NPU and bench Tensor do not match."
        return ""

    if n_value == CompareConst.READ_NONE:
        if not error_file:
            return CompareConst.NO_BENCH
        return "Dump file: {} not found.".format(error_file)
    if n_value == CompareConst.NONE:
        return "This is empty data, can not compare."
    if n_value == CompareConst.SHAPE_UNMATCH:
        return "Shape of NPU and bench Tensor do not match. Skipped."
    if n_value == CompareConst.NAN:
        return "The position of inf or nan in NPU and bench Tensor do not match."
    return ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TensorComparisonBasic(abc.ABC):
    """Template for operators that compare NPU and bench npy data."""

    @abc.abstractmethod
    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Compare one NPU/bench pair; concrete operators must override this."""
        raise NotImplementedError()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class GetCosineSimilarity(TensorComparisonBasic):
    """Cosine similarity between NPU and bench data."""

    @staticmethod
    def correct_data(result):
        """Snap a similarity above CompareConst.COSINE_THRESHOLD to 1.0.

        The CompareConst.NAN marker is passed through untouched.
        """
        if result == CompareConst.NAN:
            return result
        return 1.0 if float(result) > CompareConst.COSINE_THRESHOLD else result

    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Return ``(similarity_or_marker, message)`` for one NPU/bench pair."""
        if error_flag:
            # Map the marker from the error check to this column's value.
            for marker, outcome in (
                (CompareConst.READ_NONE, CompareConst.NONE),
                (CompareConst.NONE, CompareConst.UNSUPPORTED),
                (CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH),
                (CompareConst.NAN, "N/A"),
            ):
                if n_value == marker:
                    return outcome, ''

        if not n_value.shape:
            return CompareConst.UNSUPPORTED, ''

        with np.errstate(divide='ignore', invalid='ignore'):
            if len(n_value) == 1:
                return CompareConst.UNSUPPORTED, "This tensor is scalar."
            dot_product = n_value.dot(b_value)
            n_norm = np.linalg.norm(n_value)
            bench_norm = np.linalg.norm(b_value)

            n_is_zero = n_norm <= Const.FLOAT_EPSILON
            bench_is_zero = bench_norm <= Const.FLOAT_EPSILON
            if n_is_zero and bench_is_zero:
                return 1.0, ''
            if n_is_zero:
                return CompareConst.NAN, 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.'
            if bench_is_zero:
                return CompareConst.NAN, 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.'

            cos = dot_product / (n_norm * bench_norm)
            if np.isnan(cos):
                return CompareConst.NAN, 'Cannot compare by Cosine Similarity, the dump data has NaN.'
            result = self.correct_data(format_value(cos))

        if float(result) > 0.99999:
            return 1.0, ''
        return result, ''
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class GetMaxAbsErr(TensorComparisonBasic):
    """Maximum absolute error between NPU and bench data."""

    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Return ``(max_abs_error_or_marker, message)`` for one NPU/bench pair."""
        if error_flag:
            # Map the marker from the error check to this column's value.
            for marker, outcome in (
                (CompareConst.READ_NONE, CompareConst.NONE),
                (CompareConst.NONE, 0),
                (CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH),
                (CompareConst.NAN, "N/A"),
            ):
                if n_value == marker:
                    return outcome, ""

        largest_gap = np.max(np.abs(n_value - b_value))
        return format_value(largest_gap), ""
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def get_relative_err(n_value, b_value):
    """Compute the element-wise absolute relative error ``|n - b| / |b|``.

    Positions where the bench value is exactly 0 have the machine epsilon of
    the bench dtype added to both operands before dividing, so the division is
    finite there. The computation runs on private copies, so the caller's
    arrays are never modified.

    Args:
        n_value: NPU data as a numpy array.
        b_value: Bench data as a numpy array.

    Returns:
        A numpy array holding the absolute relative error of every element.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        if b_value.dtype not in CompareConst.FLOAT_TYPE:
            # astype() already returns fresh arrays.
            n_value, b_value = n_value.astype(float), b_value.astype(float)
        else:
            b_value = b_value.copy()
            if n_value.dtype in CompareConst.FLOAT_TYPE:
                n_value = n_value.copy()
            else:
                # Adding eps in place to an integer/bool array raises a casting
                # error, so promote the NPU data to float first.
                n_value = n_value.astype(float)
        zero_mask = (b_value == 0)
        eps = np.finfo(b_value.dtype).eps
        b_value[zero_mask] += eps
        n_value[zero_mask] += eps
        relative_err = np.divide((n_value - b_value), b_value)
    return np.abs(relative_err)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class GetMaxRelativeErr(TensorComparisonBasic):
    """Maximum relative error between NPU and bench data."""

    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Return ``(max_relative_error_or_marker, message)`` for one pair.

        A precomputed relative_err array is used when given; otherwise it is
        derived from n_value and b_value.
        """
        if error_flag:
            # Map the marker from the error check to this column's value.
            for marker, outcome in (
                (CompareConst.READ_NONE, CompareConst.NONE),
                (CompareConst.NONE, 0),
                (CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH),
                (CompareConst.NAN, "N/A"),
            ):
                if n_value == marker:
                    return outcome, ''

        if relative_err is None:
            relative_err = get_relative_err(n_value, b_value)
        max_relative_err = np.max(np.abs(relative_err))
        if not np.isnan(max_relative_err):
            return format_value(max_relative_err), ''
        message = 'Cannot compare by MaxRelativeError, the data contains nan in dump data.'
        return CompareConst.NAN, message
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class GetThousandErrRatio(TensorComparisonBasic):
    """Share of elements whose relative error is below one thousandth."""

    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Return ``(ratio_or_marker, message)`` for one NPU/bench pair.

        The ratio counts elements whose relative error is strictly below
        CompareConst.THOUSAND_RATIO_THRESHOLD.
        """
        if error_flag:
            # Map the marker from the error check to this column's value.
            for marker, outcome in (
                (CompareConst.READ_NONE, CompareConst.NONE),
                (CompareConst.NONE, 0),
                (CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH),
                (CompareConst.NAN, "N/A"),
            ):
                if n_value == marker:
                    return outcome, ""

        # Scalars have no ratio.
        if not n_value.shape:
            return CompareConst.NAN, ""
        if relative_err is None:
            relative_err = get_relative_err(n_value, b_value)
        total = np.size(relative_err)
        if not total:
            return CompareConst.NAN, ""
        within = np.sum(relative_err < CompareConst.THOUSAND_RATIO_THRESHOLD)
        return format_value(within / total), ""
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class GetFiveThousandErrRatio(TensorComparisonBasic):
    """Share of elements whose relative error is below five thousandths."""

    def apply(self, n_value, b_value, error_flag, relative_err=None):
        """Return ``(ratio_or_marker, message)`` for one NPU/bench pair.

        The ratio counts elements whose relative error is strictly below
        CompareConst.FIVE_THOUSAND_RATIO_THRESHOLD.
        """
        if error_flag:
            # Map the marker from the error check to this column's value.
            for marker, outcome in (
                (CompareConst.READ_NONE, CompareConst.NONE),
                (CompareConst.NONE, 0),
                (CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH),
                (CompareConst.NAN, "N/A"),
            ):
                if n_value == marker:
                    return outcome, ""

        # Scalars have no ratio.
        if not n_value.shape:
            return CompareConst.NAN, ""
        if relative_err is None:
            relative_err = get_relative_err(n_value, b_value)
        total = np.size(relative_err)
        if not total:
            return CompareConst.NAN, ""
        within = np.sum(relative_err < CompareConst.FIVE_THOUSAND_RATIO_THRESHOLD)
        return format_value(within / total), ""
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class CompareOps:
    """Registry of the comparison operators, keyed by result-column name."""

    # Insertion order matters: results are produced in this order.
    compare_ops = dict(
        cosine_similarity=GetCosineSimilarity(),
        max_abs_error=GetMaxAbsErr(),
        max_relative_error=GetMaxRelativeErr(),
        one_thousand_err_ratio=GetThousandErrRatio(),
        five_thousand_err_ratio=GetFiveThousandErrRatio(),
    )
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def compare_ops_apply(n_value, b_value, error_flag, err_msg, relative_err=None):
    """Run every registered comparison operator on one NPU/bench pair.

    Returns:
        ``(result_list, err_msg)``: the operator results in registry order, and
        err_msg with each operator's message appended.
    """
    outcomes = [
        op.apply(n_value, b_value, error_flag, relative_err=relative_err)
        for op in CompareOps.compare_ops.values()
    ]
    result_list = [result for result, _ in outcomes]
    err_msg += "".join(msg for _, msg in outcomes)
    return result_list, err_msg
|
|
File without changes
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from msprobe.pytorch.common import seed_all
|
|
2
|
+
from msprobe.pytorch.common.log import logger
|
|
3
|
+
from msprobe.core.common.const import Const
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DebuggerConfig:
    """Effective configuration of the PyTorch precision debugger.

    Merges the explicit constructor arguments with the parsed common/task
    configuration objects, fills in defaults, validates the result and seeds
    the random number generators through ``seed_all``.
    """

    @staticmethod
    def _seed_or_default(seed):
        """Return *seed*, falling back to 1234 only when it is None.

        A truthiness test would also discard the perfectly valid seed 0.
        """
        return 1234 if seed is None else seed

    def __init__(self, common_config, task_config, task, dump_path, level):
        """Resolve every setting; explicit arguments win over the config objects.

        Args:
            common_config: Parsed common section of the configuration.
            task_config: Parsed task-specific section of the configuration.
            task: Task name override, or None.
            dump_path: Dump directory override, or None.
            level: Dump level override, or None.

        Raises:
            ValueError: If rank/step entries are invalid, or the L2 scope /
                backward_input settings are inconsistent.
            Exception: If task, level or dump path are invalid.
        """
        self.dump_path = dump_path if dump_path else common_config.dump_path
        self.task = task or common_config.task or Const.STATISTICS
        self.rank = common_config.rank if common_config.rank else []
        self.step = common_config.step if common_config.step else []
        self.level = level or common_config.level or "L1"
        self.seed = self._seed_or_default(common_config.seed)
        self.is_deterministic = common_config.is_deterministic
        self.enable_dataloader = common_config.enable_dataloader
        self.scope = task_config.scope if task_config.scope else []
        self.list = task_config.list if task_config.list else []
        self.data_mode = task_config.data_mode if task_config.data_mode else ["all"]
        self.backward_input_list = task_config.backward_input if task_config.backward_input else []
        self.backward_input = {}
        self.acl_config = common_config.acl_config if common_config.acl_config else ""
        self.is_forward_acl_dump = True
        self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS
        self.overflow_num = task_config.overflow_num if task_config.overflow_num else 1
        self.framework = Const.PT_FRAMEWORK

        if self.task == Const.FREE_BENCHMARK:
            self.fuzz_device = task_config.fuzz_device if task_config.fuzz_device else 'npu'
            self.handler_type = task_config.handler_type if task_config.handler_type else 'check'
            self.pert_mode = task_config.pert_mode if task_config.pert_mode else 'improve_precision'
            self.fuzz_level = task_config.fuzz_level if task_config.fuzz_level else 'L1'
            self.fuzz_stage = task_config.fuzz_stage if task_config.fuzz_stage else 'forward'
            self.preheat_config = {
                "if_preheat": task_config.if_preheat if task_config.if_preheat is not None else True,
                "preheat_step": task_config.preheat_step if task_config.preheat_step else 15,
                "max_sample": task_config.max_sample if task_config.max_sample else 20,
            }

        self.check()
        if self.step:
            self.step.sort()
        if self.level == "L2":
            # A non-string entry would otherwise fail with a bare TypeError in
            # the substring tests below instead of the intended ValueError.
            if (not self.scope or not isinstance(self.scope, list) or len(self.scope) != 1
                    or not isinstance(self.scope[0], str)):
                raise ValueError("scope must be configured as a list with one api name")
            if Const.BACKWARD in self.scope[0] and not self.backward_input_list:
                raise ValueError("backward_input must be configured when scope contains 'backward'")
            if Const.BACKWARD in self.scope[0]:
                self.is_forward_acl_dump = False
                # Rewrite each backward scope to its forward name and record
                # the backward input configured for it.
                for index, scope_spec in enumerate(self.scope):
                    self.scope[index] = scope_spec.replace(Const.BACKWARD, Const.FORWARD)
                    self.backward_input[self.scope[index]] = self.backward_input_list[index]
        seed_all(self.seed, self.is_deterministic)

    def check_kwargs(self):
        """Validate task, level and dump path; raise Exception when invalid."""
        if self.task and self.task not in Const.TASK_LIST:
            raise Exception("task is invalid")
        if self.level and self.level not in Const.LEVEL_LIST:
            raise Exception("level is invalid")
        if not self.dump_path:
            raise Exception("Invalid dump path, please check your config")

    def check(self):
        """Run all validations; return True when none of them raises."""
        self.check_kwargs()
        self._check_rank()
        self._check_step()
        return True

    def check_model(self, model):
        """Raise when level is "L0" or "mix" and no model was supplied."""
        if self.level in ["L0", "mix"] and not model:
            raise Exception(
                f"For level {self.level}, PrecisionDebugger must receive a model argument."
            )

    def _check_rank(self):
        """Require every configured rank to be a non-negative int, then warn once."""
        if not self.rank:
            return
        for rank_id in self.rank:
            if not isinstance(rank_id, int) or rank_id < 0:
                raise ValueError(f"rank {self.rank} must be an integer and greater than or equal to 0.")
        logger.warning_on_rank_0(f"Rank argument is provided. Only rank {self.rank} data will be dumpped.")

    def _check_step(self):
        """Require every configured step to be a non-negative int."""
        if self.step:
            for s in self.step:
                if not isinstance(s, int) or s < 0:
                    raise ValueError(f"step element {s} must be an integer and greater than or equal to 0.")
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from torch.utils.data import dataloader
|
|
3
|
+
from msprobe.pytorch.debugger.debugger_config import DebuggerConfig
|
|
4
|
+
from msprobe.pytorch.service import Service
|
|
5
|
+
from msprobe.pytorch.common.log import logger
|
|
6
|
+
from msprobe.pytorch.pt_config import parse_json_config
|
|
7
|
+
from msprobe.core.common.exceptions import MsaccException
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PrecisionDebugger:
|
|
11
|
+
_instance = None
|
|
12
|
+
|
|
13
|
+
def __new__(cls, *args, **kwargs):
|
|
14
|
+
if cls._instance is None:
|
|
15
|
+
cls._instance = super(PrecisionDebugger, cls).__new__(cls)
|
|
16
|
+
cls._instance.config = None
|
|
17
|
+
cls._instance.enable_dataloader = False
|
|
18
|
+
return cls._instance
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
config_path=None,
|
|
23
|
+
task=None,
|
|
24
|
+
dump_path=None,
|
|
25
|
+
level=None,
|
|
26
|
+
model=None,
|
|
27
|
+
step=None,
|
|
28
|
+
):
|
|
29
|
+
if not hasattr(self, "initialized"):
|
|
30
|
+
self.initialized = True
|
|
31
|
+
self.model = self.check_model_valid(model)
|
|
32
|
+
common_config, task_config = parse_json_config(config_path, task)
|
|
33
|
+
if step:
|
|
34
|
+
common_config.step = step
|
|
35
|
+
self.config = DebuggerConfig(
|
|
36
|
+
common_config, task_config, task, dump_path, level
|
|
37
|
+
)
|
|
38
|
+
self.config.check_model(self.model)
|
|
39
|
+
self.service = Service(self.config)
|
|
40
|
+
self.enable_dataloader = self.config.enable_dataloader
|
|
41
|
+
if self.enable_dataloader:
|
|
42
|
+
logger.warning_on_rank_0("The enable_dataloader feature will be deprecated in the future.")
|
|
43
|
+
dataloader._BaseDataLoaderIter.__next__ = iter_tracer(dataloader._BaseDataLoaderIter.__next__)
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def instance(self):
|
|
47
|
+
return self._instance
|
|
48
|
+
|
|
49
|
+
@staticmethod
|
|
50
|
+
def check_model_valid(model):
|
|
51
|
+
if not model or isinstance(model, torch.nn.Module):
|
|
52
|
+
return model
|
|
53
|
+
raise MsaccException(
|
|
54
|
+
MsaccException.INVALID_PARAM_ERROR, "model 参数必须是torch.nn.Module类型。"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
@classmethod
|
|
58
|
+
def start(cls):
|
|
59
|
+
instance = cls._instance
|
|
60
|
+
if not instance:
|
|
61
|
+
raise Exception("No instance of PrecisionDebugger found.")
|
|
62
|
+
if instance.enable_dataloader:
|
|
63
|
+
logger.warning_on_rank_0("DataLoader is enabled, start() skipped.")
|
|
64
|
+
else:
|
|
65
|
+
instance.service.start(instance.model)
|
|
66
|
+
|
|
67
|
+
@classmethod
|
|
68
|
+
def stop(cls):
|
|
69
|
+
instance = cls._instance
|
|
70
|
+
if not instance:
|
|
71
|
+
raise Exception("PrecisionDebugger instance is not created.")
|
|
72
|
+
if instance.enable_dataloader:
|
|
73
|
+
logger.warning_on_rank_0("DataLoader is enabled, stop() skipped.")
|
|
74
|
+
else:
|
|
75
|
+
instance.service.stop()
|
|
76
|
+
|
|
77
|
+
@classmethod
def step(cls):
    """Forward a step notification to the debugger instance's service.

    Raises:
        Exception: if ``cls._instance`` is not set.
    """
    if cls._instance:
        cls._instance.service.step()
        return
    raise Exception("PrecisionDebugger instance is not created.")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def iter_tracer(func):
    """Wrap ``func`` so that each call is bracketed by debugger stop/step/start.

    Args:
        func: the callable to wrap; it is invoked with the wrapper's own
            positional and keyword arguments.

    Returns:
        A wrapper that returns whatever ``func`` returns.
    """
    def func_wrapper(*args, **kwargs):
        # ``instance`` is declared as a property, so evaluating
        # ``PrecisionDebugger.instance`` on the class returns the property
        # object rather than the debugger; read the class attribute directly.
        debugger_instance = PrecisionDebugger._instance
        # start()/stop() are skipped while enable_dataloader is set, so clear
        # the flag for the duration of the wrapper and restore it afterwards.
        debugger_instance.enable_dataloader = False
        # NOTE(review): stop() and step() are both taken to belong to this
        # branch; the reviewed text had lost its indentation -- confirm.
        if not debugger_instance.service.first_start:
            debugger_instance.stop()
            debugger_instance.step()
        result = func(*args, **kwargs)
        debugger_instance.start()
        debugger_instance.enable_dataloader = True
        return result
    return func_wrapper
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# 精度预检工具
|
|
2
|
+
|
|
3
|
+
1. 预检工具在dump和run_ut的过程中,是否需要同时开启或关闭jit编译(jit_compile)?
|
|
4
|
+
|
|
5
|
+
答:是。
|
|
6
|
+
|
|
7
|
+
2. 预检工具对于type_as这类涉及数据类型转换操作的API,是否具有参考性?
|
|
8
|
+
|
|
9
|
+
答:由于这类API在CPU侧存在精度先提升后下降的操作,因此这类API的预检结果参考价值有限。
|
|
10
|
+
|
|
11
|
+
3. run ut过程中出现报错:ERROR:Got unsupported ScalarType BFloat16
|
|
12
|
+
|
|
13
|
+
答:请使用最新版本的工具。
|
|
14
|
+
|
|
15
|
+
4. Dropout算子,CPU和NPU的随机应该不一样,为什么结果比对是一致的?
|
|
16
|
+
|
|
17
|
+
答:这个结果是正常的,工具对该算子有特殊处理,只判定输出中值为0的元素所占比例是否与设定的p值大致相当。
|
|
18
|
+
|
|
19
|
+
5. 为什么浮点型数据bench和CPU的dtype不一致?
|
|
20
|
+
|
|
21
|
+
答:对于fp16的数据,CPU会上升一个精度fp32去计算,这是和算子那边对齐的精度结论,CPU用更高精度去计算会更接近真实值。
|
|
22
|
+
|
|
23
|
+
6. 添加预检工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。
|
|
24
|
+
|
|
25
|
+
答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。
|
|
26
|
+
|
|
27
|
+
7. 添加预检工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。
|
|
28
|
+
|
|
29
|
+
答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。
|
|
30
|
+
|
|
31
|
+
8. 添加预检工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。
|
|
32
|
+
|
|
33
|
+
答:注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。
|
|
34
|
+
|
|
35
|
+
9. Tensor 魔法函数具体对应什么操作?
|
|
36
|
+
|
|
37
|
+
答:
|
|
38
|
+
|
|
39
|
+
| Tensor魔法函数 | 具体操作 |
|
|
40
|
+
| --------------- | ---------------- |
|
|
41
|
+
| `__add__` | + |
|
|
42
|
+
| `__and__` | & |
|
|
43
|
+
| `__bool__` | 返回Tensor布尔值 |
|
|
44
|
+
| `__div__` | / |
|
|
45
|
+
| `__eq__` | == |
|
|
46
|
+
| `__ge__` | >= |
|
|
47
|
+
| `__gt__` | > |
|
|
48
|
+
| `__iadd__` | += |
|
|
49
|
+
| `__iand__` | &= |
|
|
50
|
+
| `__idiv__` | /= |
|
|
51
|
+
| `__ifloordiv__` | //= |
|
|
52
|
+
| `__ilshift__` | <<= |
|
|
53
|
+
| `__imod__` | %= |
|
|
54
|
+
| `__imul__` | *= |
|
|
55
|
+
| `__ior__` | \|= |
|
|
56
|
+
| `__irshift__` | >>= |
|
|
57
|
+
| `__isub__` | -= |
|
|
58
|
+
| `__ixor__` | ^= |
|
|
59
|
+
| `__lshift__` | << |
|
|
60
|
+
| `__matmul__` | 矩阵乘法 |
|
|
61
|
+
| `__mod__` | % |
|
|
62
|
+
| `__mul__` | * |
|
|
63
|
+
| `__nonzero__` | 同`__bool__` |
|
|
64
|
+
| `__or__` | \| |
|
|
65
|
+
| `__radd__` | +(反向) |
|
|
66
|
+
| `__rmul__` | *(反向) |
|
|
67
|
+
| `__rshift__` | >> |
|
|
68
|
+
| `__sub__` | - |
|
|
69
|
+
| `__truediv__` | 同`__div__` |
|
|
70
|
+
| `__xor__` | ^ |
|
|
71
|
+
|
|
72
|
+
# 精度比对工具
|
|
73
|
+
|
|
74
|
+
## 工具使用
|
|
75
|
+
|
|
76
|
+
### dump指定融合算子
|
|
77
|
+
|
|
78
|
+
dump指定操作当前支持dump指定融合算子的输入输出,需要在mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
def npu_forward_fused_softmax(self, input_, mask):
|
|
82
|
+
resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False)
|
|
83
|
+
return resl
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,在support_wrap_ops.yaml中的torch_npu: 下自行添加该融合算子即可:
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
- npu_scaled_masked_softmax
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
(npu_scaled_masked_softmax融合算子工具已支持dump,本例仅供参考)
|
|
93
|
+
|
|
94
|
+
## 常见问题
|
|
95
|
+
|
|
96
|
+
### 1. 在同一个目录多次执行dump会冲突吗?
|
|
97
|
+
|
|
98
|
+
会,同一个目录多次dump,会覆盖上一次结果,可以使用dump_path参数修改dump目录。
|
|
99
|
+
|
|
100
|
+
### 2. 如何dump算子级的数据?
|
|
101
|
+
|
|
102
|
+
需要配置level为L2模式。
|
|
103
|
+
|
|
104
|
+
### 3. 工具比对发现NPU和标杆数据的API无法完全对齐?
|
|
105
|
+
|
|
106
|
+
torch版本和硬件差异属于正常情况。
|
|
107
|
+
|
|
108
|
+
## 异常情况
|
|
109
|
+
|
|
110
|
+
### 2. HCCL 报错: error code: EI0006
|
|
111
|
+
|
|
112
|
+
**故障现象**
|
|
113
|
+
|
|
114
|
+
使用msprobe工具时,报错: error code: EI0006。
|
|
115
|
+
|
|
116
|
+
**故障原因**
|
|
117
|
+
|
|
118
|
+
CANN软件版本较低导致不兼容。
|
|
119
|
+
|
|
120
|
+
**故障处理**
|
|
121
|
+
|
|
122
|
+
升级新版CANN软件版本。
|
|
123
|
+
|
|
124
|
+
### 3. torch_npu._C._clear_overflow_npu() RuntimeError NPU error,error code is 107002
|
|
125
|
+
|
|
126
|
+
如果运行溢出检测功能遇到这个报错,采取以下解决方法:
|
|
127
|
+
如果是单卡运行,添加如下代码,0是卡号,选择自己空闲的卡号。
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
torch.npu.set_device('npu:0')
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
如果多卡运行,请在代码中修改对应卡号,比如进程使用卡号为{rank}时可以添加如下代码:
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
torch.npu.set_device(f'npu:{rank}')
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
如果运行精度比对功能遇到这个报错,尝试安装最新版本的msprobe。
|
|
140
|
+
|
|
141
|
+
### 4. dump得到的VF_lstm_99_forward_input.1.0.npy、VF_lstm_99_forward_input.1.1.npy类似的数据是否正常?
|
|
142
|
+
|
|
143
|
+
带1.0/1.1/1.2后缀的npy是正常现象,例如当输入数据为[[tensor1, tensor2, tensor3]]会生成这样的后缀。
|
|
144
|
+
|
|
145
|
+
### 5. 进行compare报错:The current file contains stack information, please turn on the stack_mode
|
|
146
|
+
|
|
147
|
+
在比对脚本中,设置stack_mode=True,例如:
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
from msprobe.pytorch import compare
|
|
151
|
+
dump_result_param={
|
|
152
|
+
"npu_json_path": "./npu_dump/dump.json",
|
|
153
|
+
"bench_json_path": "./gpu_dump/dump.json",
|
|
154
|
+
"stack_json_path": "./npu_dump/stack.json",
|
|
155
|
+
"is_print_compare_log": True
|
|
156
|
+
}
|
|
157
|
+
compare(dump_result_param, output_path="./output", stack_mode=True)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### 6. dump指定反向API的kernel级别的数据报错:NameError: name 'torch_npu' is not defined
|
|
161
|
+
|
|
162
|
+
- 如果是npu环境,请安装torch_npu;
|
|
163
|
+
- 如果是gpu环境,暂不支持dump指定API的kernel级别的数据
|
|
164
|
+
|
|
165
|
+
### 7. 配置dump_path后,使用工具报错:[ERROR]The file path /home/xxx/dump contains special characters
|
|
166
|
+
|
|
167
|
+
- 请检查你设置的dump绝对路径是否包含特殊字符,确保路径名只包含大小写字母、数字、下划线、斜杠、点和短横线
|
|
168
|
+
- 注意,如果执行脚本的路径为/home/abc++/,设置的dump_path="./dump",工具实际校验的路径为绝对路径/home/abc++/dump,++为特殊字符,会引发本条报错
|
|
169
|
+
|
|
170
|
+
### 8. 无法dump matmul权重的反向梯度数据
|
|
171
|
+
|
|
172
|
+
- matmul期望的输入是二维,当输入不是二维时,会将输入通过view操作展成二维,再进行matmul运算,因此在反向求导时,backward_hook能拿到的是UnsafeViewBackward这步操作里面数据的梯度信息,取不到MmBackward这步操作里面数据的梯度信息,即权重的反向梯度数据。
|
|
173
|
+
- 典型的例子有,当linear的输入不是二维,且无bias时,会调用output = input.matmul(weight.t()),因此拿不到linear层的weight的反向梯度数据。
|
|
174
|
+
|
|
175
|
+
### 9. dump.json文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32
|
|
176
|
+
|
|
177
|
+
- msprobe工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以dump.json文件为准。
|
|
178
|
+
|
|
179
|
+
### 10. 使用dataloader后raise异常Exception("msprobe: exit after iteration {}".format(max(self.config.step)))
|
|
180
|
+
|
|
181
|
+
- 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。
|
|
182
|
+
|
|
183
|
+
### 11. 添加msprobe工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。
|
|
184
|
+
|
|
185
|
+
- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。
|
|
186
|
+
|
|
187
|
+
### 12. 添加msprobe工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。
|
|
188
|
+
|
|
189
|
+
- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中functional:下的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。
|
|
190
|
+
|
|
191
|
+
### 13. 添加msprobe工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。
|
|
192
|
+
|
|
193
|
+
- 注释工具目录mstt/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。
|