mindstudio-probe 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/METADATA +5 -5
- mindstudio_probe-1.1.1.dist-info/RECORD +341 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/WHEEL +1 -1
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/entry_points.txt +0 -1
- msprobe/README.md +39 -3
- msprobe/config.json +1 -3
- msprobe/core/advisor/advisor.py +8 -3
- msprobe/core/common/const.py +113 -13
- msprobe/core/common/exceptions.py +25 -3
- msprobe/core/common/file_utils.py +150 -26
- msprobe/core/common/inplace_op_checker.py +15 -0
- msprobe/core/common/log.py +27 -9
- msprobe/core/common/utils.py +182 -69
- msprobe/core/common_config.py +44 -15
- msprobe/core/compare/acc_compare.py +207 -142
- msprobe/core/compare/check.py +2 -5
- msprobe/core/compare/compare_cli.py +21 -4
- msprobe/core/compare/highlight.py +124 -55
- msprobe/core/compare/layer_mapping/__init__.py +19 -0
- msprobe/core/compare/layer_mapping/data_scope_parser.py +235 -0
- msprobe/core/compare/layer_mapping/layer_mapping.py +242 -0
- msprobe/core/compare/layer_mapping/postprocess_pass.py +94 -0
- msprobe/core/compare/npy_compare.py +52 -23
- msprobe/core/compare/utils.py +272 -247
- msprobe/core/data_dump/data_collector.py +13 -11
- msprobe/core/data_dump/data_processor/base.py +46 -16
- msprobe/core/data_dump/data_processor/mindspore_processor.py +4 -4
- msprobe/core/data_dump/data_processor/pytorch_processor.py +156 -59
- msprobe/core/data_dump/scope.py +113 -34
- msprobe/core/grad_probe/constant.py +27 -13
- msprobe/core/grad_probe/grad_compare.py +18 -1
- msprobe/core/grad_probe/utils.py +30 -2
- msprobe/core/overflow_check/abnormal_scene.py +185 -0
- msprobe/core/overflow_check/api_info.py +55 -0
- msprobe/core/overflow_check/checker.py +138 -0
- msprobe/core/overflow_check/filter.py +157 -0
- msprobe/core/overflow_check/ignore_rules.yaml +55 -0
- msprobe/core/overflow_check/level.py +22 -0
- msprobe/core/overflow_check/utils.py +28 -0
- msprobe/docs/01.installation.md +10 -0
- msprobe/docs/02.config_introduction.md +49 -22
- msprobe/docs/03.config_examples.md +2 -9
- msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
- msprobe/docs/05.data_dump_PyTorch.md +3 -1
- msprobe/docs/06.data_dump_MindSpore.md +157 -90
- msprobe/docs/07.accuracy_checker_PyTorch.md +12 -12
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
- msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
- msprobe/docs/10.accuracy_compare_PyTorch.md +19 -13
- msprobe/docs/11.accuracy_compare_MindSpore.md +104 -13
- msprobe/docs/12.overflow_check_PyTorch.md +1 -1
- msprobe/docs/13.overflow_check_MindSpore.md +6 -6
- msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
- msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
- msprobe/docs/17.grad_probe.md +5 -6
- msprobe/docs/19.monitor.md +468 -0
- msprobe/docs/20.monitor_performance_baseline.md +52 -0
- msprobe/docs/21.visualization_PyTorch.md +386 -0
- msprobe/docs/22.visualization_MindSpore.md +384 -0
- msprobe/docs/23.tool_function_introduction.md +28 -0
- msprobe/docs/FAQ.md +3 -0
- msprobe/docs/data_dump_Mindspore/dynamic_graph_quick_start_example.md +211 -0
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/monitor/cpu_info.png +0 -0
- msprobe/mindspore/__init__.py +15 -0
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +113 -145
- msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
- msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
- msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +63 -1
- msprobe/mindspore/api_accuracy_checker/compute_element.py +59 -24
- msprobe/mindspore/api_accuracy_checker/data_manager.py +264 -0
- msprobe/mindspore/api_accuracy_checker/main.py +27 -3
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +206 -0
- msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +58 -0
- msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
- msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
- msprobe/mindspore/cell_processor.py +33 -12
- msprobe/mindspore/common/const.py +33 -13
- msprobe/mindspore/common/log.py +5 -9
- msprobe/mindspore/common/utils.py +43 -4
- msprobe/mindspore/compare/distributed_compare.py +22 -22
- msprobe/mindspore/compare/ms_compare.py +271 -248
- msprobe/mindspore/compare/ms_graph_compare.py +81 -47
- msprobe/mindspore/debugger/debugger_config.py +4 -1
- msprobe/mindspore/debugger/precision_debugger.py +7 -1
- msprobe/mindspore/dump/dump_tool_factory.py +3 -1
- msprobe/mindspore/dump/hook_cell/api_registry.py +12 -2
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +13 -16
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +25 -0
- msprobe/mindspore/dump/jit_dump.py +17 -5
- msprobe/mindspore/dump/kernel_graph_dump.py +2 -4
- msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +145 -39
- msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
- msprobe/mindspore/free_benchmark/common/utils.py +19 -4
- msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
- msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
- msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
- msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
- msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
- msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
- msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +4 -4
- msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
- msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
- msprobe/mindspore/grad_probe/global_context.py +28 -8
- msprobe/mindspore/grad_probe/grad_analyzer.py +27 -13
- msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
- msprobe/mindspore/grad_probe/hook.py +24 -10
- msprobe/mindspore/grad_probe/utils.py +18 -5
- msprobe/mindspore/ms_config.py +22 -15
- msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +2 -4
- msprobe/mindspore/runtime.py +15 -0
- msprobe/mindspore/service.py +36 -30
- msprobe/mindspore/task_handler_factory.py +15 -0
- msprobe/msprobe.py +24 -7
- msprobe/pytorch/__init__.py +3 -2
- msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -4
- msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +454 -0
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +6 -1
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +19 -14
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +13 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +77 -53
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +15 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
- msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
- msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +100 -6
- msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
- msprobe/pytorch/bench_functions/swiglu.py +10 -2
- msprobe/pytorch/common/parse_json.py +6 -6
- msprobe/pytorch/common/utils.py +56 -5
- msprobe/pytorch/compare/distributed_compare.py +8 -9
- msprobe/pytorch/compare/pt_compare.py +8 -6
- msprobe/pytorch/debugger/debugger_config.py +19 -15
- msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
- msprobe/pytorch/free_benchmark/common/constant.py +15 -0
- msprobe/pytorch/free_benchmark/common/counter.py +15 -0
- msprobe/pytorch/free_benchmark/common/enums.py +15 -0
- msprobe/pytorch/free_benchmark/common/params.py +8 -1
- msprobe/pytorch/free_benchmark/common/utils.py +26 -4
- msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -3
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
- msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +10 -0
- msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
- msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
- msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +1 -0
- msprobe/pytorch/hook_module/wrap_functional.py +14 -12
- msprobe/pytorch/module_processer.py +2 -5
- msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
- msprobe/pytorch/monitor/anomaly_detect.py +340 -0
- msprobe/pytorch/monitor/distributed/__init__.py +0 -0
- msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
- msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +272 -0
- msprobe/pytorch/monitor/features.py +108 -0
- msprobe/pytorch/monitor/module_hook.py +870 -0
- msprobe/pytorch/monitor/module_metric.py +193 -0
- msprobe/pytorch/monitor/module_spec_verifier.py +93 -0
- msprobe/pytorch/monitor/optimizer_collect.py +295 -0
- msprobe/pytorch/monitor/unittest/__init__.py +0 -0
- msprobe/pytorch/monitor/unittest/test_monitor.py +145 -0
- msprobe/pytorch/monitor/utils.py +250 -0
- msprobe/pytorch/monitor/visualizer.py +59 -0
- msprobe/pytorch/online_dispatch/__init__.py +2 -3
- msprobe/pytorch/online_dispatch/compare.py +29 -38
- msprobe/pytorch/online_dispatch/dispatch.py +50 -25
- msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
- msprobe/pytorch/online_dispatch/single_compare.py +53 -32
- msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
- msprobe/pytorch/online_dispatch/utils.py +49 -21
- msprobe/pytorch/parse_tool/lib/compare.py +12 -18
- msprobe/pytorch/parse_tool/lib/config.py +1 -1
- msprobe/pytorch/parse_tool/lib/parse_tool.py +1 -2
- msprobe/pytorch/parse_tool/lib/utils.py +16 -35
- msprobe/pytorch/parse_tool/lib/visualization.py +2 -0
- msprobe/pytorch/pt_config.py +31 -8
- msprobe/pytorch/service.py +15 -5
- msprobe/visualization/__init__.py +14 -0
- msprobe/visualization/builder/__init__.py +14 -0
- msprobe/visualization/builder/graph_builder.py +165 -0
- msprobe/visualization/builder/msprobe_adapter.py +205 -0
- msprobe/visualization/compare/__init__.py +14 -0
- msprobe/visualization/compare/graph_comparator.py +130 -0
- msprobe/visualization/compare/mode_adapter.py +211 -0
- msprobe/visualization/graph/__init__.py +14 -0
- msprobe/visualization/graph/base_node.py +124 -0
- msprobe/visualization/graph/graph.py +200 -0
- msprobe/visualization/graph/node_colors.py +95 -0
- msprobe/visualization/graph/node_op.py +39 -0
- msprobe/visualization/graph_service.py +214 -0
- msprobe/visualization/utils.py +232 -0
- mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
- msprobe/docs/04.acl_config_examples.md +0 -78
- msprobe/mindspore/compare/layer_mapping.py +0 -146
- msprobe/mindspore/compare/modify_mapping.py +0 -107
- msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
- msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.1.1.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore/free_benchmark/decorator → pytorch/monitor}/__init__.py +0 -0
msprobe/core/overflow_check/checker.py
ADDED
@@ -0,0 +1,138 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Any
+
+from msprobe.core.common.const import Const
+
+from msprobe.core.overflow_check.abnormal_scene import InputAnomalyOutputNormalScene, InputAnomalyOutputAnomalyScene, \
+    InputNormalOutputAnomalyScene, NumericalMutationScene, AnomalyScene
+from msprobe.core.overflow_check.api_info import APIInfo
+from msprobe.core.overflow_check.filter import IgnoreFilter
+from msprobe.core.overflow_check.level import OverflowLevel
+
+
+class StatisticsFields:
+    """Constant names of the statistics fields."""
+    CRITICAL_APIS = 'critical_apis'
+    HIGH_PRIORITY_APIS = 'high_priority_apis'
+    MEDIUM_PRIORITY_APIS = 'medium_priority_apis'
+    ANOMALY_DETAILS = 'anomaly_details'
+
+    # All fields
+    ALL_FIELDS = [CRITICAL_APIS, HIGH_PRIORITY_APIS, MEDIUM_PRIORITY_APIS, ANOMALY_DETAILS]
+
+
+class AnomalyDetector:
+    """Anomaly detector."""
+
+    def __init__(self, dump_data: Dict):
+        """
+        Initialize the detector and store dump_data.
+        Args:
+            dump_data: data in the following format
+            {
+                "api/module": {statistics}
+            }
+        """
+        self.dump_data = dump_data
+        self.ignore_filter = IgnoreFilter()
+        self.scene_types = [
+            InputNormalOutputAnomalyScene,   # normal input, anomalous output
+            InputAnomalyOutputAnomalyScene,  # anomalous input, anomalous output
+            InputAnomalyOutputNormalScene,   # anomalous input, normal output
+            NumericalMutationScene           # output mutates sharply relative to the input
+        ]
+        self.anomaly_scenes: Dict[str, AnomalyScene] = dict()
+
+    @staticmethod
+    def _create_api_info(api_name: str, data: Dict) -> APIInfo:
+        """Create an APIInfo instance from the raw data."""
+        return APIInfo(
+            api_name=api_name,
+            input_args=data.get(Const.INPUT_ARGS, []),
+            input_kwargs=data.get(Const.INPUT_KWARGS, {}),
+            output_data=data.get(Const.OUTPUT, [])
+        )
+
+    def get_statistics(self) -> Dict[str, List]:
+        """Collect the statistics.
+
+        Field names are managed through the StatisticsFields class to avoid hard-coded strings.
+
+        Returns:
+            Dict[str, List]: a dict containing the API lists per priority and the anomaly details
+        """
+        stats = {field: [] for field in StatisticsFields.ALL_FIELDS}
+
+        # Mapping from overflow rank to the result key
+        rank_to_key = {
+            OverflowLevel.CRITICAL: StatisticsFields.CRITICAL_APIS,
+            OverflowLevel.HIGH: StatisticsFields.HIGH_PRIORITY_APIS,
+            OverflowLevel.MEDIUM: StatisticsFields.MEDIUM_PRIORITY_APIS
+        }
+
+        for scene in self.anomaly_scenes.values():
+            stats[StatisticsFields.ANOMALY_DETAILS].append(scene.get_details())
+            # Classify the API according to its rank
+            key = rank_to_key.get(scene.rank, None)
+            if key:
+                stats[key].append(scene.api_name)
+
+        return stats
+
+    def analyze(self):
+        """
+        Analyze the dumped call data against the anomaly scenes.
+        Returns:
+            The detector itself; if no filtering is needed, calling analyze alone is enough.
+        """
+        # Iterate over the data items
+        for api_name, data in self.dump_data.items():
+            api_info = self._create_api_info(api_name, data)
+
+            # Every scene type is checked and more than one may match; the principles are:
+            # - take the highest severity
+            # - priority ordering: data anomalies are checked last
+            for scene_type in self.scene_types:
+                scene = scene_type(api_info)
+                if hasattr(scene, 'matches') and scene.matches():
+                    self.anomaly_scenes[api_name] = scene
+                    break  # stop at the first match (highest severity wins)
+        return self
+
+    def filter(self):
+        """
+        Filter out false positives.
+        Returns:
+            The checker itself, to allow chained calls.
+        """
+        result = dict()
+        for api_name, scene in self.anomaly_scenes.items():
+            if self.ignore_filter.apply_filter(scene.api_data):
+                continue
+            result[api_name] = scene
+        self.anomaly_scenes = result
+        return self
+
+    def overflow_result(self) -> Dict[str, AnomalyScene]:
+        return self.anomaly_scenes
+
+    def has_overflow(self, api_name: str) -> bool:
+        return api_name in self.anomaly_scenes.keys()
+
+    def get_overflow_level(self, api_name: str) -> Optional[Any]:
+        scene = self.anomaly_scenes.get(api_name, None)
+        return scene.rank if scene else None
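For orientation, a minimal usage sketch of the new detector follows. It is a sketch only: how the per-API statistics are parsed out of a dump.json into `dump_data` is an assumption based on the code above, not a documented workflow, and `summarize_overflows` is a hypothetical helper name.

```python
# Hypothetical driver for the new AnomalyDetector; the dump_data layout is assumed to be
# {"api_or_module_name": {"input_args": [...], "input_kwargs": {...}, "output": [...]}}.
from msprobe.core.overflow_check.checker import AnomalyDetector


def summarize_overflows(dump_data: dict) -> dict:
    # analyze() classifies every call into an anomaly scene, filter() drops known
    # false positives (e.g. uninitialized torch.empty outputs), and get_statistics()
    # groups the remaining APIs by overflow level plus per-scene details.
    detector = AnomalyDetector(dump_data).analyze().filter()
    return detector.get_statistics()
```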
msprobe/core/overflow_check/filter.py
ADDED
@@ -0,0 +1,157 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os.path
+from dataclasses import dataclass, field
+from typing import Set
+
+from msprobe.core.common.file_utils import load_yaml
+from msprobe.core.overflow_check.api_info import APIInfo
+from msprobe.core.overflow_check.utils import has_nan_inf
+
+cur_path = os.path.dirname(os.path.realpath(__file__))
+
+
+class IgnoreFilter:
+    def __init__(self, rule_path=os.path.join(cur_path, './ignore_rules.yaml')):
+        self.rules = dict()
+        self._load_rules(rule_path)
+
+    def has_api_rule(self, api_name: str) -> bool:
+        return api_name in self.rules.keys()
+
+    def apply_filter(self, api_info: APIInfo) -> bool:
+        """
+        Apply the filter rules and return whether the entry should be filtered out.
+        Args:
+            api_info: information about the API call
+        Returns:
+            Whether the detection is a false positive, i.e. whether it should be filtered out
+        """
+        torch_api = api_info.torch_api_name
+        if not self.has_api_rule(torch_api):
+            return False
+        rule = self.rules.get(torch_api)
+        if not rule.match(api_info):
+            return False
+        return True
+
+    def _load_rules(self, rule_file_path):
+        if self.rules and len(self.rules):
+            return
+        data = load_yaml(rule_file_path)
+        self.rules = dict()
+        for rule_item in data.get('ignore_nan_inf', []):
+            rule = Rule(
+                api_name=rule_item.get('api_name', ''),
+                desc=rule_item.get('description', ''),
+                input_ignore=rule_item.get('input_ignore', []),
+                output_ignore=rule_item.get('output_ignore', [])
+            )
+            if not rule.verify_field():
+                continue
+            if self.has_api_rule(rule.api_name):
+                continue
+            self.rules[rule.api_name] = rule
+
+
+class Rule:
+
+    def __init__(self, api_name, desc='', input_ignore=None, output_ignore=None):
+        self.api_name = api_name
+        self.desc = desc
+        self.input_ignore = IgnoreItem()
+        self.output_ignore = IgnoreItem()
+        self._init_ignore(input_ignore, output_ignore)
+
+    def __repr__(self):
+        return (f'Rule(api_name={self.api_name}, desc={self.desc}, input_ignore={self.input_ignore}, output_ignore='
+                f'{self.output_ignore})')
+
+    def verify_field(self):
+        if self.api_name == '':
+            return False
+        # A rule without any input/output ignore entries is invalid
+        if not (len(self.input_ignore.index) + len(self.input_ignore.name) + len(self.output_ignore.index)):
+            return False
+        return True
+
+    def match(self, api_info: APIInfo) -> bool:
+        """
+        Check whether the API information matches this rule.
+        Returns:
+            bool: True if the api_info matches this rule, False otherwise
+        """
+        # First check that the API name matches
+        api_name = api_info.torch_api_name
+        if api_name != self.api_name:
+            return False
+
+        # Check the positional input arguments for NaN/Inf
+        if self.input_ignore.index and len(api_info.input_args):
+            for idx, arg in enumerate(api_info.input_args):
+                if has_nan_inf(arg) and not self.input_ignore.has_index(idx):
+                    return False
+
+        # Check the keyword input arguments for NaN/Inf
+        if self.input_ignore.name and len(api_info.input_kwargs):
+            for name, value in api_info.input_kwargs.items():
+                if has_nan_inf(value) and not self.input_ignore.has_name(name):
+                    return False
+
+        # Check the outputs for NaN/Inf
+        if self.output_ignore.index and len(api_info.output_data):
+            for idx, out in enumerate(api_info.output_data):
+                if has_nan_inf(out) and not self.output_ignore.has_index(idx):
+                    return False
+
+        return True
+
+    def _init_ignore(self, input_ignore=None, output_ignore=None):
+        """Initialize the ignore entries."""
+        if input_ignore is None:
+            input_ignore = []
+        if output_ignore is None:
+            output_ignore = []
+
+        # Process the input ignore rules
+        for item in input_ignore:
+            if 'index' in item:
+                self.input_ignore.add_index(item['index'])
+            if 'name' in item:
+                self.input_ignore.add_name(item['name'])
+
+        # Process the output ignore rules
+        for item in output_ignore:
+            if 'index' in item:
+                self.output_ignore.add_index(item['index'])
+
+
+@dataclass
+class IgnoreItem:
+    """Indices and names that should be ignored."""
+    index: Set[int] = field(default_factory=set)
+    name: Set[str] = field(default_factory=set)
+
+    def add_index(self, idx: int):
+        self.index.add(idx)
+
+    def add_name(self, name: str):
+        self.name.add(name)
+
+    def has_index(self, idx: int) -> bool:
+        return idx in self.index
+
+    def has_name(self, name: str) -> bool:
+        return name in self.name
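A small sketch of how one `ignore_rules.yaml` entry maps onto a `Rule` object via `_load_rules` above. The values are illustrative; only the `torch.empty` name and the index-0 output-ignore are taken from the shipped rule file, the description text is made up.

```python
# Illustrative only: build the Rule that an ignore_nan_inf entry for torch.empty would produce.
from msprobe.core.overflow_check.filter import Rule

rule = Rule(
    api_name="torch.empty",
    desc="uninitialized output may legitimately contain NaN/Inf",  # example wording
    output_ignore=[{"index": 0}],   # ignore NaN/Inf found in output tensor 0
)
assert rule.verify_field()          # a rule needs a name and at least one ignore entry
```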
msprobe/core/overflow_check/ignore_rules.yaml
ADDED
@@ -0,0 +1,55 @@
+ignore_nan_inf:
+  # Create an uninitialized memory
+  - api_name: "torch.empty"
+    description: "Creates a tensor with uninitialized data. The values may contain NaN or Inf because the memory is not cleared or set to zero."
+    output_ignore:
+      - index: 0
+
+  - api_name: "torch.empty_like"
+    description: "Creates an uninitialized tensor with the same size, dtype, and device as the input tensor. The values may contain NaN or Inf due to uninitialized memory."
+    output_ignore:
+      - index: 0
+
+  - api_name: "torch.empty_strided"
+    description: "Creates a tensor with uninitialized data using specified strides. NaN or Inf may be present due to uninitialized memory."
+    output_ignore:
+      - index: 0
+
+  # Distributed func
+  - api_name: "distributed.recv"
+    description: "Receives a tensor from another process. The input tensor may contain uninitialized data before the recv call, but it will be overwritten with received data."
+    input_ignore:
+      - index: 0 # tensor (the input buffer, which may be uninitialized before receiving)
+      - name: tensor
+
+  - api_name: "distributed.all_gather"
+    description: "Gathers tensors from all processes and distributes them to each process. The tensors in tensor_list may contain uninitialized data before the all_gather call, but they will be overwritten with collected data from all processes."
+    input_ignore:
+      - index: 0 # tensor_list (the input list of tensors, which may contain uninitialized data before the all_gather call)
+
+  - api_name: "distributed.reduce_scatter"
+    description: "Combines reduction and scatter operations. The output tensor may contain uninitialized data before the reduce_scatter call, but it will be overwritten with the reduced and scattered data from all processes."
+    input_ignore:
+      - index: 0
+      - name: output
+
+  - api_name: "distributed._reduce_scatter_base"
+    description: "Performs a combined reduction and scatter operation using a single input tensor. The output tensor may contain uninitialized data before the _reduce_scatter_base call, but it will be overwritten with the reduced and scattered data."
+    input_ignore:
+      - index: 0
+
+  - api_name: "distributed.all_gather_into_tensor"
+    description: "Gathers tensors from all processes into a single output tensor. The output tensor may contain uninitialized data before the all_gather_into_tensor call, but it will be overwritten with collected data from all processes."
+    input_ignore:
+      - index: 0
+
+  - api_name: "distributed.reduce_scatter_tensor"
+    description: "Performs a reduction operation across all processes and scatters the result into the output tensor. The output tensor may contain uninitialized data before the reduce_scatter_tensor call, but it will be overwritten with the reduced and scattered data."
+    input_ignore:
+      - index: 0
+
+  # Tensor inplace func
+  - api_name: "tensor.masked_fill_"
+    description: "Inplace fill tensor with given value by filtered mask"
+    input_ignore:
+      - index: 0
msprobe/core/overflow_check/level.py
ADDED
@@ -0,0 +1,22 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+
+
+class OverflowLevel(Enum):
+    MEDIUM = "medium"
+    HIGH = "high"
+    CRITICAL = "critical"
msprobe/core/overflow_check/utils.py
ADDED
@@ -0,0 +1,28 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any
+
+CHECK_FIELDS = ['Max', 'Min', 'Mean', 'Norm']
+OVERFLOW_VALUES = ['inf', '-inf', 'nan']
+
+
+def has_nan_inf(value: Any) -> bool:
+    """Check whether the value contains NaN or Inf."""
+    if isinstance(value, dict):
+        for k, v in value.items():
+            if k in CHECK_FIELDS and str(v).lower() in OVERFLOW_VALUES:
+                return True
+    return False
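A quick illustration of the new helper: `has_nan_inf` only inspects the Max/Min/Mean/Norm fields of a statistics dict, as in the code above. The two dicts below are made-up examples, not real dump output.

```python
# Illustrative check: an overflow is flagged when any Max/Min/Mean/Norm value is inf, -inf or nan.
from msprobe.core.overflow_check.utils import has_nan_inf

stat_ok = {"Max": 3.2, "Min": -1.0, "Mean": 0.4, "Norm": 5.1}
stat_bad = {"Max": "inf", "Min": -1.0, "Mean": "nan", "Norm": 5.1}
assert not has_nan_inf(stat_ok)
assert has_nan_inf(stat_bad)
```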
msprobe/docs/01.installation.md
CHANGED
@@ -16,6 +16,7 @@ pip install mindstudio-probe
 
 |Version|Release date|Supported PyTorch versions|Supported MindSpore versions|Download link|Checksum|
 |:--:|:--:|:--:|:--:|:--:|:--:|
+|1.1.0|2024.10.14|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.0-py3-none-any.whl)|83a5a9b7c65a357639f8c9636d88c693b4cf0eb590d4f8f5cb56395ba69b1f6d|
 |1.0.4|2024.09.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.0.4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.4-py3-none-any.whl)|4e1909566a71a855b356597750c20ee43d964a22b2c2b02ac08312a5def75fd6|
 | 1.0.3 | 2024.08.23 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.3-py3-none-any.whl) | 7060cc141a5b98ef770cd9220995d299393f32a61938261e632c7e8b5160bef2 |
 | 1.0.2 | 2024.08.09 | 1.11/2.0/2.1/2.2 | 2.4.0 | [mindstudio_probe-1.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.0/mindstudio_probe-1.0.2-py3-none-any.whl) | e4a980e5d98c426ce5ce9842520d9bc031d3b3de621c74b3d59414cc6e238e0e |
@@ -49,6 +50,15 @@ pip install ./mindstudio_probe*.whl
 
 <table>
 <tr><th>Version</th><th>Features</th></tr>
+<tr><td rowspan="5">1.1.0</td><td>[General]</br>1. The unified training-accuracy tool atat has been renamed msprobe;</br>2. msprobe now supports log-level control.</td></tr>
+<tr><td>[Accuracy pre-check]</br>1. The blacklist field can now be configured;</br>2. The list of supported fusion operators has been extended.</td></tr>
+<tr><td>[Accuracy comparison]</br>1. Comparison with data mapping and layer mapping is supported.</td></tr>
+<tr><td>[Data collection]</br>1. Added an L1 dump interface that supports forward and backward dump within a specified range;</br>2. Added a mix mode in MindSpore that supports dumping inside specified modules;</br>3. Added communication API dump for MindSpore functional interfaces.</td></tr>
+<tr><td>[Gradient tool]</br>1. Added a note on the JIT limitations of the gradient tool.</td></tr>
+<tr><td rowspan="4">1.0.4</td><td>[Data collection]</br>1. A step range can be passed in config.json;</br>2. Improved the step mechanism in MindSpore scenarios: training keeps running after the configured steps finish.</td></tr>
+<tr><td>[Accuracy pre-check]</br>1. Accuracy pre-check of some NPU fusion operators is supported in PyTorch scenarios.</td></tr>
+<tr><td>[Accuracy comparison]</br>1. Fixed the issue that PyTorch had to be installed in MindSpore scenarios.</td></tr>
+<tr><td>[Benchmark-free comparison]</br>1. Added the performance baseline report for PyTorch scenarios;</br>2. The change_value perturbation mode is supported in MindSpore scenarios.</td></tr>
 <tr><td rowspan="2">1.0.3</td><td>[Accuracy pre-check]</br>1. Small on-disk data footprint;</br>2. Supports both randomly generated and real-data modes;</br>3. Single-API tests rule out accumulated error across the whole network.</td></tr>
 <tr><td>[Gradient detection]</br>1. Easy to use, no code needs to be inserted into the training flow.</br>2. Pinpoints the exact step where a problem appears.</td></tr>
 </table>
msprobe/docs/02.config_introduction.md
CHANGED
@@ -10,15 +10,14 @@
 
 ### 1.1 General configuration
 
-| Parameter | Description
-| -----------------
-| task | The dump task type, of type str. Options:<br/> "statistics": collect statistics only, the default;<br/> "tensor": collect statistics plus real data that fully reproduces the whole network;<br/> "run_ut": accuracy pre-check, PyTorch only, do not select it when collecting data;<br/> "overflow_check": overflow detection;<br/> "free_benchmark"
-| dump_path | Directory path for the dump data, of type str.<br/> **Example**: "dump_path": "./dump_path".
-| rank | Specifies the cards whose data is collected, of type list[Union[int, str]]. Not set by default (data of all cards is collected); elements must be integers ≥ 0 or strings such as "4-6", and must be Rank IDs that are actually available.<br/> PyTorch: Rank IDs count from 0, the maximum being the total number of available cards across all nodes minus 1. If a configured value exceeds the Rank IDs actually used by the training run, the dumped data is empty; for example, if the environment has Rank IDs 0 to 7 but training runs only on cards 0 to 3, configuring Rank ID 4 or a non-existent value such as 10 yields empty dump data.<br/> MindSpore: Rank IDs on every node count from 0, the maximum being the number of available cards per node minus 1; a rank parameter configured once in config.json applies to all nodes.<br/>**Example**: "rank": [1, "4-6"].
-| step | Specifies the steps whose data is collected, of type list[Union[int, str]]. Not set by default, meaning all steps are collected. Specific steps must exist in the training script and can be listed individually or as a range.<br/> **Example**: "step": [0, 1 , 2, "4-6"].
-| level | Dump level, of type str; different data is collected at different levels. Options:<br/>"L0": dump module-level accuracy data, supported only by PyTorch and MindSpore dynamic graph, background in [1.1.1 Module-level accuracy data dump](#111-模块级精度数据-dump-说明);<br/>"L1": dump API-level accuracy data, the default, supported only by PyTorch and MindSpore dynamic graph;<br/>"L2": dump kernel-level accuracy data, PyTorch
-
-| enable_dataloader | Automatic control switch, of type bool, PyTorch only. Options: true (enabled) or false (disabled), default false. When true, the iterations given by the step parameter are detected automatically and training exits once they complete; the start, stop and step functions then need not be called. Requires the training script to load data via torch.utils.data.dataloader. Only single-card PyTorch training is supported; distributed training may produce incomplete dump data. | No |
+| Parameter | Description | Required |
+| ----------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
+| task | The dump task type, of type str. Options:<br/> "statistics": collect statistics only, the default;<br/> "tensor": collect statistics plus real data that fully reproduces the whole network;<br/> "run_ut": accuracy pre-check, PyTorch only, do not select it when collecting data;<br/> "overflow_check": overflow detection;<br/> "free_benchmark": benchmark-free comparison;<br/> "grad_probe": gradient monitoring.<br/> Depending on the value of task, different scenario parameters can be configured, see:<br/>[1.2 task set to statistics](#12-task-配置为-statistics),<br/>[1.3 task set to tensor](#13-task-配置为-tensor),<br/>[1.4 task set to run_ut](#14-task-配置为-run_ut),<br/>[1.5 task set to overflow_check](#15-task-配置为-overflow_check),<br/>[1.6 task set to free_benchmark](#16-task-配置为-free_benchmark),<br/>[1.7 task set to grad_probe](#17-task-配置为-grad_probe).<br/> **Example**: "task": "tensor". | No |
+| dump_path | Directory path for the dump data, of type str.<br/> **Example**: "dump_path": "./dump_path". | Yes |
+| rank | Specifies the cards whose data is collected, of type list[Union[int, str]]. Not set by default (data of all cards is collected); elements must be integers ≥ 0 or strings such as "4-6", and must be Rank IDs that are actually available.<br/> PyTorch: Rank IDs count from 0, the maximum being the total number of available cards across all nodes minus 1. If a configured value exceeds the Rank IDs actually used by the training run, the dumped data is empty; for example, if the environment has Rank IDs 0 to 7 but training runs only on cards 0 to 3, configuring Rank ID 4 or a non-existent value such as 10 yields empty dump data.<br/> MindSpore: Rank IDs on every node count from 0, the maximum being the number of available cards per node minus 1; a rank parameter configured once in config.json applies to all nodes.<br/> Note: for single-card training, rank must be [], i.e. an empty list; a rank must not be specified.<br/>**Example**: "rank": [1, "4-6"]. | No |
+| step | Specifies the steps whose data is collected, of type list[Union[int, str]]. Not set by default, meaning all steps are collected. Specific steps must exist in the training script and can be listed individually or as a range.<br/> **Example**: "step": [0, 1 , 2, "4-6"]. | No |
+| level | Dump level, of type str; different data is collected at different levels. Options:<br/>"L0": dump module-level accuracy data, supported only by PyTorch and MindSpore dynamic graph, background in [1.1.1 Module-level accuracy data dump](#111-模块级精度数据-dump-说明);<br/>"L1": dump API-level accuracy data, the default, supported only by PyTorch and MindSpore dynamic graph;<br/>"L2": dump kernel-level accuracy data, for PyTorch see [kernel dump for PyTorch](./04.kernel_dump_PyTorch.md);<br/>"mix": dump both module-level and API-level accuracy data, i.e. "L0" + "L1", supported only by PyTorch and MindSpore dynamic graph.<br/> **Example**: "level": "L1". | No |
+| enable_dataloader | Automatic control switch, of type bool, PyTorch only. Options: true (enabled) or false (disabled), default false. When true, the iterations given by the step parameter are detected automatically and training exits once they complete; the start, stop and step functions then need not be called. Requires the training script to load data via torch.utils.data.dataloader. Only single-card PyTorch training is supported; distributed training may produce incomplete dump data. **This feature will be deprecated in the next release.** | No |
 
 #### 1.1.1 Module-level accuracy data dump
 
@@ -34,19 +33,21 @@
 
 <table>
 <tr><th>Parameter</th><th>Description</th><th>Required</th></tr>
-<tr><td>scope</td><td>Dump scope for PyTorch and MindSpore dynamic graph, of type list[str], not set by default (when list is also unset, all API data is dumped). Two module or API
+<tr><td>scope</td><td>Dump scope for PyTorch and MindSpore dynamic graph, of type list[str], not set by default (when list is also unset, all API data is dumped). Two module or API names can be given inside [ ]; the list length must be 2, and the full module or API names must follow the tool's naming format. They delimit an interval, and the data inside that interval is dumped.<br/><b>Example</b>:
+"scope": ["Module.conv1.Conv2d.forward.0", "Module.fc2.Linear.forward.0"],
+or "scope": ["Cell.conv1.Conv2d.forward.0", "Cell.fc2.Dense.backward.0"], or "scope": ["Tensor.add.0.forward", "Functional.square.2.forward"]. The allowed values depend on level: with level L0, module names may be configured; with level L1, API names; with level mix, either module or API names.</td><td>No</td></tr>
 <tr><td rowspan="4">list</td><td>Custom list of operators to collect, of type list[str], not set by default (when scope is also unset, all API data is dumped). It can be configured in the following ways:</td><td rowspan="4">No</td></tr>
-<tr><td>For PyTorch and MindSpore dynamic graph, configure the full API name to dump that API
-<
+<tr><td>For PyTorch and MindSpore dynamic graph, configure the full API name to dump that API's data. In PyTorch, this field is mandatory when level is L2.<br/><b>Example</b>: "list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"].<br/> For PyTorch and MindSpore dynamic graph with level mix, module names can also be configured to dump the module expanded, i.e. all data from just before the module starts executing until it finishes.
+<br/><b>Example</b>: "list": ["Module.module.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"], or "list": ["Cell.network_with_loss.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"]</td></tr>
+<tr><td>For PyTorch and MindSpore dynamic graph, specify a class of APIs to dump API-level input and output data of that class.<br/><b>Example</b>: "list": ["relu"]. <br/> For PyTorch and MindSpore dynamic graph with level mix, APIs whose names contain a string configured in list are dumped, and modules whose names contain such a string are also dumped expanded (all data from just before the module starts executing until it finishes).</td></tr>
 <tr><td>For MindSpore static graph, configure kernel_name: a list of operator names, an operator type (not supported with "level": "L2"), or a regular expression over operator names (a string of the form “name-regex(xxx)” is treated as a regular expression by the backend).<br/><b>Example</b>: list: ["name-regex(Default/.+)"]<br/>matches all operators whose names start with “Default/”.</td></tr>
 <tr><td rowspan="3">data_mode</td><td>Dump data filter, of type str.</td><td rowspan="3">No</td></tr>
-<tr><td>PyTorch
-<tr><td>MindSpore
-<tr><td>summary_mode</td><td>Controls the output mode of the dump files, of type str, only PyTorch
+<tr><td>PyTorch and MindSpore dynamic graph: "all", "forward", "backward", "input" and "output" are supported; apart from "all", the values can be combined freely. The default is ["all"], i.e. all dumped data is saved.<br/> <b>Example</b>: "data_mode": ["backward"] (save only backward data) or "data_mode": ["forward", "input"] (save only forward input data).</td></tr>
+<tr><td>MindSpore static graph: only "all", "input" and "output" are supported, each of which must be configured on its own; free combination is not supported.<br/><b>Example</b>: "data_mode": ["all"].</td></tr>
+<tr><td>summary_mode</td><td>Controls the output mode of the dump files, of type str, supported only by PyTorch and MindSpore dynamic graph. Options:<br/> md5: dump a dump.json file containing CRC-32 values as well as API statistics, used to verify data integrity;<br/> statistics: dump only a dump.json file containing API statistics, the default.<br/><b>Example</b>: "summary_mode": "md5".</td><td>No</td></tr>
 </table>
 
-
-
+**Note**: when "summary_mode" is set to "md5", the checksum algorithm used is CRC-32.
 
 ### 1.3 task set to tensor
 
@@ -54,9 +55,6 @@
 | -------------- | ---------------------- | -------- |
 | scope | Same meaning as in [1.2 task set to statistics](#12-task-配置为-statistics). | No |
 | list | Same meaning as in [1.2 task set to statistics](#12-task-配置为-statistics). | No |
-| backward_input | Dump files of the backward-API inputs collected in a first training run, of type list[str], only supported for kernel dump in PyTorch, not set by default. For example, to collect the inputs and outputs of the backward pass of the Functional.conv2d.1 API, look in the dump directory for dump files whose names contain the fields Functional.conv2d.1, backward and input.<br/>**Example**: "backward_input": ["./npu_dump/step0/rank0/dump_tensor_data/Functional.conv2d.1.backward.input.0.pt"] | No |
-| list | Same meaning as in [1.2 task set to statistics](#12-task-配置为-statistics). In addition,<br/> for PyTorch and MindSpore dynamic graph, configure kernel_api to dump kernel_api-level data of forward and backward APIs; dumping a backward API requires the **backward_input** parameter.<br/>**Forward API example**: "list": ["Tensor.permute.1.forward"];<br/>**Backward API example**: "list": ["Tensor.permute.1.forward"], "backward.input": ["./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"].<br/> | No |
-| backward_input | This input file is the dump file of the backward-API inputs obtained in a first training run, of type list[str], PyTorch only, not set by default. For example, to dump the inputs and outputs of the backward pass of the Functional.conv2d.1 API, look in the dump directory for dump files whose names contain the fields Functional.conv2d.1, backward and input.<br/>**Example**: "backward_input": ["./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"] | No |
 | data_mode | Same meaning as in [1.2 task set to statistics](#12-task-配置为-statistics) | No |
 | file_format | Storage format of the tensor data, of type str, supported only for L2 in MindSpore static graph, not for L0 or L1. Options:<br/> "bin": dumped tensor files are in binary format;<br/>"npy": dumped tensor files have the .npy suffix, the default. | No |
 | online_run_ut<sup>a</sup> | Online pre-check switch, of type bool. Options: true (enabled), false (disabled); not set by default, meaning disabled. Setting it to true enables the online pre-check.| No |
@@ -85,7 +83,7 @@
 
 ### 1.5 task set to overflow_check
 
-For PyTorch and MindSpore dynamic graph, "level" must be "L1"; for MindSpore static graph, "level" must be "L2" and the model compilation optimization level (jit_level) must be "O2".
+For PyTorch and MindSpore dynamic graph, "level" must be "L0" or "L1"; for MindSpore static graph, "level" must be "L2" and the model compilation optimization level (jit_level) must be "O2".
 
 | Parameter | Description | Required |
 | ------------- | ---------------------- | -------- |
@@ -112,7 +110,7 @@ For PyTorch and MindSpore dynamic graph, "level" must be "L1"; for MindSpore static
 <tr><td>pert_mode</td><td>Benchmark-free perturbation factor, of type str. Options:<br/> "improve_precision": promote the inputs to a higher precision, the default;<br/> "add_noise": add noise to the inputs;<br/> "no_change": re-execute without perturbation;<br/> "bit_noise": flip the last bit of the inputs, BF16 tensors are not supported in MindSpore;<br/> "change_value": swap the first and last values of the input tensor;<br/> "to_cpu": execute equivalently on the CPU (PyTorch only).<br/><b>Example</b>: "pert_mode": "improve_precision".</td><td>No</td></tr>
 <tr><td>handler_type</td><td>Handling type. Options:<br/> "check": perform the benchmark-free comparison check, the default;<br/> "fix": overwrite the original API output with the perturbed output to try to bring the loss curve back to normal; this mode supports neither preheating nor the backward pass, and only the "improve_precision" and "to_cpu" (PyTorch only) perturbation factors.<br/> <b>Example</b>: "handler_type": "check".</td><td>No</td></tr>
 <tr><td>fuzz_level</td><td>Benchmark-free dump level, i.e. which header attributes the comparison result file should contain; currently only "L1" is supported. The output is described in <a href="#161-无标杆比对数据存盘格式">1.6.1 Benchmark-free comparison data storage format</a>.</td><td>No</td></tr>
-<tr><td>fuzz_stage</td><td>Comparison stage: whether the benchmark-free comparison is applied to the forward or the backward pass of the API. Options:<br/> "forward": forward, the default;<br/> "backward"
+<tr><td>fuzz_stage</td><td>Comparison stage: whether the benchmark-free comparison is applied to the forward or the backward pass of the API. Options:<br/> "forward": forward, the default;<br/> "backward": backward. When fuzz_stage is "backward", handler_type can only be "check".<br/> <b>Example</b>: "fuzz_stage": "backward".</td><td>No</td></tr>
 <tr><td>if_preheat</td><td>Preheat feature (PyTorch only), of type bool. When enabled, the tool adjusts the thresholds of the accuracy algorithm based on the output of each iteration so as to identify problematic APIs more accurately. Preheating is not supported with "handler_type": "fix". Options:<br/> true (enabled) or false (disabled), disabled by default.<br/> <b>Example</b>: "if_preheat": "true".</td><td>No</td></tr>
 <tr><td>preheat_step</td><td>Number of preheat iterations (PyTorch only), of type int, default 15. Requires "if_preheat": "true".</td><td>No</td></tr>
 <tr><td>max_sample</td><td>Maximum number of preheat samples per operator (PyTorch only), of type int, default 20. Requires "if_preheat": "true".</td><td>No</td></tr>
@@ -135,3 +133,32 @@ For PyTorch and MindSpore dynamic graph, "level" must be "L1"; for MindSpore static
 | dtype | dtype of the input, of type string. |
 | shape | shape of the input, of type tuple. |
 | output_index | If the output is a list or tuple and one of its elements is detected as inconsistent, the index of that element; otherwise empty, of type int. |
+
+
+### 1.7 task set to grad_probe
+
+**Parameters**
+
+| Parameter | Description | Input type | Required |
+|--------------------------------|-----------------------------------|-----------------|----------|
+| task | Set to "grad_probe". | str | Yes |
+| dump_path | Output directory. It is created if it does not exist. | str | Yes |
+| rank | List of rank IDs; in multi-card scenarios, the ranks whose gradient data should be exported. An empty list means export data of all ranks. Empty by default. Specific ranks must be rank IDs that exist in the training script and can be listed individually or as a range.<br/> **Example**: "rank": [0, 1, 2, "4-6"]. (Specifying ranks is currently not supported in MindSpore static-graph mode.) | list[Union[int, str]] | No |
+| step | List of steps whose data should be exported. An empty list means export data of all steps. Empty by default. Specific steps must exist in the training script and can be listed individually or as a range.<br/> **Example**: "step": [0, 1, 2, "4-6"]. (Specifying steps is currently not supported in MindSpore static-graph mode.) | list[Union[int, str]] | No |
+| grad_level | Output level. Determines how detailed the exported data is; the higher the level, the more detail. Possible values: L0, L1, L2. Default L1. | str | No |
+| param_list | List of parameter names to monitor. An empty list means monitor all parameters. Empty by default. | List[str] | No |
+| bounds | List of boundaries used to split the value range into intervals for the distribution statistics. Must be sorted in ascending order and all elements must fit in the int64 range. The default [-1, 0, 1] can be used. | List[float, int] | No |
+
+
+**Data exported per level**
+
+
+| Level | Feature-data header | Direction data |
+| ---- | ------------------------------------------------------------ | -------------- |
+| L0 | ("param_name", "MD5", "max", "min", "norm", "shape") | No |
+| L1 | ("param_name", "max", "min", "norm", "shape") | Yes |
+| L2 | ("param_name", *intervals, "=0", "max", "min", "norm", "shape") | Yes |
+
+intervals are the intervals derived from the bounds value distribution.
+"MD5" is currently not supported at level L0 in MindSpore static-graph mode.
+
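The new section 1.7 above lists the grad_probe parameters; a hedged sketch of a matching config.json follows. Whether the task-specific fields nest under a "grad_probe" object, as the other tasks in this document do, is an assumption, and the values are examples rather than documented defaults.

```python
# Illustrative only: write a minimal config.json for the grad_probe task described in section 1.7.
# The nesting of grad_level/param_list/bounds under "grad_probe" is assumed, not confirmed by the docs.
import json

grad_probe_config = {
    "task": "grad_probe",
    "dump_path": "./grad_output",
    "rank": [],                 # empty list: export gradients for all ranks
    "step": [0, 1, "4-6"],      # individual steps or ranges
    "grad_probe": {
        "grad_level": "L1",     # L0/L1/L2, more detail as the level increases
        "param_list": [],       # empty list: monitor all parameters
        "bounds": [-1, 0, 1],   # interval boundaries for the value-distribution statistics
    },
}

with open("config.json", "w") as f:
    json.dump(grad_probe_config, f, indent=4, ensure_ascii=False)
```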
msprobe/docs/03.config_examples.md
CHANGED
@@ -13,7 +13,6 @@
     "rank": [],
     "step": [],
     "level": "L1",
-    "enable_dataloader": false,
 
     "statistics": {
         "scope": [],
@@ -33,13 +32,11 @@
     "rank": [],
     "step": [],
     "level": "L1",
-    "enable_dataloader": false,
 
     "tensor": {
         "scope": [],
         "list":[],
-        "data_mode": ["all"]
-        "backward_input": []
+        "data_mode": ["all"]
     }
 }
 ```
@@ -53,7 +50,6 @@
     "rank": [],
     "step": [],
     "level": "L1",
-    "enable_dataloader": false,
 
     "run_ut": {
         "white_list": [],
@@ -72,7 +68,6 @@
     "rank": [],
     "step": [],
     "level": "L1",
-    "enable_dataloader": false,
 
     "overflow_check": {
         "overflow_nums": 1
@@ -89,7 +84,6 @@
     "rank": [],
     "step": [],
    "level": "L1",
-    "enable_dataloader": false,
 
     "free_benchmark": {
         "scope": [],
@@ -138,8 +132,7 @@
 
     "tensor": {
         "list":[],
-        "data_mode": ["all"]
-        "backward_input": []
+        "data_mode": ["all"]
     }
 }
 ```