mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
- msprobe/README.md +32 -1
- msprobe/core/__init__.py +17 -0
- msprobe/core/common/const.py +120 -21
- msprobe/core/common/exceptions.py +2 -2
- msprobe/core/common/file_utils.py +279 -50
- msprobe/core/common/framework_adapter.py +169 -0
- msprobe/core/common/global_lock.py +86 -0
- msprobe/core/common/runtime.py +25 -0
- msprobe/core/common/utils.py +136 -45
- msprobe/core/common_config.py +7 -0
- msprobe/core/compare/acc_compare.py +646 -428
- msprobe/core/compare/check.py +36 -103
- msprobe/core/compare/compare_cli.py +4 -0
- msprobe/core/compare/config.py +72 -0
- msprobe/core/compare/highlight.py +215 -215
- msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
- msprobe/core/compare/merge_result/merge_result.py +4 -4
- msprobe/core/compare/multiprocessing_compute.py +223 -110
- msprobe/core/compare/npy_compare.py +2 -4
- msprobe/core/compare/utils.py +214 -244
- msprobe/core/config_check/__init__.py +17 -0
- msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
- msprobe/core/config_check/checkers/base_checker.py +60 -0
- msprobe/core/config_check/checkers/dataset_checker.py +138 -0
- msprobe/core/config_check/checkers/env_args_checker.py +96 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
- msprobe/core/config_check/checkers/pip_checker.py +90 -0
- msprobe/core/config_check/checkers/random_checker.py +367 -0
- msprobe/core/config_check/checkers/weights_checker.py +147 -0
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
- msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
- msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
- msprobe/core/config_check/config_check_cli.py +51 -0
- msprobe/core/config_check/config_checker.py +100 -0
- msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
- msprobe/core/config_check/resource/env.yaml +57 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
- msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
- msprobe/core/config_check/utils/utils.py +107 -0
- msprobe/core/data_dump/api_registry.py +67 -4
- msprobe/core/data_dump/data_collector.py +170 -89
- msprobe/core/data_dump/data_processor/base.py +72 -51
- msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
- msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
- msprobe/core/data_dump/json_writer.py +143 -27
- msprobe/core/debugger/precision_debugger.py +144 -0
- msprobe/core/grad_probe/constant.py +1 -1
- msprobe/core/grad_probe/grad_compare.py +1 -1
- msprobe/core/grad_probe/utils.py +1 -1
- msprobe/core/hook_manager.py +242 -0
- msprobe/core/monitor/anomaly_processor.py +384 -0
- msprobe/core/service.py +357 -0
- msprobe/core/single_save/__init__.py +0 -0
- msprobe/core/single_save/single_comparator.py +243 -0
- msprobe/core/single_save/single_saver.py +146 -0
- msprobe/docs/01.installation.md +6 -5
- msprobe/docs/02.config_introduction.md +79 -22
- msprobe/docs/03.config_examples.md +1 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +118 -49
- msprobe/docs/06.data_dump_MindSpore.md +167 -20
- msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
- msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
- msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
- msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
- msprobe/docs/12.overflow_check_PyTorch.md +2 -2
- msprobe/docs/13.overflow_check_MindSpore.md +2 -2
- msprobe/docs/14.data_parse_PyTorch.md +3 -3
- msprobe/docs/17.grad_probe.md +2 -1
- msprobe/docs/18.online_dispatch.md +2 -2
- msprobe/docs/19.monitor.md +90 -44
- msprobe/docs/21.visualization_PyTorch.md +68 -15
- msprobe/docs/22.visualization_MindSpore.md +71 -18
- msprobe/docs/25.tool_function_introduction.md +23 -22
- msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
- msprobe/docs/27.dump_json_instruction.md +1 -1
- msprobe/docs/28.debugger_save_instruction.md +111 -20
- msprobe/docs/29.data_dump_MSAdapter.md +2 -2
- msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
- msprobe/docs/31.config_check.md +95 -0
- msprobe/docs/32.ckpt_compare.md +69 -0
- msprobe/docs/33.generate_operator_MindSpore.md +181 -0
- msprobe/docs/34.RL_collect.md +92 -0
- msprobe/docs/35.nan_analyze.md +72 -0
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/mindspore/__init__.py +1 -2
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
- msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
- msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
- msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
- msprobe/mindspore/cell_processor.py +204 -33
- msprobe/mindspore/code_mapping/graph_parser.py +4 -21
- msprobe/mindspore/common/const.py +17 -7
- msprobe/mindspore/common/utils.py +128 -11
- msprobe/mindspore/compare/common_dir_compare.py +382 -0
- msprobe/mindspore/compare/distributed_compare.py +2 -26
- msprobe/mindspore/compare/ms_compare.py +17 -405
- msprobe/mindspore/compare/ms_graph_compare.py +14 -5
- msprobe/mindspore/compare/utils.py +37 -0
- msprobe/mindspore/debugger/debugger_config.py +53 -3
- msprobe/mindspore/debugger/precision_debugger.py +72 -91
- msprobe/mindspore/dump/cell_dump_process.py +877 -0
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
- msprobe/mindspore/dump/dump_tool_factory.py +13 -5
- msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
- msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
- msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
- msprobe/mindspore/dump/jit_dump.py +21 -18
- msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
- msprobe/mindspore/free_benchmark/common/utils.py +1 -1
- msprobe/mindspore/grad_probe/global_context.py +7 -2
- msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
- msprobe/mindspore/mindspore_service.py +114 -0
- msprobe/mindspore/monitor/common_func.py +52 -0
- msprobe/mindspore/monitor/data_writers.py +237 -0
- msprobe/mindspore/monitor/features.py +20 -7
- msprobe/mindspore/monitor/module_hook.py +281 -209
- msprobe/mindspore/monitor/optimizer_collect.py +334 -0
- msprobe/mindspore/monitor/utils.py +25 -5
- msprobe/mindspore/ms_config.py +16 -15
- msprobe/mindspore/task_handler_factory.py +5 -2
- msprobe/msprobe.py +19 -0
- msprobe/nan_analyze/__init__.py +14 -0
- msprobe/nan_analyze/analyzer.py +255 -0
- msprobe/nan_analyze/graph.py +189 -0
- msprobe/nan_analyze/utils.py +211 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
- msprobe/pytorch/common/utils.py +26 -14
- msprobe/pytorch/compare/distributed_compare.py +4 -36
- msprobe/pytorch/compare/pt_compare.py +13 -84
- msprobe/pytorch/compare/utils.py +47 -0
- msprobe/pytorch/debugger/debugger_config.py +34 -17
- msprobe/pytorch/debugger/precision_debugger.py +66 -118
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
- msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
- msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
- msprobe/pytorch/hook_module/api_register.py +29 -5
- msprobe/pytorch/hook_module/hook_module.py +9 -18
- msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
- msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
- msprobe/pytorch/hook_module/utils.py +28 -2
- msprobe/pytorch/monitor/csv2tb.py +6 -2
- msprobe/pytorch/monitor/data_writers.py +259 -0
- msprobe/pytorch/monitor/module_hook.py +227 -158
- msprobe/pytorch/monitor/module_metric.py +14 -0
- msprobe/pytorch/monitor/optimizer_collect.py +242 -270
- msprobe/pytorch/monitor/utils.py +16 -3
- msprobe/pytorch/online_dispatch/dispatch.py +4 -2
- msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
- msprobe/pytorch/parse_tool/lib/utils.py +3 -3
- msprobe/pytorch/pt_config.py +8 -7
- msprobe/pytorch/pytorch_service.py +73 -0
- msprobe/visualization/builder/graph_builder.py +33 -13
- msprobe/visualization/builder/msprobe_adapter.py +24 -11
- msprobe/visualization/compare/graph_comparator.py +53 -45
- msprobe/visualization/compare/mode_adapter.py +31 -1
- msprobe/visualization/graph/base_node.py +3 -3
- msprobe/visualization/graph/graph.py +2 -2
- msprobe/visualization/graph_service.py +250 -103
- msprobe/visualization/utils.py +27 -11
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
- msprobe/mindspore/monitor/anomaly_detect.py +0 -404
- msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
- msprobe/mindspore/service.py +0 -549
- msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
- msprobe/pytorch/monitor/anomaly_detect.py +0 -410
- msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
- msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
- msprobe/pytorch/service.py +0 -473
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
- /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
- /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/README.md
CHANGED
|
@@ -54,7 +54,9 @@ export MSPROBE_LOG_LEVEL={x}
|
|
|
54
54
|
|
|
55
55
|
**2. 工具读写的所有路径,如config_path、dump_path等,只允许包含大小写字母、数字、下划线、斜杠、点和短横线。**
|
|
56
56
|
|
|
57
|
-
## ⚙️
|
|
57
|
+
## ⚙️ 安装
|
|
58
|
+
|
|
59
|
+
请参见[安装指导说明](./docs/01.installation.md)。
|
|
58
60
|
|
|
59
61
|
## 🌟 新版本特性
|
|
60
62
|
|
|
@@ -138,6 +140,8 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.
|
|
|
138
140
|
|
|
139
141
|
[PyTorch 单算子API自动生成脚本](./docs/23.generate_operator_PyTorch.md)
|
|
140
142
|
|
|
143
|
+
[MindSpore 单算子API自动生成脚本](./docs/33.generate_operator_MindSpore.md)
|
|
144
|
+
|
|
141
145
|
### 11 数码关联
|
|
142
146
|
|
|
143
147
|
该功能只支持 MindSpore 静态图场景,用于将IR图与dump数据进行关联,获取dump数据和代码调用栈的关联关系。
|
|
@@ -155,6 +159,33 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.
|
|
|
155
159
|
|
|
156
160
|
[MSAdapter 场景的溢出检测](./docs/30.overflow_check_MSAdapter.md)
|
|
157
161
|
|
|
162
|
+
### 13 训练检查
|
|
163
|
+
|
|
164
|
+
该工具主要包括:
|
|
165
|
+
|
|
166
|
+
训练前或精度比对前,对比两个环境下可能影响训练精度的配置差异。
|
|
167
|
+
|
|
168
|
+
[PyTorch 训练前配置检查](./docs/31.config_check.md)
|
|
169
|
+
|
|
170
|
+
训练过程中或结束后,比较两个不同的checkpoint,评估模型相似度。
|
|
171
|
+
|
|
172
|
+
[checkpoint比对](./docs/32.ckpt_compare.md)
|
|
173
|
+
|
|
174
|
+
### 14 强化学习数据采集
|
|
175
|
+
|
|
176
|
+
主要能力:
|
|
177
|
+
|
|
178
|
+
灵活采集强化学习中重要关键过程数据,并支持比对。
|
|
179
|
+
|
|
180
|
+
[强化学习数据采集](./docs/34.RL_collect.md)
|
|
181
|
+
|
|
182
|
+
### 15 整网首个溢出节点分析
|
|
183
|
+
|
|
184
|
+
多rank场景下通过dump数据找到首个出现Nan或Inf的节点。
|
|
185
|
+
|
|
186
|
+
[PyTorch 场景整网首个溢出节点分析](./docs/35.nan_analyze.md)
|
|
187
|
+
|
|
188
|
+
|
|
158
189
|
## 📑 补充材料
|
|
159
190
|
|
|
160
191
|
[无标杆比对功能在 PyTorch 场景的性能基线报告](./docs/S02.report_free_benchmarking_validation_performance_baseline.md)
|
msprobe/core/__init__.py
CHANGED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
from msprobe.core.single_save.single_saver import SingleSave
|
|
17
|
+
from msprobe.core.single_save.single_comparator import SingleComparator
|
msprobe/core/common/const.py
CHANGED
|
@@ -70,7 +70,7 @@ class Const:
|
|
|
70
70
|
SUMMARY = "summary"
|
|
71
71
|
MD5 = "md5"
|
|
72
72
|
VALUE = "value"
|
|
73
|
-
SUMMARY_MODE = [
|
|
73
|
+
SUMMARY_MODE = ["statistics", "md5"]
|
|
74
74
|
|
|
75
75
|
WRITE_FLAGS = os.O_WRONLY | os.O_CREAT
|
|
76
76
|
WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR
|
|
@@ -80,6 +80,8 @@ class Const:
|
|
|
80
80
|
NUMPY_SUFFIX = ".npy"
|
|
81
81
|
NUMPY_PATTERN = "*.npy"
|
|
82
82
|
PT_SUFFIX = ".pt"
|
|
83
|
+
PY_SUFFIX = ".py"
|
|
84
|
+
INIT_PY = "init.py"
|
|
83
85
|
ONE_GB = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
84
86
|
TEN_GB = 10737418240 # 10 * 1024 * 1024 * 1024
|
|
85
87
|
ONE_MB = 1048576 # 1 * 1024 * 1024
|
|
@@ -95,6 +97,7 @@ class Const:
|
|
|
95
97
|
GRAD_OUTPUT = 'grad_output'
|
|
96
98
|
PARAMS = 'parameters'
|
|
97
99
|
PARAMS_GRAD = 'parameters_grad'
|
|
100
|
+
DEBUG = 'debug'
|
|
98
101
|
START = "start"
|
|
99
102
|
STOP = "stop"
|
|
100
103
|
ENV_ENABLE = "1"
|
|
@@ -132,6 +135,7 @@ class Const:
|
|
|
132
135
|
NPU = 'NPU'
|
|
133
136
|
NPU_LOWERCASE = 'npu'
|
|
134
137
|
CPU_LOWERCASE = 'cpu'
|
|
138
|
+
GPU_LOWERCASE = 'gpu'
|
|
135
139
|
CUDA_LOWERCASE = 'cuda'
|
|
136
140
|
DEVICE = 'device'
|
|
137
141
|
DISTRIBUTED = 'Distributed'
|
|
@@ -140,6 +144,10 @@ class Const:
|
|
|
140
144
|
MODULE_PREFIX = ["Module", "Cell"]
|
|
141
145
|
FORWARD_NAME_SUFFIX = ".forward"
|
|
142
146
|
|
|
147
|
+
DUMP_JSON_FILE = "dump_json_file"
|
|
148
|
+
DEBUG_JSON_FILE = "debug_json_file"
|
|
149
|
+
STACK_JSON_FILE = "stack_json_file"
|
|
150
|
+
|
|
143
151
|
# struct json param
|
|
144
152
|
ORIGIN_DATA = "origin_data"
|
|
145
153
|
SCOPE = "scope"
|
|
@@ -170,6 +178,10 @@ class Const:
|
|
|
170
178
|
TOP_LAYER = "TopLayer"
|
|
171
179
|
CELL = "Cell"
|
|
172
180
|
MODULE = "Module"
|
|
181
|
+
API = "api"
|
|
182
|
+
PYNATIVE_MODE = "pynative"
|
|
183
|
+
PYNATIVE_GRAPH_MODE = "pynative_graph"
|
|
184
|
+
|
|
173
185
|
FRAME_FILE_LIST = ["site-packages/torch", "package/torch", "site-packages/mindspore", "package/mindspore"]
|
|
174
186
|
INPLACE_LIST = [
|
|
175
187
|
"broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter",
|
|
@@ -191,7 +203,11 @@ class Const:
|
|
|
191
203
|
|
|
192
204
|
FILL_CHAR_NUMS = 50
|
|
193
205
|
TOOL_ENDS_SUCCESSFULLY = f"{TOOL_NAME} ends successfully."
|
|
206
|
+
|
|
194
207
|
WITHOUT_CALL_STACK = "The call stack retrieval failed."
|
|
208
|
+
STACK_FILTER_KEYWORDS = ["msprobe/core", "msprobe/pytorch", "msprobe/mindspore"]
|
|
209
|
+
CALL_STACK_FLAG = "data_dump/api_registry"
|
|
210
|
+
NEW_STACK_FLAG = "0"
|
|
195
211
|
|
|
196
212
|
STEP = "step"
|
|
197
213
|
RANK = "rank"
|
|
@@ -209,12 +225,16 @@ class Const:
|
|
|
209
225
|
TORCH_FLOAT32 = "torch.float32"
|
|
210
226
|
TORCH_BFLOAT16 = "torch.bfloat16"
|
|
211
227
|
|
|
228
|
+
TYPE = 'type'
|
|
212
229
|
DTYPE = 'dtype'
|
|
213
230
|
SHAPE = 'shape'
|
|
231
|
+
STACK_INFO = 'stack_info'
|
|
214
232
|
MAX = 'Max'
|
|
215
233
|
MIN = 'Min'
|
|
216
234
|
MEAN = 'Mean'
|
|
217
235
|
NORM = 'Norm'
|
|
236
|
+
DATA_NAME = 'data_name'
|
|
237
|
+
TENSOR_STAT_INDEX = 'tensor_stat_index'
|
|
218
238
|
|
|
219
239
|
CODE_STACK = 'Code Stack'
|
|
220
240
|
OP_NAME = 'Op Name'
|
|
@@ -226,6 +246,10 @@ class Const:
|
|
|
226
246
|
# 分隔符常量
|
|
227
247
|
SCOPE_SEPARATOR = "/"
|
|
228
248
|
REPLACEMENT_CHARACTER = "_"
|
|
249
|
+
PIPE_SEPARATOR = "|"
|
|
250
|
+
|
|
251
|
+
FORWARD_PATTERN = SEP + FORWARD + SEP
|
|
252
|
+
BACKWARD_PATTERN = SEP + BACKWARD + SEP
|
|
229
253
|
|
|
230
254
|
OPTIMIZER = "optimizer"
|
|
231
255
|
CLIP_GRAD = "clip_grad"
|
|
@@ -243,6 +267,7 @@ class Const:
|
|
|
243
267
|
PT_API_TYPE_ATEN = "aten"
|
|
244
268
|
PT_API_TYPE_DIST = "distributed"
|
|
245
269
|
PT_API_TYPE_NPU_DIST = "npu_distributed"
|
|
270
|
+
PT_API_TYPE_MINDSPEED = "mindspeed"
|
|
246
271
|
|
|
247
272
|
MS_API_TYPE_OPS = "ops"
|
|
248
273
|
MS_API_TYPE_TENSOR = "tensor"
|
|
@@ -250,6 +275,7 @@ class Const:
|
|
|
250
275
|
MS_API_TYPE_MINT = "mint.ops"
|
|
251
276
|
MS_API_TYPE_MINT_FUNC = "mint.nn.functional"
|
|
252
277
|
MS_API_TYPE_COM = "communication.comm_func"
|
|
278
|
+
MS_API_TYPE_MINT_DIST = "mint.distributed"
|
|
253
279
|
|
|
254
280
|
FUNCTIONAL_API_TYPE_PREFIX = "Functional"
|
|
255
281
|
TENSOR_API_TYPE_PREFIX = "Tensor"
|
|
@@ -259,9 +285,11 @@ class Const:
|
|
|
259
285
|
NPU_API_TYPE_PREFIX = "NPU"
|
|
260
286
|
ATEN_API_TYPE_PREFIX = "Aten"
|
|
261
287
|
VF_API_TYPE_PREFIX = "VF"
|
|
288
|
+
MINDSPEED_API_TYPE_PREFIX = "MindSpeed"
|
|
262
289
|
|
|
263
290
|
MINT_API_TYPE_PREFIX = "Mint"
|
|
264
291
|
MINT_FUNC_API_TYPE_PREFIX = "MintFunctional"
|
|
292
|
+
MINT_DIST_API_TYPE_PREFIX = "MintDistributed"
|
|
265
293
|
|
|
266
294
|
SUPPORT_API_DICT_KEY_MAP = {
|
|
267
295
|
PT_FRAMEWORK: {
|
|
@@ -272,7 +300,8 @@ class Const:
|
|
|
272
300
|
PT_API_TYPE_NPU: PT_API_TYPE_NPU,
|
|
273
301
|
PT_API_TYPE_ATEN: PT_API_TYPE_ATEN,
|
|
274
302
|
PT_API_TYPE_DIST: PT_API_TYPE_DIST,
|
|
275
|
-
PT_API_TYPE_NPU_DIST: PT_API_TYPE_NPU_DIST
|
|
303
|
+
PT_API_TYPE_NPU_DIST: PT_API_TYPE_NPU_DIST,
|
|
304
|
+
PT_API_TYPE_MINDSPEED: PT_API_TYPE_MINDSPEED
|
|
276
305
|
},
|
|
277
306
|
MS_FRAMEWORK: {
|
|
278
307
|
MS_API_TYPE_OPS: MS_API_TYPE_OPS,
|
|
@@ -280,7 +309,8 @@ class Const:
|
|
|
280
309
|
MS_API_TYPE_STUB_TENSOR: MS_API_TYPE_TENSOR,
|
|
281
310
|
MS_API_TYPE_MINT: MS_API_TYPE_MINT,
|
|
282
311
|
MS_API_TYPE_MINT_FUNC: MS_API_TYPE_MINT_FUNC,
|
|
283
|
-
MS_API_TYPE_COM: MS_API_TYPE_COM
|
|
312
|
+
MS_API_TYPE_COM: MS_API_TYPE_COM,
|
|
313
|
+
MS_API_TYPE_MINT_DIST: MS_API_TYPE_MINT_DIST
|
|
284
314
|
},
|
|
285
315
|
MT_FRAMEWORK: {
|
|
286
316
|
PT_API_TYPE_FUNCTIONAL: PT_API_TYPE_FUNCTIONAL,
|
|
@@ -300,7 +330,8 @@ class Const:
|
|
|
300
330
|
PT_API_TYPE_NPU: NPU_API_TYPE_PREFIX,
|
|
301
331
|
PT_API_TYPE_ATEN: ATEN_API_TYPE_PREFIX,
|
|
302
332
|
PT_API_TYPE_DIST: DIST_API_TYPE_PREFIX,
|
|
303
|
-
PT_API_TYPE_NPU_DIST: DIST_API_TYPE_PREFIX
|
|
333
|
+
PT_API_TYPE_NPU_DIST: DIST_API_TYPE_PREFIX,
|
|
334
|
+
PT_API_TYPE_MINDSPEED: MINDSPEED_API_TYPE_PREFIX
|
|
304
335
|
},
|
|
305
336
|
MS_FRAMEWORK: {
|
|
306
337
|
MS_API_TYPE_OPS: FUNCTIONAL_API_TYPE_PREFIX,
|
|
@@ -308,7 +339,8 @@ class Const:
|
|
|
308
339
|
MS_API_TYPE_STUB_TENSOR: TENSOR_API_TYPE_PREFIX,
|
|
309
340
|
MS_API_TYPE_MINT: MINT_API_TYPE_PREFIX,
|
|
310
341
|
MS_API_TYPE_MINT_FUNC: MINT_FUNC_API_TYPE_PREFIX,
|
|
311
|
-
MS_API_TYPE_COM: DIST_API_TYPE_PREFIX
|
|
342
|
+
MS_API_TYPE_COM: DIST_API_TYPE_PREFIX,
|
|
343
|
+
MS_API_TYPE_MINT_DIST: MINT_DIST_API_TYPE_PREFIX
|
|
312
344
|
},
|
|
313
345
|
MT_FRAMEWORK: {
|
|
314
346
|
PT_API_TYPE_FUNCTIONAL: FUNCTIONAL_API_TYPE_PREFIX,
|
|
@@ -319,12 +351,42 @@ class Const:
|
|
|
319
351
|
}
|
|
320
352
|
}
|
|
321
353
|
|
|
354
|
+
def _fused_adamw_(
|
|
355
|
+
self,
|
|
356
|
+
grads,
|
|
357
|
+
exp_avgs,
|
|
358
|
+
exp_avg_sqs,
|
|
359
|
+
max_exp_avg_sqs,
|
|
360
|
+
state_steps,
|
|
361
|
+
*,
|
|
362
|
+
lr,
|
|
363
|
+
beta1,
|
|
364
|
+
beta2,
|
|
365
|
+
weight_decay,
|
|
366
|
+
eps,
|
|
367
|
+
amsgrad,
|
|
368
|
+
maximize,
|
|
369
|
+
grad_scale=None,
|
|
370
|
+
found_inf=None
|
|
371
|
+
):
|
|
372
|
+
pass
|
|
373
|
+
|
|
374
|
+
API_WITH_SELF_ARG = {
|
|
375
|
+
'Torch._fused_adamw_': _fused_adamw_
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
ASCEND = "ASCEND"
|
|
379
|
+
MATCH_MODE_NAME = "pure name"
|
|
380
|
+
MATCH_MODE_MAPPING = "mapping"
|
|
381
|
+
MATCH_MODE_SIMILARITY = "similarity"
|
|
382
|
+
|
|
322
383
|
|
|
323
384
|
class CompareConst:
|
|
324
385
|
"""
|
|
325
386
|
Class for compare module const
|
|
326
387
|
"""
|
|
327
388
|
SPACE = " "
|
|
389
|
+
NAME = "Name"
|
|
328
390
|
# compare result column name
|
|
329
391
|
NPU_NAME = "NPU Name"
|
|
330
392
|
BENCH_NAME = "Bench Name"
|
|
@@ -368,6 +430,7 @@ class CompareConst:
|
|
|
368
430
|
OUTPUT_STRUCT = "output_struct"
|
|
369
431
|
PARAMS_STRUCT = "params_struct"
|
|
370
432
|
PARAMS_GRAD_STRUCT = "params_grad_struct"
|
|
433
|
+
DEBUG_STRUCT = "debug_struct"
|
|
371
434
|
SUMMARY = "summary"
|
|
372
435
|
COMPARE_RESULT = "compare_result"
|
|
373
436
|
COMPARE_MESSAGE = "compare_message"
|
|
@@ -474,16 +537,10 @@ class CompareConst:
|
|
|
474
537
|
Const.KWARGS: INPUT_STRUCT,
|
|
475
538
|
Const.OUTPUT: OUTPUT_STRUCT,
|
|
476
539
|
Const.PARAMS: PARAMS_STRUCT,
|
|
477
|
-
Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT
|
|
540
|
+
Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT,
|
|
541
|
+
Const.DEBUG: DEBUG_STRUCT
|
|
478
542
|
}
|
|
479
543
|
|
|
480
|
-
STRUCT_COMPARE_KEY = [
|
|
481
|
-
INPUT_STRUCT,
|
|
482
|
-
OUTPUT_STRUCT,
|
|
483
|
-
PARAMS_STRUCT,
|
|
484
|
-
PARAMS_GRAD_STRUCT
|
|
485
|
-
]
|
|
486
|
-
|
|
487
544
|
# compare standard
|
|
488
545
|
HUNDRED_RATIO_THRESHOLD = 0.01
|
|
489
546
|
THOUSAND_RATIO_THRESHOLD = 0.001
|
|
@@ -562,15 +619,35 @@ class CompareConst:
|
|
|
562
619
|
MAX_DIFF: None, MIN_DIFF: None, MEAN_DIFF: None, NORM_DIFF: None, MAX_RELATIVE_ERR: None,
|
|
563
620
|
MIN_RELATIVE_ERR: None, MEAN_RELATIVE_ERR: None, NORM_RELATIVE_ERR: None
|
|
564
621
|
}
|
|
622
|
+
|
|
623
|
+
API_MAPPING_KEYS_TO_COMPARE = [
|
|
624
|
+
('ms_args', 'pt_args'),
|
|
625
|
+
('ms_outputs', 'pt_outputs'),
|
|
626
|
+
('ms_parameters', 'pt_parameters'),
|
|
627
|
+
('ms_parameters_grad', 'pt_parameters_grad')
|
|
628
|
+
]
|
|
629
|
+
|
|
565
630
|
INPUT_PATTERN = Const.SEP + Const.INPUT + Const.SEP
|
|
566
631
|
KWARGS_PATTERN = Const.SEP + Const.KWARGS + Const.SEP
|
|
567
632
|
OUTPUT_PATTERN = Const.SEP + Const.OUTPUT + Const.SEP
|
|
568
633
|
PARAMS_PATTERN = Const.SEP + Const.PARAMS + Const.SEP
|
|
569
634
|
PARAMS_GRAD_PATTERN = Const.SEP + Const.PARAMS_GRAD + Const.SEP
|
|
570
|
-
|
|
571
|
-
|
|
635
|
+
|
|
636
|
+
CMP_KEY = 'compare_key'
|
|
637
|
+
CMP_SHAPE = 'compare_shape'
|
|
638
|
+
|
|
639
|
+
OP_NAME_X = 'op_name_x'
|
|
640
|
+
MATCH_RESULT_COLUMNS = [
|
|
641
|
+
OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x',
|
|
642
|
+
CMP_KEY, CMP_SHAPE,
|
|
643
|
+
'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'data_name_y',
|
|
644
|
+
]
|
|
645
|
+
|
|
572
646
|
INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml'
|
|
573
647
|
UNREADABLE = 'unreadable data'
|
|
648
|
+
NPU_DUMP_DATA_DIR = 'npu_dump_data_dir'
|
|
649
|
+
BENCH_DUMP_DATA_DIR = 'bench_dump_data_dir'
|
|
650
|
+
NO_REAL_DATA_FLAG = '-1'
|
|
574
651
|
|
|
575
652
|
|
|
576
653
|
class FileCheckConst:
|
|
@@ -592,6 +669,8 @@ class FileCheckConst:
|
|
|
592
669
|
XLSX_SUFFIX = ".xlsx"
|
|
593
670
|
YAML_SUFFIX = ".yaml"
|
|
594
671
|
IR_SUFFIX = ".ir"
|
|
672
|
+
ZIP_SUFFIX = ".zip"
|
|
673
|
+
SHELL_SUFFIX = ".sh"
|
|
595
674
|
MAX_PKL_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
596
675
|
MAX_NUMPY_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
|
|
597
676
|
MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
@@ -600,6 +679,9 @@ class FileCheckConst:
|
|
|
600
679
|
MAX_XLSX_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
601
680
|
MAX_YAML_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
602
681
|
MAX_IR_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
682
|
+
MAX_ZIP_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
|
|
683
|
+
MAX_FILE_IN_ZIP_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
684
|
+
MAX_FILE_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
|
|
603
685
|
COMMOM_FILE_SIZE = 1048576 # 1 * 1024 * 1024
|
|
604
686
|
DIR = "dir"
|
|
605
687
|
FILE = "file"
|
|
@@ -613,7 +695,8 @@ class FileCheckConst:
|
|
|
613
695
|
CSV_SUFFIX: MAX_CSV_SIZE,
|
|
614
696
|
XLSX_SUFFIX: MAX_XLSX_SIZE,
|
|
615
697
|
YAML_SUFFIX: MAX_YAML_SIZE,
|
|
616
|
-
IR_SUFFIX: MAX_IR_SIZE
|
|
698
|
+
IR_SUFFIX: MAX_IR_SIZE,
|
|
699
|
+
ZIP_SUFFIX: MAX_ZIP_SIZE
|
|
617
700
|
}
|
|
618
701
|
CSV_BLACK_LIST = r'^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]'
|
|
619
702
|
|
|
@@ -671,7 +754,7 @@ class MonitorConst:
|
|
|
671
754
|
DEFAULT_MIN_COLLECT_TIMES = 0
|
|
672
755
|
DEFAULT_STEP_INTERVAL = 1
|
|
673
756
|
|
|
674
|
-
OP_LIST = ["norm", "min", "max", "zeros", "nans", "id", "mean"]
|
|
757
|
+
OP_LIST = ["norm", "min", "max", "zeros", "nans", "id", "mean", "shape", "dtype"]
|
|
675
758
|
MONITOR_OUTPUT_DIR = "MONITOR_OUTPUT_DIR"
|
|
676
759
|
DEFAULT_MONITOR_OUTPUT_DIR = "./monitor_output"
|
|
677
760
|
DATABASE = "database"
|
|
@@ -683,7 +766,7 @@ class MonitorConst:
|
|
|
683
766
|
"DeepSpeedZeroOptimizer_Stage3"
|
|
684
767
|
)
|
|
685
768
|
DEEPSPEED_ZERO_OPT_FILTER = "DeepSpeedZeroOptimizer"
|
|
686
|
-
RULE_NAME = ['AnomalyTurbulence']
|
|
769
|
+
RULE_NAME = ['AnomalyTurbulence', 'AnomalyNan']
|
|
687
770
|
|
|
688
771
|
SLICE_SIZE = 20480
|
|
689
772
|
# used for name
|
|
@@ -700,15 +783,16 @@ class MonitorConst:
|
|
|
700
783
|
ACTVGRAD = "actv_grad"
|
|
701
784
|
POST_GRAD = "post_grad"
|
|
702
785
|
PRE_GRAD = "pre_grad"
|
|
786
|
+
PRE_PARAM = "param_origin"
|
|
787
|
+
POST_PARAM = "param_updated"
|
|
703
788
|
ACC_GRAD = "acc_grad"
|
|
704
789
|
PREFIX_POST = "post"
|
|
705
790
|
PREFIX_PRE = "pre"
|
|
706
791
|
EXP_AVG = "exp_avg"
|
|
707
792
|
EXP_AVG_SQ = "exp_avg_sq"
|
|
708
|
-
PARAM = "param"
|
|
709
793
|
|
|
710
794
|
CSV_HEADER = ["vpp_stage", "name", "step"]
|
|
711
|
-
|
|
795
|
+
CSV_HEADER_MICRO_STEP = ["vpp_stage", "name", "step", "micro_step"]
|
|
712
796
|
OUTPUT_DIR_PATTERN = r"([\w-]{0,20})-rank(\d{1,5})-"
|
|
713
797
|
ANOMALY_JSON = "anomaly.json"
|
|
714
798
|
ANALYSE_JSON = "anomaly_analyse.json"
|
|
@@ -716,5 +800,20 @@ class MonitorConst:
|
|
|
716
800
|
CSV = "csv"
|
|
717
801
|
API = "api"
|
|
718
802
|
HEADER_NAME = 'name'
|
|
719
|
-
|
|
720
803
|
MAX_NDIGITS = 20
|
|
804
|
+
|
|
805
|
+
DEFAULT_STAGE = -1
|
|
806
|
+
FORWARD_STAGE = 0
|
|
807
|
+
BACKWARD_STAGE = 1
|
|
808
|
+
OPTIMIZER_STAGE = 2
|
|
809
|
+
FORWARD_KEY = [ACTV]
|
|
810
|
+
BACKWARD_KEY = [ACTVGRAD, PRE_GRAD, POST_GRAD, ACC_GRAD]
|
|
811
|
+
OPTIMIZER_KEY = [EXP_AVG, EXP_AVG_SQ]
|
|
812
|
+
|
|
813
|
+
TRAIN_STAGE = {}
|
|
814
|
+
for key in FORWARD_KEY:
|
|
815
|
+
TRAIN_STAGE[key] = FORWARD_STAGE
|
|
816
|
+
for key in BACKWARD_KEY:
|
|
817
|
+
TRAIN_STAGE[key] = BACKWARD_STAGE
|
|
818
|
+
for key in OPTIMIZER_KEY:
|
|
819
|
+
TRAIN_STAGE[key] = OPTIMIZER_STAGE
|