mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
  2. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
  3. msprobe/README.md +32 -1
  4. msprobe/core/__init__.py +17 -0
  5. msprobe/core/common/const.py +120 -21
  6. msprobe/core/common/exceptions.py +2 -2
  7. msprobe/core/common/file_utils.py +279 -50
  8. msprobe/core/common/framework_adapter.py +169 -0
  9. msprobe/core/common/global_lock.py +86 -0
  10. msprobe/core/common/runtime.py +25 -0
  11. msprobe/core/common/utils.py +136 -45
  12. msprobe/core/common_config.py +7 -0
  13. msprobe/core/compare/acc_compare.py +646 -428
  14. msprobe/core/compare/check.py +36 -103
  15. msprobe/core/compare/compare_cli.py +4 -0
  16. msprobe/core/compare/config.py +72 -0
  17. msprobe/core/compare/highlight.py +215 -215
  18. msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
  19. msprobe/core/compare/merge_result/merge_result.py +4 -4
  20. msprobe/core/compare/multiprocessing_compute.py +223 -110
  21. msprobe/core/compare/npy_compare.py +2 -4
  22. msprobe/core/compare/utils.py +214 -244
  23. msprobe/core/config_check/__init__.py +17 -0
  24. msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
  25. msprobe/core/config_check/checkers/base_checker.py +60 -0
  26. msprobe/core/config_check/checkers/dataset_checker.py +138 -0
  27. msprobe/core/config_check/checkers/env_args_checker.py +96 -0
  28. msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
  29. msprobe/core/config_check/checkers/pip_checker.py +90 -0
  30. msprobe/core/config_check/checkers/random_checker.py +367 -0
  31. msprobe/core/config_check/checkers/weights_checker.py +147 -0
  32. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
  33. msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
  34. msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
  35. msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
  36. msprobe/core/config_check/config_check_cli.py +51 -0
  37. msprobe/core/config_check/config_checker.py +100 -0
  38. msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
  39. msprobe/core/config_check/resource/env.yaml +57 -0
  40. msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
  41. msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
  42. msprobe/core/config_check/utils/utils.py +107 -0
  43. msprobe/core/data_dump/api_registry.py +67 -4
  44. msprobe/core/data_dump/data_collector.py +170 -89
  45. msprobe/core/data_dump/data_processor/base.py +72 -51
  46. msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
  47. msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
  48. msprobe/core/data_dump/json_writer.py +143 -27
  49. msprobe/core/debugger/precision_debugger.py +144 -0
  50. msprobe/core/grad_probe/constant.py +1 -1
  51. msprobe/core/grad_probe/grad_compare.py +1 -1
  52. msprobe/core/grad_probe/utils.py +1 -1
  53. msprobe/core/hook_manager.py +242 -0
  54. msprobe/core/monitor/anomaly_processor.py +384 -0
  55. msprobe/core/service.py +357 -0
  56. msprobe/core/single_save/__init__.py +0 -0
  57. msprobe/core/single_save/single_comparator.py +243 -0
  58. msprobe/core/single_save/single_saver.py +146 -0
  59. msprobe/docs/01.installation.md +6 -5
  60. msprobe/docs/02.config_introduction.md +79 -22
  61. msprobe/docs/03.config_examples.md +1 -0
  62. msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
  63. msprobe/docs/05.data_dump_PyTorch.md +118 -49
  64. msprobe/docs/06.data_dump_MindSpore.md +167 -20
  65. msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
  66. msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
  67. msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
  68. msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
  69. msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
  70. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  71. msprobe/docs/13.overflow_check_MindSpore.md +2 -2
  72. msprobe/docs/14.data_parse_PyTorch.md +3 -3
  73. msprobe/docs/17.grad_probe.md +2 -1
  74. msprobe/docs/18.online_dispatch.md +2 -2
  75. msprobe/docs/19.monitor.md +90 -44
  76. msprobe/docs/21.visualization_PyTorch.md +68 -15
  77. msprobe/docs/22.visualization_MindSpore.md +71 -18
  78. msprobe/docs/25.tool_function_introduction.md +23 -22
  79. msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
  80. msprobe/docs/27.dump_json_instruction.md +1 -1
  81. msprobe/docs/28.debugger_save_instruction.md +111 -20
  82. msprobe/docs/29.data_dump_MSAdapter.md +2 -2
  83. msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
  84. msprobe/docs/31.config_check.md +95 -0
  85. msprobe/docs/32.ckpt_compare.md +69 -0
  86. msprobe/docs/33.generate_operator_MindSpore.md +181 -0
  87. msprobe/docs/34.RL_collect.md +92 -0
  88. msprobe/docs/35.nan_analyze.md +72 -0
  89. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
  90. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
  91. msprobe/docs/img/compare_result.png +0 -0
  92. msprobe/docs/img/save_compare_result_sample.png +0 -0
  93. msprobe/docs/img/visualization/proxy.png +0 -0
  94. msprobe/mindspore/__init__.py +1 -2
  95. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
  96. msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
  97. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
  98. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
  99. msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
  100. msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
  101. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
  102. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
  103. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
  104. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
  105. msprobe/mindspore/cell_processor.py +204 -33
  106. msprobe/mindspore/code_mapping/graph_parser.py +4 -21
  107. msprobe/mindspore/common/const.py +17 -7
  108. msprobe/mindspore/common/utils.py +128 -11
  109. msprobe/mindspore/compare/common_dir_compare.py +382 -0
  110. msprobe/mindspore/compare/distributed_compare.py +2 -26
  111. msprobe/mindspore/compare/ms_compare.py +17 -405
  112. msprobe/mindspore/compare/ms_graph_compare.py +14 -5
  113. msprobe/mindspore/compare/utils.py +37 -0
  114. msprobe/mindspore/debugger/debugger_config.py +53 -3
  115. msprobe/mindspore/debugger/precision_debugger.py +72 -91
  116. msprobe/mindspore/dump/cell_dump_process.py +877 -0
  117. msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
  118. msprobe/mindspore/dump/dump_tool_factory.py +13 -5
  119. msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
  120. msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
  121. msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
  122. msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
  123. msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
  124. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
  125. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
  126. msprobe/mindspore/dump/jit_dump.py +21 -18
  127. msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
  128. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
  129. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
  130. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
  131. msprobe/mindspore/free_benchmark/common/utils.py +1 -1
  132. msprobe/mindspore/grad_probe/global_context.py +7 -2
  133. msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
  134. msprobe/mindspore/mindspore_service.py +114 -0
  135. msprobe/mindspore/monitor/common_func.py +52 -0
  136. msprobe/mindspore/monitor/data_writers.py +237 -0
  137. msprobe/mindspore/monitor/features.py +20 -7
  138. msprobe/mindspore/monitor/module_hook.py +281 -209
  139. msprobe/mindspore/monitor/optimizer_collect.py +334 -0
  140. msprobe/mindspore/monitor/utils.py +25 -5
  141. msprobe/mindspore/ms_config.py +16 -15
  142. msprobe/mindspore/task_handler_factory.py +5 -2
  143. msprobe/msprobe.py +19 -0
  144. msprobe/nan_analyze/__init__.py +14 -0
  145. msprobe/nan_analyze/analyzer.py +255 -0
  146. msprobe/nan_analyze/graph.py +189 -0
  147. msprobe/nan_analyze/utils.py +211 -0
  148. msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
  149. msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
  150. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
  151. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
  152. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
  153. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
  154. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
  155. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
  156. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
  157. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
  158. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
  159. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
  160. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
  161. msprobe/pytorch/attl_manager.py +65 -0
  162. msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
  163. msprobe/pytorch/common/utils.py +26 -14
  164. msprobe/pytorch/compare/distributed_compare.py +4 -36
  165. msprobe/pytorch/compare/pt_compare.py +13 -84
  166. msprobe/pytorch/compare/utils.py +47 -0
  167. msprobe/pytorch/debugger/debugger_config.py +34 -17
  168. msprobe/pytorch/debugger/precision_debugger.py +66 -118
  169. msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
  170. msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
  171. msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
  172. msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
  173. msprobe/pytorch/hook_module/api_register.py +29 -5
  174. msprobe/pytorch/hook_module/hook_module.py +9 -18
  175. msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
  176. msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
  177. msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
  178. msprobe/pytorch/hook_module/utils.py +28 -2
  179. msprobe/pytorch/monitor/csv2tb.py +6 -2
  180. msprobe/pytorch/monitor/data_writers.py +259 -0
  181. msprobe/pytorch/monitor/module_hook.py +227 -158
  182. msprobe/pytorch/monitor/module_metric.py +14 -0
  183. msprobe/pytorch/monitor/optimizer_collect.py +242 -270
  184. msprobe/pytorch/monitor/utils.py +16 -3
  185. msprobe/pytorch/online_dispatch/dispatch.py +4 -2
  186. msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
  187. msprobe/pytorch/parse_tool/lib/utils.py +3 -3
  188. msprobe/pytorch/pt_config.py +8 -7
  189. msprobe/pytorch/pytorch_service.py +73 -0
  190. msprobe/visualization/builder/graph_builder.py +33 -13
  191. msprobe/visualization/builder/msprobe_adapter.py +24 -11
  192. msprobe/visualization/compare/graph_comparator.py +53 -45
  193. msprobe/visualization/compare/mode_adapter.py +31 -1
  194. msprobe/visualization/graph/base_node.py +3 -3
  195. msprobe/visualization/graph/graph.py +2 -2
  196. msprobe/visualization/graph_service.py +250 -103
  197. msprobe/visualization/utils.py +27 -11
  198. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
  199. msprobe/mindspore/monitor/anomaly_detect.py +0 -404
  200. msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
  201. msprobe/mindspore/service.py +0 -549
  202. msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
  203. msprobe/pytorch/monitor/anomaly_detect.py +0 -410
  204. msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
  205. msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
  206. msprobe/pytorch/service.py +0 -473
  207. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
  208. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
  209. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
  210. {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
  211. /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
  212. /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
  213. /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/README.md CHANGED
@@ -54,7 +54,9 @@ export MSPROBE_LOG_LEVEL={x}
54
54
 
55
55
  **2. 工具读写的所有路径,如config_path、dump_path等,只允许包含大小写字母、数字、下划线、斜杠、点和短横线。**
56
56
 
57
- ## ⚙️ [安装](./docs/01.installation.md)
57
+ ## ⚙️ 安装
58
+
59
+ 请参见[安装指导说明](./docs/01.installation.md)。
58
60
 
59
61
  ## 🌟 新版本特性
60
62
 
@@ -138,6 +140,8 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.
138
140
 
139
141
  [PyTorch 单算子API自动生成脚本](./docs/23.generate_operator_PyTorch.md)
140
142
 
143
+ [MindSpore 单算子API自动生成脚本](./docs/33.generate_operator_MindSpore.md)
144
+
141
145
  ### 11 数码关联
142
146
 
143
147
  该功能只支持 MindSpore 静态图场景,用于将IR图与dump数据进行关联,获取dump数据和代码调用栈的关联关系。
@@ -155,6 +159,33 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.
155
159
 
156
160
  [MSAdapter 场景的溢出检测](./docs/30.overflow_check_MSAdapter.md)
157
161
 
162
+ ### 13 训练检查
163
+
164
+ 该工具主要包括:
165
+
166
+ 训练前或精度比对前,对比两个环境下可能影响训练精度的配置差异。
167
+
168
+ [PyTorch 训练前配置检查](./docs/31.config_check.md)
169
+
170
+ 训练过程中或结束后,比较两个不同的checkpoint,评估模型相似度。
171
+
172
+ [checkpoint比对](./docs/32.ckpt_compare.md)
173
+
174
+ ### 14 强化学习数据采集
175
+
176
+ 主要能力:
177
+
178
+ 灵活采集强化学习中重要关键过程数据,并支持比对。
179
+
180
+ [强化学习数据采集](./docs/34.RL_collect.md)
181
+
182
+ ### 15 整网首个溢出节点分析
183
+
184
+ 多rank场景下通过dump数据找到首个出现Nan或Inf的节点。
185
+
186
+ [PyTorch 场景整网首个溢出节点分析](./docs/35.nan_analyze.md)
187
+
188
+
158
189
  ## 📑 补充材料
159
190
 
160
191
  [无标杆比对功能在 PyTorch 场景的性能基线报告](./docs/S02.report_free_benchmarking_validation_performance_baseline.md)
msprobe/core/__init__.py CHANGED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from msprobe.core.single_save.single_saver import SingleSave
17
+ from msprobe.core.single_save.single_comparator import SingleComparator
msprobe/core/common/const.py CHANGED
@@ -70,7 +70,7 @@ class Const:
70
70
  SUMMARY = "summary"
71
71
  MD5 = "md5"
72
72
  VALUE = "value"
73
- SUMMARY_MODE = [ALL, SUMMARY, MD5]
73
+ SUMMARY_MODE = ["statistics", "md5"]
74
74
 
75
75
  WRITE_FLAGS = os.O_WRONLY | os.O_CREAT
76
76
  WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR
@@ -80,6 +80,8 @@ class Const:
80
80
  NUMPY_SUFFIX = ".npy"
81
81
  NUMPY_PATTERN = "*.npy"
82
82
  PT_SUFFIX = ".pt"
83
+ PY_SUFFIX = ".py"
84
+ INIT_PY = "init.py"
83
85
  ONE_GB = 1073741824 # 1 * 1024 * 1024 * 1024
84
86
  TEN_GB = 10737418240 # 10 * 1024 * 1024 * 1024
85
87
  ONE_MB = 1048576 # 1 * 1024 * 1024
@@ -95,6 +97,7 @@ class Const:
95
97
  GRAD_OUTPUT = 'grad_output'
96
98
  PARAMS = 'parameters'
97
99
  PARAMS_GRAD = 'parameters_grad'
100
+ DEBUG = 'debug'
98
101
  START = "start"
99
102
  STOP = "stop"
100
103
  ENV_ENABLE = "1"
@@ -132,6 +135,7 @@ class Const:
132
135
  NPU = 'NPU'
133
136
  NPU_LOWERCASE = 'npu'
134
137
  CPU_LOWERCASE = 'cpu'
138
+ GPU_LOWERCASE = 'gpu'
135
139
  CUDA_LOWERCASE = 'cuda'
136
140
  DEVICE = 'device'
137
141
  DISTRIBUTED = 'Distributed'
@@ -140,6 +144,10 @@ class Const:
140
144
  MODULE_PREFIX = ["Module", "Cell"]
141
145
  FORWARD_NAME_SUFFIX = ".forward"
142
146
 
147
+ DUMP_JSON_FILE = "dump_json_file"
148
+ DEBUG_JSON_FILE = "debug_json_file"
149
+ STACK_JSON_FILE = "stack_json_file"
150
+
143
151
  # struct json param
144
152
  ORIGIN_DATA = "origin_data"
145
153
  SCOPE = "scope"
@@ -170,6 +178,10 @@ class Const:
170
178
  TOP_LAYER = "TopLayer"
171
179
  CELL = "Cell"
172
180
  MODULE = "Module"
181
+ API = "api"
182
+ PYNATIVE_MODE = "pynative"
183
+ PYNATIVE_GRAPH_MODE = "pynative_graph"
184
+
173
185
  FRAME_FILE_LIST = ["site-packages/torch", "package/torch", "site-packages/mindspore", "package/mindspore"]
174
186
  INPLACE_LIST = [
175
187
  "broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter",
@@ -191,7 +203,11 @@ class Const:
191
203
 
192
204
  FILL_CHAR_NUMS = 50
193
205
  TOOL_ENDS_SUCCESSFULLY = f"{TOOL_NAME} ends successfully."
206
+
194
207
  WITHOUT_CALL_STACK = "The call stack retrieval failed."
208
+ STACK_FILTER_KEYWORDS = ["msprobe/core", "msprobe/pytorch", "msprobe/mindspore"]
209
+ CALL_STACK_FLAG = "data_dump/api_registry"
210
+ NEW_STACK_FLAG = "0"
195
211
 
196
212
  STEP = "step"
197
213
  RANK = "rank"
@@ -209,12 +225,16 @@ class Const:
209
225
  TORCH_FLOAT32 = "torch.float32"
210
226
  TORCH_BFLOAT16 = "torch.bfloat16"
211
227
 
228
+ TYPE = 'type'
212
229
  DTYPE = 'dtype'
213
230
  SHAPE = 'shape'
231
+ STACK_INFO = 'stack_info'
214
232
  MAX = 'Max'
215
233
  MIN = 'Min'
216
234
  MEAN = 'Mean'
217
235
  NORM = 'Norm'
236
+ DATA_NAME = 'data_name'
237
+ TENSOR_STAT_INDEX = 'tensor_stat_index'
218
238
 
219
239
  CODE_STACK = 'Code Stack'
220
240
  OP_NAME = 'Op Name'
@@ -226,6 +246,10 @@ class Const:
226
246
  # 分隔符常量
227
247
  SCOPE_SEPARATOR = "/"
228
248
  REPLACEMENT_CHARACTER = "_"
249
+ PIPE_SEPARATOR = "|"
250
+
251
+ FORWARD_PATTERN = SEP + FORWARD + SEP
252
+ BACKWARD_PATTERN = SEP + BACKWARD + SEP
229
253
 
230
254
  OPTIMIZER = "optimizer"
231
255
  CLIP_GRAD = "clip_grad"
@@ -243,6 +267,7 @@ class Const:
243
267
  PT_API_TYPE_ATEN = "aten"
244
268
  PT_API_TYPE_DIST = "distributed"
245
269
  PT_API_TYPE_NPU_DIST = "npu_distributed"
270
+ PT_API_TYPE_MINDSPEED = "mindspeed"
246
271
 
247
272
  MS_API_TYPE_OPS = "ops"
248
273
  MS_API_TYPE_TENSOR = "tensor"
@@ -250,6 +275,7 @@ class Const:
250
275
  MS_API_TYPE_MINT = "mint.ops"
251
276
  MS_API_TYPE_MINT_FUNC = "mint.nn.functional"
252
277
  MS_API_TYPE_COM = "communication.comm_func"
278
+ MS_API_TYPE_MINT_DIST = "mint.distributed"
253
279
 
254
280
  FUNCTIONAL_API_TYPE_PREFIX = "Functional"
255
281
  TENSOR_API_TYPE_PREFIX = "Tensor"
@@ -259,9 +285,11 @@ class Const:
259
285
  NPU_API_TYPE_PREFIX = "NPU"
260
286
  ATEN_API_TYPE_PREFIX = "Aten"
261
287
  VF_API_TYPE_PREFIX = "VF"
288
+ MINDSPEED_API_TYPE_PREFIX = "MindSpeed"
262
289
 
263
290
  MINT_API_TYPE_PREFIX = "Mint"
264
291
  MINT_FUNC_API_TYPE_PREFIX = "MintFunctional"
292
+ MINT_DIST_API_TYPE_PREFIX = "MintDistributed"
265
293
 
266
294
  SUPPORT_API_DICT_KEY_MAP = {
267
295
  PT_FRAMEWORK: {
@@ -272,7 +300,8 @@ class Const:
272
300
  PT_API_TYPE_NPU: PT_API_TYPE_NPU,
273
301
  PT_API_TYPE_ATEN: PT_API_TYPE_ATEN,
274
302
  PT_API_TYPE_DIST: PT_API_TYPE_DIST,
275
- PT_API_TYPE_NPU_DIST: PT_API_TYPE_NPU_DIST
303
+ PT_API_TYPE_NPU_DIST: PT_API_TYPE_NPU_DIST,
304
+ PT_API_TYPE_MINDSPEED: PT_API_TYPE_MINDSPEED
276
305
  },
277
306
  MS_FRAMEWORK: {
278
307
  MS_API_TYPE_OPS: MS_API_TYPE_OPS,
@@ -280,7 +309,8 @@ class Const:
280
309
  MS_API_TYPE_STUB_TENSOR: MS_API_TYPE_TENSOR,
281
310
  MS_API_TYPE_MINT: MS_API_TYPE_MINT,
282
311
  MS_API_TYPE_MINT_FUNC: MS_API_TYPE_MINT_FUNC,
283
- MS_API_TYPE_COM: MS_API_TYPE_COM
312
+ MS_API_TYPE_COM: MS_API_TYPE_COM,
313
+ MS_API_TYPE_MINT_DIST: MS_API_TYPE_MINT_DIST
284
314
  },
285
315
  MT_FRAMEWORK: {
286
316
  PT_API_TYPE_FUNCTIONAL: PT_API_TYPE_FUNCTIONAL,
@@ -300,7 +330,8 @@ class Const:
300
330
  PT_API_TYPE_NPU: NPU_API_TYPE_PREFIX,
301
331
  PT_API_TYPE_ATEN: ATEN_API_TYPE_PREFIX,
302
332
  PT_API_TYPE_DIST: DIST_API_TYPE_PREFIX,
303
- PT_API_TYPE_NPU_DIST: DIST_API_TYPE_PREFIX
333
+ PT_API_TYPE_NPU_DIST: DIST_API_TYPE_PREFIX,
334
+ PT_API_TYPE_MINDSPEED: MINDSPEED_API_TYPE_PREFIX
304
335
  },
305
336
  MS_FRAMEWORK: {
306
337
  MS_API_TYPE_OPS: FUNCTIONAL_API_TYPE_PREFIX,
@@ -308,7 +339,8 @@ class Const:
308
339
  MS_API_TYPE_STUB_TENSOR: TENSOR_API_TYPE_PREFIX,
309
340
  MS_API_TYPE_MINT: MINT_API_TYPE_PREFIX,
310
341
  MS_API_TYPE_MINT_FUNC: MINT_FUNC_API_TYPE_PREFIX,
311
- MS_API_TYPE_COM: DIST_API_TYPE_PREFIX
342
+ MS_API_TYPE_COM: DIST_API_TYPE_PREFIX,
343
+ MS_API_TYPE_MINT_DIST: MINT_DIST_API_TYPE_PREFIX
312
344
  },
313
345
  MT_FRAMEWORK: {
314
346
  PT_API_TYPE_FUNCTIONAL: FUNCTIONAL_API_TYPE_PREFIX,
@@ -319,12 +351,42 @@ class Const:
319
351
  }
320
352
  }
321
353
 
354
+ def _fused_adamw_(
355
+ self,
356
+ grads,
357
+ exp_avgs,
358
+ exp_avg_sqs,
359
+ max_exp_avg_sqs,
360
+ state_steps,
361
+ *,
362
+ lr,
363
+ beta1,
364
+ beta2,
365
+ weight_decay,
366
+ eps,
367
+ amsgrad,
368
+ maximize,
369
+ grad_scale=None,
370
+ found_inf=None
371
+ ):
372
+ pass
373
+
374
+ API_WITH_SELF_ARG = {
375
+ 'Torch._fused_adamw_': _fused_adamw_
376
+ }
377
+
378
+ ASCEND = "ASCEND"
379
+ MATCH_MODE_NAME = "pure name"
380
+ MATCH_MODE_MAPPING = "mapping"
381
+ MATCH_MODE_SIMILARITY = "similarity"
382
+
322
383
 
323
384
  class CompareConst:
324
385
  """
325
386
  Class for compare module const
326
387
  """
327
388
  SPACE = " "
389
+ NAME = "Name"
328
390
  # compare result column name
329
391
  NPU_NAME = "NPU Name"
330
392
  BENCH_NAME = "Bench Name"
@@ -368,6 +430,7 @@ class CompareConst:
368
430
  OUTPUT_STRUCT = "output_struct"
369
431
  PARAMS_STRUCT = "params_struct"
370
432
  PARAMS_GRAD_STRUCT = "params_grad_struct"
433
+ DEBUG_STRUCT = "debug_struct"
371
434
  SUMMARY = "summary"
372
435
  COMPARE_RESULT = "compare_result"
373
436
  COMPARE_MESSAGE = "compare_message"
@@ -474,16 +537,10 @@ class CompareConst:
474
537
  Const.KWARGS: INPUT_STRUCT,
475
538
  Const.OUTPUT: OUTPUT_STRUCT,
476
539
  Const.PARAMS: PARAMS_STRUCT,
477
- Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT
540
+ Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT,
541
+ Const.DEBUG: DEBUG_STRUCT
478
542
  }
479
543
 
480
- STRUCT_COMPARE_KEY = [
481
- INPUT_STRUCT,
482
- OUTPUT_STRUCT,
483
- PARAMS_STRUCT,
484
- PARAMS_GRAD_STRUCT
485
- ]
486
-
487
544
  # compare standard
488
545
  HUNDRED_RATIO_THRESHOLD = 0.01
489
546
  THOUSAND_RATIO_THRESHOLD = 0.001
@@ -562,15 +619,35 @@ class CompareConst:
562
619
  MAX_DIFF: None, MIN_DIFF: None, MEAN_DIFF: None, NORM_DIFF: None, MAX_RELATIVE_ERR: None,
563
620
  MIN_RELATIVE_ERR: None, MEAN_RELATIVE_ERR: None, NORM_RELATIVE_ERR: None
564
621
  }
622
+
623
+ API_MAPPING_KEYS_TO_COMPARE = [
624
+ ('ms_args', 'pt_args'),
625
+ ('ms_outputs', 'pt_outputs'),
626
+ ('ms_parameters', 'pt_parameters'),
627
+ ('ms_parameters_grad', 'pt_parameters_grad')
628
+ ]
629
+
565
630
  INPUT_PATTERN = Const.SEP + Const.INPUT + Const.SEP
566
631
  KWARGS_PATTERN = Const.SEP + Const.KWARGS + Const.SEP
567
632
  OUTPUT_PATTERN = Const.SEP + Const.OUTPUT + Const.SEP
568
633
  PARAMS_PATTERN = Const.SEP + Const.PARAMS + Const.SEP
569
634
  PARAMS_GRAD_PATTERN = Const.SEP + Const.PARAMS_GRAD + Const.SEP
570
- COMPARE_KEY = 'compare_key'
571
- COMPARE_SHAPE = 'compare_shape'
635
+
636
+ CMP_KEY = 'compare_key'
637
+ CMP_SHAPE = 'compare_shape'
638
+
639
+ OP_NAME_X = 'op_name_x'
640
+ MATCH_RESULT_COLUMNS = [
641
+ OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x',
642
+ CMP_KEY, CMP_SHAPE,
643
+ 'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'data_name_y',
644
+ ]
645
+
572
646
  INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml'
573
647
  UNREADABLE = 'unreadable data'
648
+ NPU_DUMP_DATA_DIR = 'npu_dump_data_dir'
649
+ BENCH_DUMP_DATA_DIR = 'bench_dump_data_dir'
650
+ NO_REAL_DATA_FLAG = '-1'
574
651
 
575
652
 
576
653
  class FileCheckConst:
@@ -592,6 +669,8 @@ class FileCheckConst:
592
669
  XLSX_SUFFIX = ".xlsx"
593
670
  YAML_SUFFIX = ".yaml"
594
671
  IR_SUFFIX = ".ir"
672
+ ZIP_SUFFIX = ".zip"
673
+ SHELL_SUFFIX = ".sh"
595
674
  MAX_PKL_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
596
675
  MAX_NUMPY_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
597
676
  MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
@@ -600,6 +679,9 @@ class FileCheckConst:
600
679
  MAX_XLSX_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
601
680
  MAX_YAML_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
602
681
  MAX_IR_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
682
+ MAX_ZIP_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
683
+ MAX_FILE_IN_ZIP_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
684
+ MAX_FILE_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
603
685
  COMMOM_FILE_SIZE = 1048576 # 1 * 1024 * 1024
604
686
  DIR = "dir"
605
687
  FILE = "file"
@@ -613,7 +695,8 @@ class FileCheckConst:
613
695
  CSV_SUFFIX: MAX_CSV_SIZE,
614
696
  XLSX_SUFFIX: MAX_XLSX_SIZE,
615
697
  YAML_SUFFIX: MAX_YAML_SIZE,
616
- IR_SUFFIX: MAX_IR_SIZE
698
+ IR_SUFFIX: MAX_IR_SIZE,
699
+ ZIP_SUFFIX: MAX_ZIP_SIZE
617
700
  }
618
701
  CSV_BLACK_LIST = r'^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]'
619
702
 
@@ -671,7 +754,7 @@ class MonitorConst:
671
754
  DEFAULT_MIN_COLLECT_TIMES = 0
672
755
  DEFAULT_STEP_INTERVAL = 1
673
756
 
674
- OP_LIST = ["norm", "min", "max", "zeros", "nans", "id", "mean"]
757
+ OP_LIST = ["norm", "min", "max", "zeros", "nans", "id", "mean", "shape", "dtype"]
675
758
  MONITOR_OUTPUT_DIR = "MONITOR_OUTPUT_DIR"
676
759
  DEFAULT_MONITOR_OUTPUT_DIR = "./monitor_output"
677
760
  DATABASE = "database"
@@ -683,7 +766,7 @@ class MonitorConst:
683
766
  "DeepSpeedZeroOptimizer_Stage3"
684
767
  )
685
768
  DEEPSPEED_ZERO_OPT_FILTER = "DeepSpeedZeroOptimizer"
686
- RULE_NAME = ['AnomalyTurbulence']
769
+ RULE_NAME = ['AnomalyTurbulence', 'AnomalyNan']
687
770
 
688
771
  SLICE_SIZE = 20480
689
772
  # used for name
@@ -700,15 +783,16 @@ class MonitorConst:
700
783
  ACTVGRAD = "actv_grad"
701
784
  POST_GRAD = "post_grad"
702
785
  PRE_GRAD = "pre_grad"
786
+ PRE_PARAM = "param_origin"
787
+ POST_PARAM = "param_updated"
703
788
  ACC_GRAD = "acc_grad"
704
789
  PREFIX_POST = "post"
705
790
  PREFIX_PRE = "pre"
706
791
  EXP_AVG = "exp_avg"
707
792
  EXP_AVG_SQ = "exp_avg_sq"
708
- PARAM = "param"
709
793
 
710
794
  CSV_HEADER = ["vpp_stage", "name", "step"]
711
- CSV_HEADER_XY = ["vpp_stage", "name", "step", "micro_step"]
795
+ CSV_HEADER_MICRO_STEP = ["vpp_stage", "name", "step", "micro_step"]
712
796
  OUTPUT_DIR_PATTERN = r"([\w-]{0,20})-rank(\d{1,5})-"
713
797
  ANOMALY_JSON = "anomaly.json"
714
798
  ANALYSE_JSON = "anomaly_analyse.json"
@@ -716,5 +800,20 @@ class MonitorConst:
716
800
  CSV = "csv"
717
801
  API = "api"
718
802
  HEADER_NAME = 'name'
719
-
720
803
  MAX_NDIGITS = 20
804
+
805
+ DEFAULT_STAGE = -1
806
+ FORWARD_STAGE = 0
807
+ BACKWARD_STAGE = 1
808
+ OPTIMIZER_STAGE = 2
809
+ FORWARD_KEY = [ACTV]
810
+ BACKWARD_KEY = [ACTVGRAD, PRE_GRAD, POST_GRAD, ACC_GRAD]
811
+ OPTIMIZER_KEY = [EXP_AVG, EXP_AVG_SQ]
812
+
813
+ TRAIN_STAGE = {}
814
+ for key in FORWARD_KEY:
815
+ TRAIN_STAGE[key] = FORWARD_STAGE
816
+ for key in BACKWARD_KEY:
817
+ TRAIN_STAGE[key] = BACKWARD_STAGE
818
+ for key in OPTIMIZER_KEY:
819
+ TRAIN_STAGE[key] = OPTIMIZER_STAGE
msprobe/core/common/exceptions.py CHANGED
@@ -21,8 +21,8 @@ class CodedException(Exception):
21
21
 
22
22
  def __str__(self):
23
23
  return self.error_info
24
-
25
-
24
+
25
+
26
26
  class MsprobeException(CodedException):
27
27
  INVALID_PARAM_ERROR = 0
28
28
  OVERFLOW_NUMS_ERROR = 1