mindstudio-probe 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/METADATA +7 -6
  2. mindstudio_probe-1.2.1.dist-info/RECORD +396 -0
  3. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/WHEEL +1 -1
  4. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/entry_points.txt +0 -1
  5. msprobe/CMakeLists.txt +5 -0
  6. msprobe/README.md +51 -20
  7. msprobe/config.json +2 -3
  8. msprobe/core/advisor/advisor.py +8 -3
  9. msprobe/core/common/const.py +264 -15
  10. msprobe/core/common/exceptions.py +27 -3
  11. msprobe/core/common/file_utils.py +176 -26
  12. msprobe/core/common/inplace_op_checker.py +15 -0
  13. msprobe/core/common/inplace_ops.yaml +3 -0
  14. msprobe/core/common/log.py +27 -9
  15. msprobe/core/common/utils.py +204 -77
  16. msprobe/core/common_config.py +49 -14
  17. msprobe/core/compare/acc_compare.py +274 -198
  18. msprobe/core/compare/check.py +32 -33
  19. msprobe/core/compare/compare_cli.py +32 -14
  20. msprobe/core/compare/highlight.py +283 -127
  21. msprobe/core/compare/layer_mapping/__init__.py +19 -0
  22. msprobe/core/compare/layer_mapping/data_scope_parser.py +246 -0
  23. msprobe/core/compare/layer_mapping/layer_mapping.py +249 -0
  24. msprobe/core/compare/layer_mapping/postprocess_pass.py +95 -0
  25. msprobe/core/compare/merge_result/merge_result.py +380 -0
  26. msprobe/core/compare/merge_result/merge_result_cli.py +31 -0
  27. msprobe/core/compare/multiprocessing_compute.py +2 -2
  28. msprobe/core/compare/npy_compare.py +135 -144
  29. msprobe/core/compare/utils.py +419 -274
  30. msprobe/core/data_dump/data_collector.py +60 -28
  31. msprobe/core/data_dump/data_processor/base.py +84 -36
  32. msprobe/core/data_dump/data_processor/factory.py +5 -3
  33. msprobe/core/data_dump/data_processor/mindspore_processor.py +152 -18
  34. msprobe/core/data_dump/data_processor/pytorch_processor.py +267 -110
  35. msprobe/core/data_dump/json_writer.py +29 -1
  36. msprobe/core/data_dump/scope.py +119 -39
  37. msprobe/core/grad_probe/constant.py +27 -13
  38. msprobe/core/grad_probe/grad_compare.py +18 -1
  39. msprobe/core/grad_probe/utils.py +30 -2
  40. msprobe/core/overflow_check/abnormal_scene.py +189 -0
  41. msprobe/core/overflow_check/api_info.py +55 -0
  42. msprobe/core/overflow_check/checker.py +138 -0
  43. msprobe/core/overflow_check/filter.py +157 -0
  44. msprobe/core/overflow_check/ignore_rules.yaml +55 -0
  45. msprobe/core/overflow_check/level.py +22 -0
  46. msprobe/core/overflow_check/utils.py +28 -0
  47. msprobe/docs/01.installation.md +96 -7
  48. msprobe/docs/02.config_introduction.md +50 -23
  49. msprobe/docs/03.config_examples.md +2 -9
  50. msprobe/docs/04.kernel_dump_PyTorch.md +73 -0
  51. msprobe/docs/05.data_dump_PyTorch.md +93 -61
  52. msprobe/docs/06.data_dump_MindSpore.md +200 -95
  53. msprobe/docs/07.accuracy_checker_PyTorch.md +28 -28
  54. msprobe/docs/08.accuracy_checker_online_PyTorch.md +1 -6
  55. msprobe/docs/09.accuracy_checker_MindSpore.md +44 -8
  56. msprobe/docs/10.accuracy_compare_PyTorch.md +114 -50
  57. msprobe/docs/11.accuracy_compare_MindSpore.md +340 -48
  58. msprobe/docs/12.overflow_check_PyTorch.md +2 -2
  59. msprobe/docs/13.overflow_check_MindSpore.md +6 -6
  60. msprobe/docs/15.free_benchmarking_PyTorch.md +4 -5
  61. msprobe/docs/16.free_benchmarking_MindSpore.md +56 -37
  62. msprobe/docs/17.grad_probe.md +5 -6
  63. msprobe/docs/19.monitor.md +561 -0
  64. msprobe/docs/20.monitor_performance_baseline.md +52 -0
  65. msprobe/docs/21.visualization_PyTorch.md +466 -0
  66. msprobe/docs/22.visualization_MindSpore.md +481 -0
  67. msprobe/docs/23.generate_operator_PyTorch.md +107 -0
  68. msprobe/docs/24.code_mapping_Mindspore.md +28 -0
  69. msprobe/docs/25.tool_function_introduction.md +29 -0
  70. msprobe/docs/26.data_dump_PyTorch_baseline.md +37 -0
  71. msprobe/docs/27.dump_json_instruction.md +521 -0
  72. msprobe/docs/FAQ.md +29 -2
  73. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +14 -0
  74. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +22 -0
  75. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +211 -0
  76. msprobe/docs/img/compare_result.png +0 -0
  77. msprobe/docs/img/merge_result.png +0 -0
  78. msprobe/docs/img/monitor/cpu_info.png +0 -0
  79. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  80. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  81. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  82. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  83. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  84. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  85. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  86. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  87. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  88. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  89. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  90. msprobe/docs/visualization/GPTModel.png +0 -0
  91. msprobe/docs/visualization/ParallelMLP.png +0 -0
  92. msprobe/docs/visualization/layer_mapping_example.md +132 -0
  93. msprobe/docs/visualization/mapping.png +0 -0
  94. msprobe/docs/visualization/mapping1.png +0 -0
  95. msprobe/docs/visualization/module_name.png +0 -0
  96. msprobe/docs/visualization/module_name1.png +0 -0
  97. msprobe/docs/visualization/no_mapping.png +0 -0
  98. msprobe/docs/visualization/no_mapping1.png +0 -0
  99. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  100. msprobe/docs/visualization/top_layer.png +0 -0
  101. msprobe/mindspore/__init__.py +25 -0
  102. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -151
  103. msprobe/mindspore/api_accuracy_checker/api_info.py +21 -6
  104. msprobe/mindspore/api_accuracy_checker/api_runner.py +43 -18
  105. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +21 -7
  106. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +77 -0
  107. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +64 -1
  108. msprobe/mindspore/api_accuracy_checker/compute_element.py +64 -31
  109. msprobe/mindspore/api_accuracy_checker/data_manager.py +301 -0
  110. msprobe/mindspore/api_accuracy_checker/main.py +28 -3
  111. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +212 -0
  112. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +60 -0
  113. msprobe/mindspore/api_accuracy_checker/type_mapping.py +22 -5
  114. msprobe/mindspore/api_accuracy_checker/utils.py +34 -17
  115. msprobe/mindspore/cell_processor.py +33 -12
  116. msprobe/mindspore/code_mapping/bind.py +264 -0
  117. msprobe/mindspore/code_mapping/cmd_parser.py +40 -0
  118. msprobe/mindspore/code_mapping/graph.py +49 -0
  119. msprobe/mindspore/code_mapping/graph_parser.py +226 -0
  120. msprobe/mindspore/code_mapping/main.py +24 -0
  121. msprobe/mindspore/code_mapping/processor.py +34 -0
  122. msprobe/mindspore/common/const.py +35 -13
  123. msprobe/mindspore/common/log.py +5 -9
  124. msprobe/mindspore/common/utils.py +88 -4
  125. msprobe/mindspore/compare/distributed_compare.py +22 -24
  126. msprobe/mindspore/compare/ms_compare.py +333 -268
  127. msprobe/mindspore/compare/ms_graph_compare.py +95 -52
  128. msprobe/mindspore/debugger/debugger_config.py +7 -1
  129. msprobe/mindspore/debugger/precision_debugger.py +87 -12
  130. msprobe/mindspore/dump/dump_tool_factory.py +3 -1
  131. msprobe/mindspore/dump/hook_cell/api_registry.py +95 -18
  132. msprobe/mindspore/dump/hook_cell/hook_cell.py +60 -38
  133. msprobe/mindspore/dump/hook_cell/primitive_hooks.py +45 -30
  134. msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +36 -1
  135. msprobe/mindspore/dump/hook_cell/wrap_api.py +92 -1
  136. msprobe/mindspore/dump/jit_dump.py +17 -5
  137. msprobe/mindspore/dump/kernel_dump/kernel_config.py +33 -0
  138. msprobe/mindspore/dump/kernel_graph_dump.py +9 -4
  139. msprobe/mindspore/dump/kernel_kbyk_dump.py +2 -4
  140. msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +140 -0
  141. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +53 -0
  142. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +156 -41
  143. msprobe/mindspore/free_benchmark/common/handler_params.py +1 -2
  144. msprobe/mindspore/free_benchmark/common/utils.py +19 -4
  145. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -204
  146. msprobe/mindspore/free_benchmark/handler/base_handler.py +3 -3
  147. msprobe/mindspore/free_benchmark/handler/check_handler.py +4 -5
  148. msprobe/mindspore/free_benchmark/handler/fix_handler.py +4 -4
  149. msprobe/mindspore/free_benchmark/handler/handler_factory.py +4 -4
  150. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +2 -2
  151. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +15 -6
  152. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +2 -2
  153. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +2 -2
  154. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +13 -6
  155. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +2 -2
  156. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +2 -2
  157. msprobe/mindspore/grad_probe/global_context.py +28 -8
  158. msprobe/mindspore/grad_probe/grad_analyzer.py +50 -24
  159. msprobe/mindspore/grad_probe/grad_monitor.py +16 -1
  160. msprobe/mindspore/grad_probe/grad_stat_csv.py +33 -5
  161. msprobe/mindspore/grad_probe/hook.py +35 -12
  162. msprobe/mindspore/grad_probe/utils.py +18 -5
  163. msprobe/mindspore/mindtorch/__init__.py +18 -0
  164. msprobe/mindspore/mindtorch/mindtorch_adaptor.py +255 -0
  165. msprobe/mindspore/ms_config.py +27 -16
  166. msprobe/mindspore/overflow_check/kernel_graph_overflow_check.py +9 -4
  167. msprobe/mindspore/runtime.py +15 -0
  168. msprobe/mindspore/service.py +285 -113
  169. msprobe/mindspore/task_handler_factory.py +15 -0
  170. msprobe/msprobe.py +48 -10
  171. msprobe/pytorch/__init__.py +8 -6
  172. msprobe/pytorch/api_accuracy_checker/common/config.py +62 -0
  173. msprobe/pytorch/api_accuracy_checker/common/utils.py +31 -16
  174. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -8
  175. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +103 -271
  176. msprobe/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml +4 -1
  177. msprobe/pytorch/api_accuracy_checker/compare/compare.py +69 -68
  178. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +54 -0
  179. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +51 -0
  180. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +2 -4
  181. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +9 -0
  182. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +478 -0
  183. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +365 -0
  184. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +106 -0
  185. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +107 -0
  186. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +151 -0
  187. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +226 -0
  188. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +68 -0
  189. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +218 -0
  190. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +104 -0
  191. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +63 -0
  192. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +200 -0
  193. msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +63 -2
  194. msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +21 -15
  195. msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +54 -22
  196. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +140 -71
  197. msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +49 -8
  198. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +9 -24
  199. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +4 -12
  200. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +5 -3
  201. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/dump_dispatch.py +9 -4
  202. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +3 -11
  203. msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +2 -2
  204. msprobe/pytorch/bench_functions/confusion_transpose.py +5 -1
  205. msprobe/pytorch/bench_functions/matmul_backward.py +12 -0
  206. msprobe/pytorch/bench_functions/npu_fusion_attention.py +142 -16
  207. msprobe/pytorch/bench_functions/rotary_mul.py +4 -0
  208. msprobe/pytorch/bench_functions/swiglu.py +10 -2
  209. msprobe/pytorch/common/parse_json.py +7 -6
  210. msprobe/pytorch/common/utils.py +101 -7
  211. msprobe/pytorch/compare/distributed_compare.py +17 -30
  212. msprobe/pytorch/compare/pt_compare.py +44 -22
  213. msprobe/pytorch/debugger/debugger_config.py +46 -27
  214. msprobe/pytorch/debugger/precision_debugger.py +42 -12
  215. msprobe/pytorch/dump/kernel_dump/kernel_config.py +33 -0
  216. msprobe/pytorch/dump/module_dump/module_dump.py +86 -0
  217. msprobe/pytorch/{module_processer.py → dump/module_dump/module_processer.py} +81 -10
  218. msprobe/pytorch/free_benchmark/common/constant.py +15 -0
  219. msprobe/pytorch/free_benchmark/common/counter.py +15 -0
  220. msprobe/pytorch/free_benchmark/common/enums.py +15 -0
  221. msprobe/pytorch/free_benchmark/common/params.py +10 -2
  222. msprobe/pytorch/free_benchmark/common/utils.py +29 -4
  223. msprobe/pytorch/free_benchmark/compare/grad_saver.py +20 -5
  224. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +2 -0
  225. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -1
  226. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +6 -4
  227. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +2 -0
  228. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +4 -0
  229. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +41 -47
  230. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +6 -5
  231. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -4
  232. msprobe/pytorch/grad_probe/grad_monitor.py +23 -6
  233. msprobe/pytorch/grad_probe/grad_stat_csv.py +40 -10
  234. msprobe/pytorch/hook_module/__init__.py +1 -1
  235. msprobe/pytorch/hook_module/hook_module.py +14 -11
  236. msprobe/pytorch/hook_module/register_optimizer_hook.py +59 -0
  237. msprobe/pytorch/hook_module/support_wrap_ops.yaml +35 -0
  238. msprobe/pytorch/hook_module/wrap_distributed.py +6 -8
  239. msprobe/pytorch/hook_module/wrap_functional.py +0 -38
  240. msprobe/pytorch/monitor/__init__.py +0 -0
  241. msprobe/pytorch/monitor/anomaly_analyse.py +201 -0
  242. msprobe/pytorch/monitor/anomaly_detect.py +425 -0
  243. msprobe/pytorch/monitor/csv2tb.py +166 -0
  244. msprobe/pytorch/monitor/distributed/__init__.py +0 -0
  245. msprobe/pytorch/monitor/distributed/distributed_ops.yaml +19 -0
  246. msprobe/pytorch/monitor/distributed/stack_blacklist.yaml +5 -0
  247. msprobe/pytorch/monitor/distributed/wrap_distributed.py +283 -0
  248. msprobe/pytorch/monitor/features.py +108 -0
  249. msprobe/pytorch/monitor/module_hook.py +1076 -0
  250. msprobe/pytorch/monitor/module_metric.py +172 -0
  251. msprobe/pytorch/monitor/module_spec_verifier.py +95 -0
  252. msprobe/pytorch/monitor/optimizer_collect.py +333 -0
  253. msprobe/pytorch/monitor/unittest/__init__.py +0 -0
  254. msprobe/pytorch/monitor/unittest/test_monitor.py +160 -0
  255. msprobe/pytorch/monitor/utils.py +321 -0
  256. msprobe/pytorch/monitor/visualizer.py +59 -0
  257. msprobe/pytorch/online_dispatch/__init__.py +2 -3
  258. msprobe/pytorch/online_dispatch/compare.py +29 -38
  259. msprobe/pytorch/online_dispatch/dispatch.py +58 -27
  260. msprobe/pytorch/online_dispatch/dump_compare.py +21 -9
  261. msprobe/pytorch/online_dispatch/single_compare.py +53 -32
  262. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +1 -1
  263. msprobe/pytorch/online_dispatch/utils.py +49 -21
  264. msprobe/pytorch/parse_tool/lib/compare.py +21 -27
  265. msprobe/pytorch/parse_tool/lib/config.py +6 -8
  266. msprobe/pytorch/parse_tool/lib/file_desc.py +15 -1
  267. msprobe/pytorch/parse_tool/lib/interactive_cli.py +10 -10
  268. msprobe/pytorch/parse_tool/lib/parse_exception.py +7 -7
  269. msprobe/pytorch/parse_tool/lib/parse_tool.py +12 -12
  270. msprobe/pytorch/parse_tool/lib/utils.py +33 -53
  271. msprobe/pytorch/parse_tool/lib/visualization.py +11 -10
  272. msprobe/pytorch/pt_config.py +31 -8
  273. msprobe/pytorch/service.py +188 -108
  274. msprobe/visualization/__init__.py +14 -0
  275. msprobe/visualization/builder/__init__.py +14 -0
  276. msprobe/visualization/builder/graph_builder.py +222 -0
  277. msprobe/visualization/builder/msprobe_adapter.py +227 -0
  278. msprobe/visualization/compare/__init__.py +14 -0
  279. msprobe/visualization/compare/graph_comparator.py +180 -0
  280. msprobe/visualization/compare/mode_adapter.py +197 -0
  281. msprobe/visualization/graph/__init__.py +14 -0
  282. msprobe/visualization/graph/base_node.py +119 -0
  283. msprobe/visualization/graph/distributed_analyzer.py +318 -0
  284. msprobe/visualization/graph/graph.py +209 -0
  285. msprobe/visualization/graph/node_colors.py +95 -0
  286. msprobe/visualization/graph/node_op.py +39 -0
  287. msprobe/visualization/graph_service.py +288 -0
  288. msprobe/visualization/utils.py +217 -0
  289. mindstudio_probe-1.1.0.dist-info/RECORD +0 -287
  290. msprobe/docs/04.acl_config_examples.md +0 -78
  291. msprobe/mindspore/compare/layer_mapping.py +0 -146
  292. msprobe/mindspore/compare/modify_mapping.py +0 -107
  293. msprobe/mindspore/free_benchmark/decorator/dec_forward.py +0 -57
  294. msprobe/mindspore/free_benchmark/decorator/decorator_factory.py +0 -122
  295. msprobe/pytorch/functional/module_dump.py +0 -84
  296. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/LICENSE +0 -0
  297. {mindstudio_probe-1.1.0.dist-info → mindstudio_probe-1.2.1.dist-info}/top_level.txt +0 -0
  298. /msprobe/mindspore/{free_benchmark/decorator → code_mapping}/__init__.py +0 -0
  299. /msprobe/pytorch/{functional → dump/module_dump}/__init__.py +0 -0
@@ -62,7 +62,12 @@ class Advisor:
62
62
  .format(item[CompareConst.NPU_NAME]))
63
63
 
64
64
  def gen_advisor_result(self, pd_data):
65
- first_failing_data = pd_data.iloc[0]
65
+ try:
66
+ first_failing_data = pd_data.iloc[0]
67
+ except IndexError as e:
68
+ err_msg = "index out of bounds error occurs, pd_data is empty, please check!"
69
+ logger.error(err_msg)
70
+ raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e
66
71
  node_name = first_failing_data[CompareConst.NPU_NAME]
67
72
  index = first_failing_data['index']
68
73
  message = self.gen_advisor_message(node_name)
@@ -87,7 +92,7 @@ class Advisor:
87
92
  return message
88
93
 
89
94
  def analysis(self):
90
- self._check_path_vaild()
95
+ self._check_path_valid()
91
96
  analyze_data = self._parse_input_data()
92
97
  logger.info("Start analyzing the comparison result: %s" % self.file_type)
93
98
  self.analyze_unmatched(analyze_data)
@@ -119,6 +124,6 @@ class Advisor:
119
124
  df = self.input_data.reset_index()
120
125
  return df
121
126
 
122
- def _check_path_vaild(self):
127
+ def _check_path_valid(self):
123
128
  out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE)
124
129
  out_path_checker.common_check()
@@ -1,3 +1,18 @@
1
+ # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
1
16
  import os
2
17
  import stat
3
18
 
@@ -10,9 +25,11 @@ class Const:
10
25
  """
11
26
  TOOL_NAME = "msprobe"
12
27
 
28
+ ipv4_pattern = "([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])(\.([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])){3}$"
13
29
  SEP = "."
14
30
  REGEX_PREFIX_MAX_LENGTH = 20
15
31
  REGEX_PREFIX_PATTERN = r"^[a-zA-Z0-9_-]+$"
32
+ REGEX_FORWARD_BACKWARD = r'\.(forward|backward)\.'
16
33
  FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$'
17
34
  STRING_BLACKLIST = r"^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]"
18
35
  COMMA = ","
@@ -20,6 +37,8 @@ class Const:
20
37
  OFF = 'OFF'
21
38
  BACKWARD = 'backward'
22
39
  FORWARD = 'forward'
40
+ PROGRESS_TIMEOUT = 3000
41
+ EXCEPTION_NONE = None
23
42
  JIT = 'Jit'
24
43
  PRIMITIVE_PREFIX = 'Primitive'
25
44
  DEFAULT_LIST = []
@@ -47,6 +66,7 @@ class Const:
47
66
  ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF]
48
67
  SUMMARY = "summary"
49
68
  MD5 = "md5"
69
+ VALUE = "value"
50
70
  SUMMARY_MODE = [ALL, SUMMARY, MD5]
51
71
 
52
72
  WRITE_FLAGS = os.O_WRONLY | os.O_CREAT
@@ -55,6 +75,7 @@ class Const:
55
75
 
56
76
  PKL_SUFFIX = ".pkl"
57
77
  NUMPY_SUFFIX = ".npy"
78
+ NUMPY_PATTERN = "*.npy"
58
79
  PT_SUFFIX = ".pt"
59
80
  ONE_GB = 1073741824 # 1 * 1024 * 1024 * 1024
60
81
  TEN_GB = 10737418240 # 10 * 1024 * 1024 * 1024
@@ -69,6 +90,8 @@ class Const:
69
90
  INPUT_KWARGS = 'input_kwargs'
70
91
  GRAD_INPUT = 'grad_input'
71
92
  GRAD_OUTPUT = 'grad_output'
93
+ PARAMS = 'parameters'
94
+ PARAMS_GRAD = 'parameters_grad'
72
95
  START = "start"
73
96
  STOP = "stop"
74
97
  ENV_ENABLE = "1"
@@ -82,6 +105,7 @@ class Const:
82
105
  GRAD_PROBE = "grad_probe"
83
106
  TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE]
84
107
  DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR]
108
+ DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD]
85
109
  LEVEL_L0 = "L0"
86
110
  LEVEL_L1 = "L1"
87
111
  LEVEL_L2 = "L2"
@@ -93,6 +117,8 @@ class Const:
93
117
  DATA = "data"
94
118
  PT_FRAMEWORK = "pytorch"
95
119
  MS_FRAMEWORK = "mindspore"
120
+ MT_FRAMEWORK = "mindtorch"
121
+ UNKNOWN_FRAMEWORK = "unknown"
96
122
  DIRECTORY_LENGTH = 4096
97
123
  FILE_NAME_LENGTH = 255
98
124
  FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble, np.float32, np.float16]
@@ -102,7 +128,12 @@ class Const:
102
128
  NPU_LOWERCASE = 'npu'
103
129
  CPU_LOWERCASE = 'cpu'
104
130
  CUDA_LOWERCASE = 'cuda'
131
+ DEVICE = 'device'
105
132
  DISTRIBUTED = 'Distributed'
133
+ DUMP_PREFIX = ["Distributed", "Functional", "Torch", "Tensor", "Mint", "MintFunctional", "Primitive",
134
+ "Aten", "VF", "NPU", "Jit"]
135
+ MODULE_PREFIX = ["Module", "Cell"]
136
+ FORWARD_NAME_SUFFIX = ".forward"
106
137
 
107
138
  # struct json param
108
139
  ORIGIN_DATA = "origin_data"
@@ -113,21 +144,28 @@ class Const:
113
144
  MODULE_WHITE_LIST = ["torch", "numpy"]
114
145
 
115
146
  FUNC_SKIP_LIST = ["construct", "__call__"]
116
-
117
- FILE_SKIP_LIST = ["site-packages/mindspore", "package/mindspore", "msprobe", "site-packages/torch", "package/torch"]
147
+ FILE_SKIP_LIST = ["msprobe", "MindSpeed"]
148
+ DATA_TYPE_SKIP_LIST = ["Primitive", "Jit"]
118
149
 
119
150
  STACK_FILE_INDEX = 0
120
-
121
151
  STACK_FUNC_INDEX = 2
122
-
123
152
  STACK_FUNC_ELE_INDEX = 1
124
153
 
125
- CONSTRUCT_NAME_INDEX = -3
126
-
127
- NAME_FIRST_POSSIBLE_INDEX = -4
128
-
129
- NAME_SECOND_POSSIBLE_INDEX = -5
130
-
154
+ SCOPE_ID_INDEX = -1
155
+ SCOPE_DIRECTION_INDEX = -2
156
+ TYPE_NAME_INDEX = -3
157
+ PARAMS_GRAD_TYPE_NAME_INDEX = -2
158
+ LAYER_NAME_INDEX = -4
159
+ PARAMS_GRAD_NAME_INDEX = -3
160
+ API_TYPE_INDEX = 0
161
+ LEFT_MOVE_INDEX = -1
162
+ RIGHT_MOVE_INDEX = 1
163
+ LAST_INDEX = -1
164
+
165
+ TOP_LAYER = "TopLayer"
166
+ CELL = "Cell"
167
+ MODULE = "Module"
168
+ FRAME_FILE_LIST = ["site-packages/torch", "package/torch", "site-packages/mindspore", "package/mindspore"]
131
169
  INPLACE_LIST = [
132
170
  "broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter",
133
171
  "_reduce_scatter_base", "_all_gather_base", "send", "recv", "irecv", "isend", "all_to_all_single", "all_to_all",
@@ -136,22 +174,29 @@ class Const:
136
174
 
137
175
  CONVERT = {
138
176
  "int32_to_int64": ["torch.int32", "torch.int64"],
177
+ "int64_to_fp32": ["torch.int64", "torch.float32"]
139
178
  }
140
179
 
141
180
  CONVERT_API = {
142
- "int32_to_int64": ["cross_entropy"]
181
+ "int32_to_int64": ["cross_entropy"],
182
+ "int64_to_fp32": ["histc"]
143
183
  }
144
184
 
185
+ FA_SPECIAL_SPARSE_MODE = [2, 3, 4]
186
+
145
187
  FILL_CHAR_NUMS = 50
146
188
  TOOL_ENDS_SUCCESSFULLY = f"{TOOL_NAME} ends successfully."
147
189
  WITHOUT_CALL_STACK = "The call stack retrieval failed."
148
-
190
+
149
191
  STEP = "step"
150
192
  RANK = "rank"
151
193
  HYPHEN = "-"
152
- STEP_RANK_MAXIMUM_RANGE = [int(0), int(1e6)]
194
+ STEP_RANK_MINIMUM_VALUE = 0
195
+ STEP_RANK_MAXIMUM_VALUE = int(1e6)
153
196
 
154
197
  # data type const
198
+ TORCH_INT_DTYPE = ["torch.int8", "torch.int32", "torch.int64"]
199
+ TORCH_FLOAT_DTYPE = ["torch.bfloat16", "torch.float16", "torch.float32", "torch.float64"]
155
200
  FLOAT16 = "Float16"
156
201
  FLOAT32 = "Float32"
157
202
  BFLOAT16 = "BFloat16"
@@ -159,6 +204,30 @@ class Const:
159
204
  TORCH_FLOAT32 = "torch.float32"
160
205
  TORCH_BFLOAT16 = "torch.bfloat16"
161
206
 
207
+ DTYPE = 'dtype'
208
+ SHAPE = 'shape'
209
+ MAX = 'Max'
210
+ MIN = 'Min'
211
+ MEAN = 'Mean'
212
+ NORM = 'Norm'
213
+
214
+ CODE_STACK = 'Code Stack'
215
+ OP_NAME = 'Op Name'
216
+ SCOPE_NAME = 'Scope Name'
217
+ CODE_STACKS = 'Code Stacks'
218
+ FILE_PATH = 'File Path'
219
+ NEW_LINE = '\n'
220
+ CSV_NEWLINE_SEPARATOR = ',\n'
221
+ # 分隔符常量
222
+ SCOPE_SEPARATOR = "/"
223
+ REPLACEMENT_CHARACTER = "_"
224
+
225
+ OPTIMIZER = "optimizer"
226
+ CLIP_GRAD = "clip_grad"
227
+ END_PREFIX = "end_"
228
+
229
+ TENSOR_STAT_LEN = 2
230
+
162
231
 
163
232
  class CompareConst:
164
233
  """
@@ -201,10 +270,62 @@ class CompareConst:
201
270
  RESULT = "Result"
202
271
  MAGNITUDE = 0.5
203
272
  OP_NAME = "op_name"
273
+ STRUCT = "struct"
204
274
  INPUT_STRUCT = "input_struct"
275
+ KWARGS_STRUCT = "kwargs_struct"
205
276
  OUTPUT_STRUCT = "output_struct"
277
+ PARAMS_STRUCT = "params_struct"
278
+ PARAMS_GRAD_STRUCT = "params_grad_struct"
206
279
  SUMMARY = "summary"
280
+ COMPARE_RESULT = "compare_result"
281
+ COMPARE_MESSAGE = "compare_message"
207
282
  MAX_EXCEL_LENGTH = 1048576
283
+ YES = "Yes"
284
+ NO = "No"
285
+ STATISTICS_INDICATOR_NUM = 4
286
+ EPSILON = 1e-10
287
+ COMPARE_ENDS_SUCCESSFULLY = "msprobe compare ends successfully."
288
+ DEFAULT_RATIO_VALUE = 10000
289
+ THOUSANDTH_PASS_VALUE = 0.999
290
+ ZERO_SHAPE = '(0,)'
291
+
292
+ BENCHMARK_COMPARE_ALGORITHM_NAME = "标杆比对法"
293
+ ULP_COMPARE_ALGORITHM_NAME = "ULP误差比对法"
294
+ BINARY_CONSISTENCY_ALGORITHM_NAME = "二进制一致法"
295
+ ABSOLUTE_THRESHOLD_ALGORITHM_NAME = "绝对阈值法"
296
+ THOUSANDTH_STANDARD_ALGORITHM_NAME = "双千指标法"
297
+ ACCUMULATIVE_ERROR_COMPARE_ALGORITHM_NAME = "累积误差比对法"
298
+
299
+ ABSOLUTE_THRESHOLD = 'absolute_threshold'
300
+ BINARY_CONSISTENCY = 'binary_consistency'
301
+ ULP_COMPARE = 'ulp_compare'
302
+ THOUSANDTH_STANDARD = 'thousandth_threshold'
303
+ BENCHMARK = 'benchmark'
304
+ ACCUMULATIVE_ERROR_COMPARE = 'accumulative_error_compare'
305
+
306
+ SMALL_VALUE_ERR_RATIO = "small_value_err_ratio"
307
+ RMSE_RATIO = "rmse_ratio"
308
+ MAX_REL_ERR_RATIO = "max_rel_err_ratio"
309
+ MEAN_REL_ERR_RATIO = "mean_rel_err_ratio"
310
+ EB_RATIO = "eb_ratio"
311
+
312
+ SMALL_VALUE = "small_value"
313
+ RMSE = "rmse"
314
+ MAX_REL_ERR = "max_rel_err"
315
+ MEAN_REL_ERR = "mean_rel_err"
316
+ EB = "eb"
317
+
318
+ SMALL_VALUE_ERR_STATUS = "small_value_err_status"
319
+ RMSE_STATUS = "rmse_status"
320
+ MAX_REL_ERR_STATUS = "max_rel_err_status"
321
+ MEAN_REL_ERR_STATUS = "mean_rel_err_status"
322
+ EB_STATUS = "eb_status"
323
+
324
+ MEAN_ULP_ERR = "mean_ulp_err"
325
+ ULP_ERR_PROPORTION = "ulp_err_proportion"
326
+ ULP_ERR_PROPORTION_RATIO = "ulp_err_proportion_ratio"
327
+
328
+ ULP_ERR_STATUS = "ulp_err_status"
208
329
 
209
330
  COMPARE_RESULT_HEADER = [
210
331
  NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR,
@@ -222,6 +343,57 @@ class CompareConst:
222
343
  NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, NPU_MD5, BENCH_MD5, RESULT
223
344
  ]
224
345
 
346
+ COMPARE_RESULT_HEADER_STACK = COMPARE_RESULT_HEADER + [STACK]
347
+
348
+ SUMMARY_COMPARE_RESULT_HEADER_STACK = SUMMARY_COMPARE_RESULT_HEADER + [STACK]
349
+
350
+ MD5_COMPARE_RESULT_HEADER_STACK = MD5_COMPARE_RESULT_HEADER + [STACK]
351
+
352
+ HEAD_OF_COMPARE_MODE = {
353
+ Const.ALL: COMPARE_RESULT_HEADER,
354
+ Const.SUMMARY: SUMMARY_COMPARE_RESULT_HEADER,
355
+ Const.MD5: MD5_COMPARE_RESULT_HEADER
356
+ }
357
+
358
+ ALL_COMPARE_INDEX = [COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO]
359
+ SUMMARY_COMPARE_INDEX = [MAX_DIFF, MIN_DIFF, MEAN_DIFF, NORM_DIFF,
360
+ MAX_RELATIVE_ERR, MIN_RELATIVE_ERR, MEAN_RELATIVE_ERR, NORM_RELATIVE_ERR]
361
+
362
+ # dtype match
363
+ MS_TYPE = [
364
+ [Const.FLOAT16, Const.FLOAT32], [Const.FLOAT32, Const.FLOAT16],
365
+ [Const.FLOAT16, Const.BFLOAT16], [Const.BFLOAT16, Const.FLOAT16]
366
+ ]
367
+ TORCH_TYPE = [
368
+ [Const.TORCH_FLOAT16, Const.TORCH_FLOAT32], [Const.TORCH_FLOAT32, Const.TORCH_FLOAT16],
369
+ [Const.TORCH_FLOAT16, Const.TORCH_BFLOAT16], [Const.TORCH_BFLOAT16, Const.TORCH_FLOAT16]
370
+ ]
371
+
372
+ # read_op
373
+ IO_NAME_MAPPING = {
374
+ Const.INPUT_ARGS: '.input',
375
+ Const.INPUT_KWARGS: '.input',
376
+ Const.INPUT: '.input',
377
+ Const.OUTPUT: '.output',
378
+ Const.PARAMS: '.parameters'
379
+ }
380
+
381
+ # state to struct mapping
382
+ STATE_TO_STRUCT_MAPPING = {
383
+ Const.INPUT: INPUT_STRUCT,
384
+ Const.KWARGS: INPUT_STRUCT,
385
+ Const.OUTPUT: OUTPUT_STRUCT,
386
+ Const.PARAMS: PARAMS_STRUCT,
387
+ Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT
388
+ }
389
+
390
+ STRUCT_COMPARE_KEY = [
391
+ INPUT_STRUCT,
392
+ OUTPUT_STRUCT,
393
+ PARAMS_STRUCT,
394
+ PARAMS_GRAD_STRUCT
395
+ ]
396
+
225
397
  # compare standard
226
398
  HUNDRED_RATIO_THRESHOLD = 0.01
227
399
  THOUSAND_RATIO_THRESHOLD = 0.001
@@ -241,6 +413,8 @@ class CompareConst:
241
413
  PASS = 'pass'
242
414
  WARNING = 'Warning'
243
415
  ERROR = 'error'
416
+ TRUE = 'TRUE'
417
+ FALSE = 'FALSE'
244
418
  SKIP = 'SKIP'
245
419
  N_A = 'N/A'
246
420
  INF = 'inf'
@@ -298,6 +472,15 @@ class CompareConst:
298
472
  MAX_DIFF: None, MIN_DIFF: None, MEAN_DIFF: None, NORM_DIFF: None, MAX_RELATIVE_ERR: None,
299
473
  MIN_RELATIVE_ERR: None, MEAN_RELATIVE_ERR: None, NORM_RELATIVE_ERR: None
300
474
  }
475
+ INPUT_PATTERN = Const.SEP + Const.INPUT + Const.SEP
476
+ KWARGS_PATTERN = Const.SEP + Const.KWARGS + Const.SEP
477
+ OUTPUT_PATTERN = Const.SEP + Const.OUTPUT + Const.SEP
478
+ PARAMS_PATTERN = Const.SEP + Const.PARAMS + Const.SEP
479
+ PARAMS_GRAD_PATTERN = Const.SEP + Const.PARAMS_GRAD + Const.SEP
480
+ COMPARE_KEY = 'compare_key'
481
+ COMPARE_SHAPE = 'compare_shape'
482
+ INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml'
483
+ UNREADABLE = 'unreadable data'
301
484
 
302
485
 
303
486
  class FileCheckConst:
@@ -316,13 +499,17 @@ class FileCheckConst:
316
499
  JSON_SUFFIX = ".json"
317
500
  PT_SUFFIX = ".pt"
318
501
  CSV_SUFFIX = ".csv"
502
+ XLSX_SUFFIX = ".xlsx"
319
503
  YAML_SUFFIX = ".yaml"
504
+ IR_SUFFIX = ".ir"
320
505
  MAX_PKL_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
321
506
  MAX_NUMPY_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
322
507
  MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
323
508
  MAX_PT_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024
324
509
  MAX_CSV_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
325
- MAX_YAML_SIZE = 1048576 # 1 * 1024 * 1024
510
+ MAX_XLSX_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
511
+ MAX_YAML_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
512
+ MAX_IR_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024
326
513
  COMMOM_FILE_SIZE = 1048576 # 1 * 1024 * 1024
327
514
  DIR = "dir"
328
515
  FILE = "file"
@@ -334,7 +521,9 @@ class FileCheckConst:
334
521
  JSON_SUFFIX: MAX_JSON_SIZE,
335
522
  PT_SUFFIX: MAX_PT_SIZE,
336
523
  CSV_SUFFIX: MAX_CSV_SIZE,
337
- YAML_SUFFIX: MAX_YAML_SIZE
524
+ XLSX_SUFFIX: MAX_XLSX_SIZE,
525
+ YAML_SUFFIX: MAX_YAML_SIZE,
526
+ IR_SUFFIX: MAX_IR_SIZE
338
527
  }
339
528
  CSV_BLACK_LIST = r'^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]'
340
529
 
@@ -351,6 +540,9 @@ class MsCompareConst:
351
540
  # api_info field
352
541
  MINT = "Mint"
353
542
  MINT_FUNCTIONAL = "MintFunctional"
543
+ TENSOR_API = "Tensor"
544
+
545
+ API_NAME_STR_LENGTH = 4
354
546
 
355
547
  TASK_FIELD = "task"
356
548
  STATISTICS_TASK = "statistics"
@@ -358,6 +550,10 @@ class MsCompareConst:
358
550
  DUMP_DATA_DIR_FIELD = "dump_data_dir"
359
551
  DATA_FIELD = "data"
360
552
 
553
+ # supported api yaml
554
+ SUPPORTED_API_LIST_FILE = "checker_support_api.yaml"
555
+ SUPPORTED_TENSOR_LIST_KEY = "tensor"
556
+
361
557
  # detail_csv
362
558
  DETAIL_CSV_API_NAME = "API Name"
363
559
  DETAIL_CSV_BENCH_DTYPE = "Bench Dtype"
@@ -374,6 +570,11 @@ class MsCompareConst:
374
570
 
375
571
  EPSILON = 1e-8
376
572
 
573
class ProcessStatus:
    """String status codes: success, API not found, or skipped on exception.

    NOTE(review): the diff hunk header places this class inside
    ``MsCompareConst``; re-indent one level when splicing it back into
    that class body. What exactly is being "processed" is not visible
    here -- confirm against callers.
    """
    SUCCESS = "success"
    API_NOT_FOUND = "api_not_found"
    EXCEPTION_SKIP = "exception_skip"
577
+
377
578
 
378
579
  class MsgConst:
379
580
  """
@@ -382,15 +583,20 @@ class MsgConst:
382
583
  MSPROBE_LOG_LEVEL = "MSPROBE_LOG_LEVEL"
383
584
  LOG_LEVEL_ENUM = ["0", "1", "2", "3", "4"]
384
585
  LOG_LEVEL = ["DEBUG", "INFO", "WARNING", "ERROR"]
586
+
385
587
class LogLevel:
    """Numeric log levels, each exposed as a small class with a ``value``.

    The integers 0..3 follow the same order as the names DEBUG, INFO,
    WARNING, ERROR.

    NOTE(review): the diff hunk header places this class inside
    ``MsgConst``; re-indent one level when splicing it back into that
    class body.
    """

    class DEBUG:
        value = 0

    class INFO:
        value = 1

    class WARNING:
        value = 2

    class ERROR:
        value = 3
599
+
394
600
  SPECIAL_CHAR = ["\n", "\r", "\u007F", "\b", "\f", "\t", "\u000B", "%08", "%0a", "%0b", "%0c", "%0d", "%7f"]
395
601
 
396
602
  NOT_CREATED_INSTANCE = "PrecisionDebugger instance is not created."
@@ -400,3 +606,46 @@ class GraphMode:
400
606
  NPY_MODE = "NPY_MODE"
401
607
  STATISTIC_MODE = "STATISTIC_MODE"
402
608
  ERROR_MODE = "ERROR_MODE"
609
+
610
+
611
class MonitorConst:
    """
    Class for monitor const
    """
    # Names of the statistics ops known to the monitor.
    OP_LIST = ["norm", "min", "max", "zeros", "nans", "id", "mean"]
    # presumably the name of an environment variable that overrides the
    # output directory -- confirm against the code that reads it
    MONITOR_OUTPUT_DIR = "MONITOR_OUTPUT_DIR"
    DEFAULT_MONITOR_OUTPUT_DIR = "./monitor_output"
    DATABASE = "database"
    EMAIL = "email"
    # Optimizer class names, kept as plain strings.
    OPT_TY = ['Megatron_DistributedOptimizer', 'Megatron_Float16OptimizerWithFloat16Params']
    DEEPSPEED_OPT_TY = (
        "DeepSpeedZeroOptimizer_Stage0",
        "DeepSpeedZeroOptimizer_Stage1_or_2",
        "DeepSpeedZeroOptimizer_Stage3"
    )
    RULE_NAME = ['AnomalyTurbulence']

    SLICE_SIZE = 20480
    DOT = "."
    VPP_SEP = ":"
    ACTV_IN = "input"
    ACTV_OUT = "output"
    ACTVGRAD_IN = "input_grad"
    ACTVGRAD_OUT = "output_grad"
    POST_GRAD = "post_grad"
    PRE_GRAD = "pre_grad"
    ACC_GRAD = "acc_grad"
    PREFIX_POST = "post"
    PREFIX_PRE = "pre"
    # Regex with two groups: up to 20 word/hyphen characters, then
    # "-rank" followed by 1 to 5 digits and a trailing hyphen.
    OUTPUT_DIR_PATTERN = r"([\w-]{0,20})-rank(\d{1,5})-"

    EXP_AVG = "exp_avg"
    # NOTE(review): both the name and the value look like a misspelling of
    # EXP_AVG_SQ / "exp_avg_sq" (compare EXP_AVG above). Left unchanged
    # because callers outside this block may rely on either -- confirm
    # before renaming.
    EFXP_AVG_SQ = "efxp_avg_sq"

    ANOMALY_JSON = "anomaly.json"
    ANALYSE_JSON = "anomaly_analyse.json"
    TENSORBOARD = "tensorboard"
    CSV = "csv"
    API = "api"
    OPS_START_INDEX = 3
    HEADER_NAME_INDEX = 1
@@ -1,3 +1,18 @@
1
+ # Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
1
16
  class CodedException(Exception):
2
17
  def __init__(self, code, error_info=''):
3
18
  super().__init__()
@@ -11,10 +26,14 @@ class CodedException(Exception):
11
26
class MsprobeException(CodedException):
    """General msprobe error identified by an integer code.

    ``err_strs`` maps each code to a message prefix; how the prefix is
    combined with extra error info is decided by ``CodedException``.
    """
    INVALID_PARAM_ERROR = 0
    OVERFLOW_NUMS_ERROR = 1
    RECURSION_LIMIT_ERROR = 2
    INTERFACE_USAGE_ERROR = 3

    err_strs = {
        # "invalid parameter:"
        INVALID_PARAM_ERROR: "[msprobe] 无效参数:",
        # "preset overflow count exceeded, current overflow count:"
        OVERFLOW_NUMS_ERROR: "[msprobe] 超过预设溢出次数 当前溢出次数:",
        # "recursive call exceeded the limit:"
        RECURSION_LIMIT_ERROR: "[msprobe] 递归调用超过限制:",
        INTERFACE_USAGE_ERROR: "[msprobe] Invalid interface usage: "
    }
19
38
 
20
39
 
@@ -41,7 +60,7 @@ class ParseJsonException(CodedException):
41
60
  InvalidDumpJson = 1
42
61
  err_strs = {
43
62
  UnexpectedNameStruct: "[msprobe] Unexpected name in json: ",
44
- InvalidDumpJson: "[msprobe] json格式不正确: ",
63
+ InvalidDumpJson: "[msprobe] Invalid dump.json format: ",
45
64
  }
46
65
 
47
66
 
@@ -73,9 +92,13 @@ class StepException(CodedException):
73
92
class FreeBenchmarkException(CodedException):
    """Free-benchmark error identified by an integer code.

    ``err_strs`` maps each code to the message prefix for that failure.
    """
    UnsupportedType = 0
    InvalidGrad = 1
    InvalidPerturbedOutput = 2
    OutputIndexError = 3
    err_strs = {
        UnsupportedType: "[msprobe] Free benchmark get unsupported type: ",
        InvalidGrad: "[msprobe] Free benchmark gradient invalid: ",
        InvalidPerturbedOutput: "[msprobe] Free benchmark invalid perturbed output: ",
        OutputIndexError: "[msprobe] Free benchmark output index out of bounds: ",
    }
80
103
 
81
104
 
@@ -87,6 +110,7 @@ class DistributedNotInitializedError(Exception):
87
110
  def __str__(self):
88
111
  return self.msg
89
112
 
113
+
90
114
  class ApiAccuracyCheckerException(CodedException):
91
115
  ParseJsonFailed = 0
92
116
  UnsupportType = 1
@@ -97,4 +121,4 @@ class ApiAccuracyCheckerException(CodedException):
97
121
  UnsupportType: "[msprobe] Api Accuracy Checker get unsupported type: ",
98
122
  WrongValue: "[msprobe] Api Accuracy Checker get wrong value: ",
99
123
  ApiWrong: "[msprobe] Api Accuracy Checker something wrong with api: ",
100
- }
124
+ }