mindstudio-probe 8.3.2__py3-none-any.whl → 26.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (689) hide show
  1. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/METADATA +26 -14
  2. mindstudio_probe-26.0.0a1.dist-info/RECORD +498 -0
  3. {mindstudio_probe-8.3.2.dist-info → mindstudio_probe-26.0.0a1.dist-info}/WHEEL +1 -1
  4. mindstudio_probe-26.0.0a1.dist-info/entry_points.txt +5 -0
  5. mindstudio_probe-26.0.0a1.dist-info/licenses/LICENSE +124 -0
  6. mindstudio_probe-26.0.0a1.dist-info/top_level.txt +2 -0
  7. msprobe/__init__.py +12 -13
  8. msprobe/config.json +9 -31
  9. msprobe/core/__init__.py +12 -11
  10. msprobe/core/acc_check/acc_check_cli.py +145 -0
  11. msprobe/core/common/const.py +97 -38
  12. msprobe/core/common/db_manager.py +133 -12
  13. msprobe/core/common/decorator.py +12 -11
  14. msprobe/core/common/exceptions.py +12 -11
  15. msprobe/core/common/file_utils.py +101 -25
  16. msprobe/core/common/framework_adapter.py +36 -25
  17. msprobe/core/common/global_lock.py +12 -11
  18. msprobe/core/common/inplace_op_checker.py +12 -11
  19. msprobe/core/common/log.py +22 -11
  20. msprobe/core/common/megatron_utils.py +566 -11
  21. msprobe/core/common/parallel_state.py +12 -11
  22. msprobe/core/common/runtime.py +12 -11
  23. msprobe/core/common/utils.py +41 -41
  24. msprobe/core/compare/acc_compare.py +361 -104
  25. msprobe/core/compare/atb_data_compare.py +422 -0
  26. msprobe/core/compare/auto_compare.py +134 -0
  27. msprobe/core/compare/check.py +14 -17
  28. msprobe/core/compare/compare_cli.py +72 -149
  29. msprobe/core/compare/config.py +12 -13
  30. msprobe/core/compare/diff_analyze/first_diff_analyze.py +28 -15
  31. msprobe/core/compare/diff_analyze/ignore_op_list.yaml +3 -0
  32. msprobe/core/compare/find_first/analyzer.py +18 -18
  33. msprobe/core/compare/find_first/graph.py +12 -11
  34. msprobe/core/compare/find_first/utils.py +13 -12
  35. msprobe/core/compare/indicator_analysis/__init__.py +15 -0
  36. msprobe/core/compare/indicator_analysis/algorithm.py +363 -0
  37. msprobe/core/compare/indicator_analysis/api_data.py +141 -0
  38. msprobe/core/compare/indicator_analysis/calculator.py +181 -0
  39. msprobe/core/compare/indicator_analysis/utils.py +116 -0
  40. msprobe/core/compare/layer_mapping/__init__.py +12 -11
  41. msprobe/core/compare/layer_mapping/data_scope_parser.py +20 -11
  42. msprobe/core/compare/layer_mapping/layer_mapping.py +14 -13
  43. msprobe/core/compare/layer_mapping/postprocess_pass.py +13 -11
  44. msprobe/core/compare/merge_result/merge_result.py +12 -11
  45. msprobe/core/compare/merge_result/merge_result_cli.py +12 -11
  46. msprobe/core/compare/merge_result/utils.py +12 -11
  47. msprobe/core/compare/multiprocessing_compute.py +13 -14
  48. msprobe/core/compare/npy_compare.py +13 -11
  49. msprobe/core/compare/offline_data_compare.py +160 -0
  50. msprobe/core/compare/stats_diff_calc.py +39 -0
  51. msprobe/core/compare/torchair_acc_cmp.py +764 -0
  52. msprobe/core/compare/torchair_cmp_utils.py +338 -0
  53. msprobe/core/compare/utils.py +140 -49
  54. msprobe/core/config_check/__init__.py +12 -11
  55. msprobe/core/config_check/checkers/__init__.py +12 -11
  56. msprobe/core/config_check/checkers/base_checker.py +15 -14
  57. msprobe/core/config_check/checkers/dataset_checker.py +13 -12
  58. msprobe/core/config_check/checkers/env_args_checker.py +13 -12
  59. msprobe/core/config_check/checkers/hyperparameter_checker.py +16 -15
  60. msprobe/core/config_check/checkers/pip_checker.py +15 -15
  61. msprobe/core/config_check/checkers/random_checker.py +13 -12
  62. msprobe/core/config_check/checkers/weights_checker.py +14 -12
  63. msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +13 -17
  64. msprobe/core/config_check/ckpt_compare/megatron_loader.py +13 -12
  65. msprobe/core/config_check/ckpt_compare/metrics.py +12 -11
  66. msprobe/core/config_check/config_check_cli.py +18 -17
  67. msprobe/core/config_check/config_checker.py +16 -14
  68. msprobe/core/config_check/resource/dependency.yaml +15 -12
  69. msprobe/core/config_check/resource/env.yaml +12 -11
  70. msprobe/core/config_check/utils/hyperparameter_parser.py +12 -11
  71. msprobe/core/config_check/utils/utils.py +12 -11
  72. msprobe/core/{data_dump → dump/api_dump}/api_registry.py +12 -11
  73. msprobe/core/{common_config.py → dump/common_config.py} +13 -24
  74. msprobe/core/dump/data_dump/data_collector.py +257 -0
  75. msprobe/core/{data_dump → dump/data_dump}/data_processor/base.py +45 -36
  76. msprobe/core/{data_dump → dump/data_dump}/data_processor/factory.py +33 -25
  77. msprobe/core/{data_dump → dump/data_dump}/data_processor/mindspore_processor.py +37 -113
  78. msprobe/core/{data_dump → dump/data_dump}/data_processor/pytorch_processor.py +364 -131
  79. msprobe/core/{data_dump → dump/data_dump}/json_writer.py +24 -31
  80. msprobe/core/{data_dump → dump/data_dump}/scope.py +12 -13
  81. msprobe/core/{debugger → dump/debugger}/precision_debugger.py +15 -23
  82. msprobe/core/dump/dump2db/db_utils.py +215 -0
  83. msprobe/core/dump/dump2db/dump2db.py +409 -0
  84. msprobe/core/{hook_manager.py → dump/hook_manager.py} +38 -87
  85. msprobe/core/dump/kernel_dump/kernel_config.py +34 -0
  86. msprobe/core/{service.py → dump/service.py} +43 -27
  87. msprobe/core/install_deps/install_deps.py +51 -0
  88. msprobe/core/monitor/anomaly_processor.py +13 -11
  89. msprobe/core/monitor/csv2db.py +73 -93
  90. msprobe/core/monitor/db_utils.py +140 -205
  91. msprobe/core/monitor/utils.py +18 -17
  92. msprobe/core/monitor_v2/__init__.py +20 -0
  93. msprobe/core/monitor_v2/base.py +83 -0
  94. msprobe/core/monitor_v2/cc.py +287 -0
  95. msprobe/core/monitor_v2/factory.py +81 -0
  96. msprobe/core/monitor_v2/module.py +201 -0
  97. msprobe/core/monitor_v2/optimizer.py +245 -0
  98. msprobe/core/monitor_v2/param.py +154 -0
  99. msprobe/core/monitor_v2/trainer.py +326 -0
  100. msprobe/core/monitor_v2/utils.py +122 -0
  101. msprobe/core/monitor_v2/weight_grad.py +419 -0
  102. msprobe/core/monitor_v2/writer.py +162 -0
  103. msprobe/core/overflow_check/abnormal_scene.py +12 -11
  104. msprobe/core/overflow_check/api_info.py +12 -11
  105. msprobe/core/overflow_check/checker.py +12 -11
  106. msprobe/core/overflow_check/filter.py +13 -11
  107. msprobe/core/overflow_check/level.py +12 -11
  108. msprobe/core/overflow_check/utils.py +12 -11
  109. msprobe/core/single_save/single_comparator.py +12 -11
  110. msprobe/core/single_save/single_saver.py +12 -11
  111. msprobe/infer/__init__.py +16 -0
  112. msprobe/infer/offline/__init__.py +16 -0
  113. msprobe/infer/offline/compare/__init__.py +16 -0
  114. msprobe/infer/offline/compare/msquickcmp/__init__.py +16 -0
  115. msprobe/infer/offline/compare/msquickcmp/adapter_cli/__init__.py +16 -0
  116. msprobe/infer/offline/compare/msquickcmp/adapter_cli/args_adapter.py +46 -0
  117. msprobe/infer/offline/compare/msquickcmp/atc/__init__.py +16 -0
  118. msprobe/infer/offline/compare/msquickcmp/atc/atc_utils.py +98 -0
  119. msprobe/infer/offline/compare/msquickcmp/cmp_process.py +328 -0
  120. msprobe/infer/offline/compare/msquickcmp/common/__init__.py +16 -0
  121. msprobe/infer/offline/compare/msquickcmp/common/args_check.py +112 -0
  122. msprobe/infer/offline/compare/msquickcmp/common/convert.py +74 -0
  123. msprobe/infer/offline/compare/msquickcmp/common/dump_data.py +121 -0
  124. msprobe/infer/offline/compare/msquickcmp/common/dynamic_argument_bean.py +39 -0
  125. msprobe/infer/offline/compare/msquickcmp/common/utils.py +669 -0
  126. msprobe/infer/offline/compare/msquickcmp/config.ini +6 -0
  127. msprobe/infer/offline/compare/msquickcmp/dump/__init__.py +16 -0
  128. msprobe/infer/offline/compare/msquickcmp/dump/args_adapter.py +50 -0
  129. msprobe/infer/offline/compare/msquickcmp/dump/dump_process.py +91 -0
  130. msprobe/infer/offline/compare/msquickcmp/install_aclruntime_aisbench.sh +180 -0
  131. msprobe/infer/offline/compare/msquickcmp/main.py +199 -0
  132. msprobe/infer/offline/compare/msquickcmp/net_compare/__init__.py +16 -0
  133. msprobe/infer/offline/compare/msquickcmp/net_compare/net_compare.py +277 -0
  134. msprobe/infer/offline/compare/msquickcmp/npu/__init__.py +16 -0
  135. msprobe/infer/offline/compare/msquickcmp/npu/npu_dump_data.py +558 -0
  136. msprobe/infer/offline/compare/msquickcmp/npu/om_parser.py +416 -0
  137. msprobe/infer/offline/compare/msquickcmp/onnx_model/__init__.py +16 -0
  138. msprobe/infer/offline/compare/msquickcmp/onnx_model/onnx_dump_data.py +374 -0
  139. msprobe/infer/utils/__init__.py +15 -0
  140. msprobe/infer/utils/acc_cmp.py +94 -0
  141. msprobe/infer/utils/check/__init__.py +37 -0
  142. msprobe/infer/utils/check/args_checker.py +35 -0
  143. msprobe/infer/utils/check/checker.py +227 -0
  144. msprobe/infer/utils/check/dict_checker.py +78 -0
  145. msprobe/infer/utils/check/func_wrapper.py +96 -0
  146. msprobe/infer/utils/check/list_checker.py +56 -0
  147. msprobe/infer/utils/check/number_checker.py +64 -0
  148. msprobe/infer/utils/check/obj_checker.py +41 -0
  149. msprobe/infer/utils/check/path_checker.py +249 -0
  150. msprobe/infer/utils/check/rule.py +126 -0
  151. msprobe/infer/utils/check/string_checker.py +66 -0
  152. msprobe/infer/utils/cmp_algorithm.py +261 -0
  153. msprobe/infer/utils/constants.py +112 -0
  154. msprobe/infer/utils/file_open_check.py +337 -0
  155. msprobe/infer/utils/util.py +177 -0
  156. msprobe/mindspore/__init__.py +14 -13
  157. msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +14 -13
  158. msprobe/mindspore/api_accuracy_checker/api_info.py +12 -11
  159. msprobe/mindspore/api_accuracy_checker/api_runner.py +12 -11
  160. msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +12 -11
  161. msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +12 -11
  162. msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +12 -11
  163. msprobe/mindspore/api_accuracy_checker/checker_support_api.yaml +12 -11
  164. msprobe/mindspore/api_accuracy_checker/cmd_parser.py +15 -14
  165. msprobe/mindspore/api_accuracy_checker/compute_element.py +12 -11
  166. msprobe/mindspore/api_accuracy_checker/data_manager.py +13 -11
  167. msprobe/mindspore/api_accuracy_checker/main.py +12 -11
  168. msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +14 -12
  169. msprobe/mindspore/api_accuracy_checker/multi_data_manager.py +13 -11
  170. msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +12 -11
  171. msprobe/mindspore/api_accuracy_checker/type_mapping.py +12 -11
  172. msprobe/mindspore/api_accuracy_checker/utils.py +12 -11
  173. msprobe/mindspore/common/const.py +15 -74
  174. msprobe/mindspore/common/log.py +12 -11
  175. msprobe/mindspore/common/utils.py +30 -15
  176. msprobe/mindspore/compare/common_dir_compare.py +21 -23
  177. msprobe/mindspore/compare/distributed_compare.py +18 -16
  178. msprobe/mindspore/compare/ms_compare.py +14 -14
  179. msprobe/mindspore/compare/ms_graph_compare.py +26 -20
  180. msprobe/mindspore/compare/utils.py +14 -12
  181. msprobe/mindspore/{cell_processor.py → dump/cell_processor.py} +15 -14
  182. msprobe/mindspore/{debugger → dump/debugger}/debugger_config.py +12 -30
  183. msprobe/mindspore/{debugger → dump/debugger}/precision_debugger.py +43 -45
  184. msprobe/mindspore/dump/{cell_dump_process.py → dump_processor/cell_dump_process.py} +31 -17
  185. msprobe/mindspore/dump/{cell_dump_with_insert_gradient.py → dump_processor/cell_dump_with_insert_gradient.py} +18 -14
  186. msprobe/mindspore/dump/{dump_tool_factory.py → dump_processor/dump_tool_factory.py} +16 -15
  187. msprobe/mindspore/dump/{graph_mode_cell_dump.py → dump_processor/graph_mode_cell_dump.py} +16 -15
  188. msprobe/mindspore/dump/{graph_tensor_dump.py → dump_processor/graph_tensor_dump.py} +134 -133
  189. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/api_register.py +15 -14
  190. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/hook_cell.py +12 -11
  191. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/ms_hook_manager.py +47 -20
  192. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/primitive_hooks.py +14 -13
  193. msprobe/mindspore/dump/{hook_cell → dump_processor/hook_cell}/support_wrap_ops.yaml +13 -11
  194. msprobe/mindspore/dump/{jit_dump.py → dump_processor/jit_dump.py} +14 -13
  195. msprobe/mindspore/dump/{kernel_graph_dump.py → dump_processor/kernel_graph_dump.py} +13 -12
  196. msprobe/mindspore/dump/{kernel_kbyk_dump.py → dump_processor/kernel_kbyk_dump.py} +13 -12
  197. msprobe/mindspore/{exception_dump → dump/exception_dump}/exception_dump_tool_factory.py +14 -13
  198. msprobe/mindspore/{exception_dump → dump/exception_dump}/kernel_graph_exception_dump.py +13 -12
  199. msprobe/mindspore/{mindspore_service.py → dump/mindspore_service.py} +18 -17
  200. msprobe/mindspore/dump/mindtorch/__init__.py +19 -0
  201. msprobe/mindspore/dump/ms_config.py +105 -0
  202. msprobe/mindspore/{overflow_check → dump/overflow_check}/kernel_graph_overflow_check.py +13 -12
  203. msprobe/mindspore/{overflow_check → dump/overflow_check}/overflow_check_tool_factory.py +14 -13
  204. msprobe/mindspore/dump/task_handler_factory.py +43 -0
  205. msprobe/mindspore/monitor/common_func.py +12 -11
  206. msprobe/mindspore/monitor/data_writers.py +12 -11
  207. msprobe/mindspore/monitor/distributed/wrap_distributed.py +93 -39
  208. msprobe/mindspore/monitor/features.py +12 -11
  209. msprobe/mindspore/monitor/module_hook.py +19 -22
  210. msprobe/mindspore/monitor/optimizer_collect.py +29 -25
  211. msprobe/mindspore/monitor/utils.py +13 -11
  212. msprobe/msaccucmp/advisor/__init__.py +16 -0
  213. msprobe/msaccucmp/advisor/advisor_const.py +65 -0
  214. msprobe/msaccucmp/advisor/advisor_result.py +73 -0
  215. msprobe/msaccucmp/advisor/compare_advisor.py +99 -0
  216. msprobe/msaccucmp/advisor/input_advisor.py +66 -0
  217. msprobe/msaccucmp/advisor/node_advisor.py +68 -0
  218. msprobe/msaccucmp/advisor/overflow_advisor.py +58 -0
  219. msprobe/msaccucmp/algorithm_manager/__init__.py +16 -0
  220. msprobe/msaccucmp/algorithm_manager/algorithm_manager.py +464 -0
  221. msprobe/msaccucmp/algorithm_manager/algorithm_parameter.py +42 -0
  222. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_AccumulatedRelativeError.py +46 -0
  223. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_CosineSimilarity.py +58 -0
  224. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_KullbackLeiblerDivergence.py +84 -0
  225. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxAbsoluteError.py +41 -0
  226. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MaxRelativeError.py +46 -0
  227. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanAbsoluteError.py +41 -0
  228. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_MeanRelativeError.py +46 -0
  229. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RelativeEuclideanDistance.py +46 -0
  230. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_RootMeanSquareError.py +40 -0
  231. msprobe/msaccucmp/algorithm_manager/builtin_algorithm/alg_StandardDeviation.py +47 -0
  232. msprobe/msaccucmp/cmp_utils/__init__.py +16 -0
  233. msprobe/msaccucmp/cmp_utils/common.py +113 -0
  234. msprobe/msaccucmp/cmp_utils/constant/__init__.py +16 -0
  235. msprobe/msaccucmp/cmp_utils/constant/compare_error.py +81 -0
  236. msprobe/msaccucmp/cmp_utils/constant/const_manager.py +530 -0
  237. msprobe/msaccucmp/cmp_utils/file_utils.py +497 -0
  238. msprobe/msaccucmp/cmp_utils/log.py +257 -0
  239. msprobe/msaccucmp/cmp_utils/multi_process/__init__.py +16 -0
  240. msprobe/msaccucmp/cmp_utils/multi_process/multi_convert_process.py +140 -0
  241. msprobe/msaccucmp/cmp_utils/multi_process/progress.py +78 -0
  242. msprobe/msaccucmp/cmp_utils/path_check.py +274 -0
  243. msprobe/msaccucmp/cmp_utils/reg_manager.py +98 -0
  244. msprobe/msaccucmp/cmp_utils/tlv_parse.py +279 -0
  245. msprobe/msaccucmp/cmp_utils/utils.py +356 -0
  246. msprobe/msaccucmp/cmp_utils/utils_type.py +63 -0
  247. msprobe/msaccucmp/compare_vector.py +48 -0
  248. msprobe/msaccucmp/conversion/__init__.py +16 -0
  249. msprobe/msaccucmp/conversion/data_conversion.py +277 -0
  250. msprobe/msaccucmp/conversion/dtype_conversion.py +99 -0
  251. msprobe/msaccucmp/conversion/shape_format_conversion.py +477 -0
  252. msprobe/msaccucmp/conversion/tensor_conversion.py +369 -0
  253. msprobe/msaccucmp/dump_data_conversion.py +46 -0
  254. msprobe/msaccucmp/dump_parse/__init__.py +16 -0
  255. msprobe/msaccucmp/dump_parse/big_dump_data.py +317 -0
  256. msprobe/msaccucmp/dump_parse/dump.py +423 -0
  257. msprobe/msaccucmp/dump_parse/dump_data_object.py +322 -0
  258. msprobe/msaccucmp/dump_parse/dump_data_parser.py +436 -0
  259. msprobe/msaccucmp/dump_parse/dump_utils.py +246 -0
  260. msprobe/msaccucmp/dump_parse/ffts_parser.py +137 -0
  261. msprobe/msaccucmp/dump_parse/mapping.py +62 -0
  262. msprobe/msaccucmp/dump_parse/nano_dump_data.py +392 -0
  263. msprobe/msaccucmp/dump_parse/proto_dump_data.py +308 -0
  264. msprobe/msaccucmp/dump_parser.py +90 -0
  265. msprobe/msaccucmp/format_manager/__init__.py +16 -0
  266. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NCHW.py +53 -0
  267. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_ND.py +52 -0
  268. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_NZ_to_NHWC.py +53 -0
  269. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_HWCN.py +47 -0
  270. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_FRACTAL_Z_to_NCHW.py +47 -0
  271. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_FRACTAL_Z.py +89 -0
  272. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NCHW.py +37 -0
  273. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_HWCN_to_NHWC.py +37 -0
  274. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_HWCN.py +43 -0
  275. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NCHW.py +48 -0
  276. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NC1HWC0_to_NHWC.py +43 -0
  277. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_FRACTAL_Z.py +87 -0
  278. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NCHW_to_NHWC.py +37 -0
  279. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_NCDHW.py +48 -0
  280. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NDC1HWC0_to_ND.py +44 -0
  281. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_FRACTAL_Z.py +87 -0
  282. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_HWCN.py +37 -0
  283. msprobe/msaccucmp/format_manager/builtin_format_convert/convert_NHWC_to_NCHW.py +37 -0
  284. msprobe/msaccucmp/format_manager/format_manager.py +307 -0
  285. msprobe/msaccucmp/inplace_layer_process.py +186 -0
  286. msprobe/msaccucmp/msaccucmp.py +532 -0
  287. msprobe/msaccucmp/mscmp_advisor.py +128 -0
  288. msprobe/msaccucmp/overflow/__init__.py +16 -0
  289. msprobe/msaccucmp/overflow/overflow_analyse.py +305 -0
  290. msprobe/msaccucmp/overflow/overflow_detection.py +143 -0
  291. msprobe/msaccucmp/pytorch_cmp/__init__.py +16 -0
  292. msprobe/msaccucmp/pytorch_cmp/compare_pytorch.py +389 -0
  293. msprobe/msaccucmp/pytorch_cmp/hdf5_parser.py +377 -0
  294. msprobe/msaccucmp/pytorch_cmp/pytorch_dump_data.py +461 -0
  295. msprobe/msaccucmp/shape_conversion.py +41 -0
  296. msprobe/msaccucmp/vector_cmp/__init__.py +16 -0
  297. msprobe/msaccucmp/vector_cmp/batch_compare.py +197 -0
  298. msprobe/msaccucmp/vector_cmp/compare_detail/__init__.py +16 -0
  299. msprobe/msaccucmp/vector_cmp/compare_detail/compare_detail.py +245 -0
  300. msprobe/msaccucmp/vector_cmp/compare_detail/detail.py +182 -0
  301. msprobe/msaccucmp/vector_cmp/compare_detail/detail_writer.py +580 -0
  302. msprobe/msaccucmp/vector_cmp/fusion_manager/__init__.py +16 -0
  303. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_fusion_op.py +588 -0
  304. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_npu_vs_npu.py +339 -0
  305. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_result.py +326 -0
  306. msprobe/msaccucmp/vector_cmp/fusion_manager/compare_rule.py +156 -0
  307. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_op.py +204 -0
  308. msprobe/msaccucmp/vector_cmp/fusion_manager/fusion_rule_parser.py +635 -0
  309. msprobe/msaccucmp/vector_cmp/fusion_manager/quant_filter.py +187 -0
  310. msprobe/msaccucmp/vector_cmp/range_manager/__init__.py +16 -0
  311. msprobe/msaccucmp/vector_cmp/range_manager/range_manager.py +100 -0
  312. msprobe/msaccucmp/vector_cmp/range_manager/range_mode.py +94 -0
  313. msprobe/msaccucmp/vector_cmp/range_manager/select_mode.py +86 -0
  314. msprobe/msaccucmp/vector_cmp/vector_comparison.py +535 -0
  315. msprobe/msprobe.py +101 -130
  316. msprobe/overflow_check/__init__.py +15 -0
  317. msprobe/{nan_analyze → overflow_check}/analyzer.py +38 -27
  318. msprobe/{nan_analyze → overflow_check}/graph.py +30 -27
  319. msprobe/{nan_analyze → overflow_check}/utils.py +15 -14
  320. msprobe/pytorch/__init__.py +20 -14
  321. msprobe/pytorch/aclgraph_dump/__init__.py +45 -0
  322. msprobe/pytorch/aclgraph_dump/_meta.py +26 -0
  323. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut.py → acc_check/acc_check.py} +50 -45
  324. msprobe/pytorch/api_accuracy_checker/{run_ut/run_ut_utils.py → acc_check/acc_check_utils.py} +201 -30
  325. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/data_generate.py +56 -16
  326. msprobe/pytorch/api_accuracy_checker/{run_ut/multi_run_ut.py → acc_check/multi_acc_check.py} +32 -47
  327. msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/run_overflow_check.py +19 -18
  328. msprobe/pytorch/api_accuracy_checker/common/config.py +22 -20
  329. msprobe/pytorch/api_accuracy_checker/common/utils.py +72 -13
  330. msprobe/pytorch/api_accuracy_checker/compare/algorithm.py +41 -11
  331. msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +23 -14
  332. msprobe/pytorch/api_accuracy_checker/compare/compare.py +45 -32
  333. msprobe/pytorch/api_accuracy_checker/compare/compare_column.py +12 -11
  334. msprobe/pytorch/api_accuracy_checker/compare/compare_input.py +14 -12
  335. msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +14 -12
  336. msprobe/pytorch/api_accuracy_checker/precision_standard/absolute_threshold.py +12 -11
  337. msprobe/pytorch/api_accuracy_checker/precision_standard/accumulative_error_compare.py +12 -11
  338. msprobe/pytorch/api_accuracy_checker/precision_standard/base_standard.py +21 -19
  339. msprobe/pytorch/api_accuracy_checker/precision_standard/benchmark_compare.py +14 -13
  340. msprobe/pytorch/api_accuracy_checker/precision_standard/binary_consistency.py +12 -11
  341. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_config.py +60 -11
  342. msprobe/pytorch/api_accuracy_checker/precision_standard/standard_register.py +27 -16
  343. msprobe/pytorch/api_accuracy_checker/precision_standard/thousandth_standard.py +13 -11
  344. msprobe/pytorch/api_accuracy_checker/precision_standard/ulp_compare.py +39 -18
  345. msprobe/pytorch/bench_functions/__init__.py +12 -11
  346. msprobe/pytorch/bench_functions/apply_adam.py +12 -11
  347. msprobe/pytorch/bench_functions/apply_adam_w.py +12 -11
  348. msprobe/pytorch/bench_functions/confusion_transpose.py +12 -11
  349. msprobe/pytorch/bench_functions/fast_gelu.py +12 -11
  350. msprobe/pytorch/bench_functions/group_norm_silu.py +12 -11
  351. msprobe/pytorch/bench_functions/layer_norm_eval.py +12 -11
  352. msprobe/pytorch/bench_functions/linear.py +12 -11
  353. msprobe/pytorch/bench_functions/matmul_backward.py +12 -11
  354. msprobe/pytorch/bench_functions/mish.py +12 -11
  355. msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +12 -11
  356. msprobe/pytorch/bench_functions/npu_fusion_attention.py +12 -11
  357. msprobe/pytorch/bench_functions/rms_norm.py +12 -11
  358. msprobe/pytorch/bench_functions/rotary_mul.py +12 -11
  359. msprobe/pytorch/bench_functions/scaled_mask_softmax.py +12 -11
  360. msprobe/pytorch/bench_functions/sort_v2.py +12 -11
  361. msprobe/pytorch/bench_functions/swiglu.py +12 -11
  362. msprobe/pytorch/common/__init__.py +12 -11
  363. msprobe/pytorch/common/log.py +12 -11
  364. msprobe/pytorch/common/parse_json.py +12 -11
  365. msprobe/pytorch/common/utils.py +52 -19
  366. msprobe/pytorch/compare/distributed_compare.py +13 -13
  367. msprobe/pytorch/compare/match.py +12 -11
  368. msprobe/pytorch/compare/pt_compare.py +14 -20
  369. msprobe/pytorch/compare/pt_diff_analyze.py +12 -11
  370. msprobe/pytorch/compare/utils.py +12 -11
  371. msprobe/pytorch/{hook_module → dump/api_dump}/api_register.py +18 -16
  372. msprobe/pytorch/{hook_module → dump/api_dump}/hook_module.py +14 -13
  373. msprobe/pytorch/{hook_module → dump/api_dump}/pt_hook_manager.py +68 -23
  374. msprobe/pytorch/{hook_module → dump/api_dump}/register_optimizer_hook.py +13 -11
  375. msprobe/pytorch/{hook_module → dump/api_dump}/script_wrapper.py +17 -14
  376. msprobe/pytorch/{hook_module → dump/api_dump}/utils.py +12 -11
  377. msprobe/pytorch/{debugger → dump/debugger}/debugger_config.py +23 -38
  378. msprobe/pytorch/dump/debugger/precision_debugger.py +130 -0
  379. msprobe/pytorch/{function_factory.py → dump/function_factory.py} +12 -11
  380. msprobe/pytorch/dump/module_dump/hook_wrapper.py +17 -13
  381. msprobe/pytorch/dump/module_dump/module_dump.py +16 -15
  382. msprobe/pytorch/dump/module_dump/{module_processer.py → module_processor.py} +54 -42
  383. msprobe/pytorch/dump/pt_config.py +128 -0
  384. msprobe/pytorch/{pytorch_service.py → dump/pytorch_service.py} +22 -21
  385. msprobe/pytorch/monitor/csv2tb.py +13 -11
  386. msprobe/pytorch/monitor/data_writers.py +13 -11
  387. msprobe/pytorch/monitor/distributed/wrap_distributed.py +13 -11
  388. msprobe/pytorch/monitor/features.py +12 -11
  389. msprobe/pytorch/monitor/module_hook.py +67 -59
  390. msprobe/pytorch/monitor/module_metric.py +13 -11
  391. msprobe/pytorch/monitor/optimizer_collect.py +37 -35
  392. msprobe/pytorch/monitor/utils.py +13 -11
  393. msprobe/pytorch/monitor/visualizer.py +12 -11
  394. msprobe/pytorch/torchair_dump/__init__.py +17 -0
  395. msprobe/pytorch/torchair_dump/torchair_dump.py +114 -0
  396. msprobe/scripts/atb/config_example.json +10 -0
  397. msprobe/scripts/atb/load_atb_probe.sh +101 -0
  398. msprobe/scripts/atb/unload_atb_probe.sh +27 -0
  399. msprobe/scripts/build_msaccucmp.sh +186 -0
  400. msprobe/scripts/conf/help.info +6 -0
  401. msprobe/scripts/conf/version.info +3 -0
  402. msprobe/scripts/run_script/common.sh +538 -0
  403. msprobe/scripts/run_script/main_msaccucmp.sh +232 -0
  404. msprobe/visualization/__init__.py +12 -11
  405. msprobe/visualization/builder/__init__.py +12 -11
  406. msprobe/visualization/builder/graph_builder.py +45 -30
  407. msprobe/visualization/builder/graph_merger.py +53 -32
  408. msprobe/visualization/builder/msprobe_adapter.py +34 -44
  409. msprobe/visualization/compare/__init__.py +12 -11
  410. msprobe/visualization/compare/graph_comparator.py +63 -51
  411. msprobe/visualization/compare/mode_adapter.py +28 -113
  412. msprobe/visualization/db_utils.py +133 -22
  413. msprobe/visualization/graph/__init__.py +12 -11
  414. msprobe/visualization/graph/base_node.py +15 -27
  415. msprobe/visualization/graph/distributed_analyzer.py +97 -40
  416. msprobe/visualization/graph/graph.py +14 -16
  417. msprobe/visualization/graph/node_colors.py +34 -31
  418. msprobe/visualization/graph/node_op.py +12 -11
  419. msprobe/visualization/graph_service.py +580 -205
  420. msprobe/visualization/utils.py +278 -31
  421. tb_graph_ascend/secure_build.py +175 -0
  422. tb_graph_ascend/server/__init__.py +15 -0
  423. tb_graph_ascend/server/app/__init__.py +15 -0
  424. tb_graph_ascend/server/app/model/__init__.py +15 -0
  425. tb_graph_ascend/server/app/model/hierarchy.py +348 -0
  426. tb_graph_ascend/server/app/model/layout_hierarchy_model.py +69 -0
  427. tb_graph_ascend/server/app/model/match_nodes_model.py +573 -0
  428. tb_graph_ascend/server/app/repositories/__init__.py +15 -0
  429. tb_graph_ascend/server/app/repositories/graph_repo_base.py +32 -0
  430. tb_graph_ascend/server/app/repositories/graph_repo_db.py +879 -0
  431. tb_graph_ascend/server/app/repositories/graph_repo_vis.py +83 -0
  432. tb_graph_ascend/server/app/service/__init__.py +18 -0
  433. tb_graph_ascend/server/app/service/graph_service_base.py +158 -0
  434. tb_graph_ascend/server/app/service/graph_service_db.py +438 -0
  435. tb_graph_ascend/server/app/service/graph_service_factory.py +54 -0
  436. tb_graph_ascend/server/app/service/graph_service_vis.py +480 -0
  437. tb_graph_ascend/server/app/utils/__init__.py +15 -0
  438. tb_graph_ascend/server/app/utils/constant.py +80 -0
  439. tb_graph_ascend/server/app/utils/file_check_wrapper.py +46 -0
  440. tb_graph_ascend/server/app/utils/global_state.py +95 -0
  441. tb_graph_ascend/server/app/utils/graph_utils.py +661 -0
  442. tb_graph_ascend/server/app/utils/i18n.py +153 -0
  443. tb_graph_ascend/server/app/utils/request_method.py +46 -0
  444. tb_graph_ascend/server/app/views/__init__.py +15 -0
  445. tb_graph_ascend/server/app/views/graph_views.py +304 -0
  446. tb_graph_ascend/server/plugin.py +108 -0
  447. tb_graph_ascend/server/static/index.html +9250 -0
  448. tb_graph_ascend/server/static/index.js +21 -0
  449. tb_graph_ascend/setup.py +57 -0
  450. mindstudio_probe-8.3.2.dist-info/LICENSE +0 -201
  451. mindstudio_probe-8.3.2.dist-info/RECORD +0 -491
  452. mindstudio_probe-8.3.2.dist-info/entry_points.txt +0 -2
  453. mindstudio_probe-8.3.2.dist-info/top_level.txt +0 -1
  454. msprobe/CMakeLists.txt +0 -5
  455. msprobe/README.md +0 -203
  456. msprobe/core/advisor/advisor.py +0 -129
  457. msprobe/core/advisor/advisor_const.py +0 -58
  458. msprobe/core/advisor/advisor_result.py +0 -58
  459. msprobe/core/compare/find_first/data_processor.py +0 -35
  460. msprobe/core/compare/highlight.py +0 -390
  461. msprobe/core/data_dump/data_collector.py +0 -356
  462. msprobe/core/grad_probe/constant.py +0 -90
  463. msprobe/core/grad_probe/grad_compare.py +0 -187
  464. msprobe/core/grad_probe/utils.py +0 -105
  465. msprobe/core/kernel_dump/kernel_config.py +0 -33
  466. msprobe/docs/01.installation.md +0 -250
  467. msprobe/docs/02.config_introduction.md +0 -221
  468. msprobe/docs/03.config_examples.md +0 -281
  469. msprobe/docs/04.kernel_dump_PyTorch.md +0 -73
  470. msprobe/docs/05.data_dump_PyTorch.md +0 -518
  471. msprobe/docs/06.data_dump_MindSpore.md +0 -618
  472. msprobe/docs/07.accuracy_checker_PyTorch.md +0 -310
  473. msprobe/docs/09.accuracy_checker_MindSpore.md +0 -120
  474. msprobe/docs/10.accuracy_compare_PyTorch.md +0 -637
  475. msprobe/docs/11.accuracy_compare_MindSpore.md +0 -769
  476. msprobe/docs/12.overflow_check_PyTorch.md +0 -82
  477. msprobe/docs/13.overflow_check_MindSpore.md +0 -33
  478. msprobe/docs/14.data_parse_PyTorch.md +0 -282
  479. msprobe/docs/15.free_benchmarking_PyTorch.md +0 -169
  480. msprobe/docs/16.free_benchmarking_MindSpore.md +0 -159
  481. msprobe/docs/17.grad_probe.md +0 -205
  482. msprobe/docs/18.online_dispatch.md +0 -89
  483. msprobe/docs/19.monitor.md +0 -753
  484. msprobe/docs/20.monitor_performance_baseline.md +0 -52
  485. msprobe/docs/21.visualization_PyTorch.md +0 -519
  486. msprobe/docs/22.visualization_MindSpore.md +0 -515
  487. msprobe/docs/23.generate_operator_PyTorch.md +0 -107
  488. msprobe/docs/24.code_mapping_Mindspore.md +0 -29
  489. msprobe/docs/25.tool_function_introduction.md +0 -29
  490. msprobe/docs/26.data_dump_PyTorch_baseline.md +0 -48
  491. msprobe/docs/27.dump_json_instruction.md +0 -795
  492. msprobe/docs/28.debugger_save_instruction.md +0 -288
  493. msprobe/docs/28.kernel_dump_MindSpore.md +0 -69
  494. msprobe/docs/29.data_dump_MSAdapter.md +0 -235
  495. msprobe/docs/30.overflow_check_MSAdapter.md +0 -31
  496. msprobe/docs/31.config_check.md +0 -107
  497. msprobe/docs/32.ckpt_compare.md +0 -69
  498. msprobe/docs/33.generate_operator_MindSpore.md +0 -181
  499. msprobe/docs/34.RL_collect.md +0 -101
  500. msprobe/docs/35.nan_analyze.md +0 -73
  501. msprobe/docs/36.calculation_result_change.md +0 -75
  502. msprobe/docs/FAQ.md +0 -232
  503. msprobe/docs/S02.report_free_benchmarking_validation_performance_baseline.md +0 -146
  504. msprobe/docs/accuracy_checker_MindSpore/accuracy_checker_MindSpore_baseline.md +0 -14
  505. msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +0 -33
  506. msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +0 -217
  507. msprobe/docs/img/BLOOM-7B_1.png +0 -0
  508. msprobe/docs/img/BLOOM-7B_2.png +0 -0
  509. msprobe/docs/img/BLOOM-7B_3.png +0 -0
  510. msprobe/docs/img/BLOOM-7B_4.png +0 -0
  511. msprobe/docs/img/GPT-3_1.png +0 -0
  512. msprobe/docs/img/GPT-3_2.png +0 -0
  513. msprobe/docs/img/GPT-3_3.png +0 -0
  514. msprobe/docs/img/GPT-3_4.png +0 -0
  515. msprobe/docs/img/GPT-3_5.png +0 -0
  516. msprobe/docs/img/GPT-3_6.png +0 -0
  517. msprobe/docs/img/GPT-3_7.png +0 -0
  518. msprobe/docs/img/GPT-3_8.png +0 -0
  519. msprobe/docs/img/YOLOV5S_1.png +0 -0
  520. msprobe/docs/img/YOLOV5S_2.png +0 -0
  521. msprobe/docs/img/accuracy_checking_details.png +0 -0
  522. msprobe/docs/img/accuracy_checking_result.png +0 -0
  523. msprobe/docs/img/api_precision_compare_details.png +0 -0
  524. msprobe/docs/img/api_precision_compare_result.png +0 -0
  525. msprobe/docs/img/auto_analyze_log.png +0 -0
  526. msprobe/docs/img/compare_result.png +0 -0
  527. msprobe/docs/img/compare_result_pkl.png +0 -0
  528. msprobe/docs/img/compare_result_pkl_md5.png.png +0 -0
  529. msprobe/docs/img/cpu_info.png +0 -0
  530. msprobe/docs/img/free_benchmark.png +0 -0
  531. msprobe/docs/img/free_benchmark_framework.png +0 -0
  532. msprobe/docs/img/grad_probe_image-1.png +0 -0
  533. msprobe/docs/img/grad_probe_image-2.png +0 -0
  534. msprobe/docs/img/grad_probe_image-3.png +0 -0
  535. msprobe/docs/img/grad_probe_image-4.png +0 -0
  536. msprobe/docs/img/grad_probe_image.png +0 -0
  537. msprobe/docs/img/merge_result.png +0 -0
  538. msprobe/docs/img/module_compare.png +0 -0
  539. msprobe/docs/img/monitor/cpu_info.png +0 -0
  540. msprobe/docs/img/monitor/step_count_per_record.png +0 -0
  541. msprobe/docs/img/ms_dump.png +0 -0
  542. msprobe/docs/img/ms_layer.png +0 -0
  543. msprobe/docs/img/pt_dump.png +0 -0
  544. msprobe/docs/img/save_compare_result_sample.png +0 -0
  545. msprobe/docs/img/visualization/fuzzy_match_ms.png +0 -0
  546. msprobe/docs/img/visualization/fuzzy_match_pt.png +0 -0
  547. msprobe/docs/img/visualization/proxy.png +0 -0
  548. msprobe/docs/img/visualization/tensorboard_1.png +0 -0
  549. msprobe/docs/img/visualization/tensorboard_2.png +0 -0
  550. msprobe/docs/img/visualization/vis_browser_1.png +0 -0
  551. msprobe/docs/img/visualization/vis_browser_2.png +0 -0
  552. msprobe/docs/img/visualization/vis_match_info.png +0 -0
  553. msprobe/docs/img/visualization/vis_precision_info.png +0 -0
  554. msprobe/docs/img/visualization/vis_search_info.png +0 -0
  555. msprobe/docs/img/visualization/vis_show_info.png +0 -0
  556. msprobe/docs/img/visualization/vis_showcase.png +0 -0
  557. msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
  558. msprobe/docs/visualization/GPTModel.png +0 -0
  559. msprobe/docs/visualization/ParallelMLP.png +0 -0
  560. msprobe/docs/visualization/layer_mapping_example.md +0 -132
  561. msprobe/docs/visualization/mapping.png +0 -0
  562. msprobe/docs/visualization/mapping1.png +0 -0
  563. msprobe/docs/visualization/mindspeed_llamafactoary_img/1.png +0 -0
  564. msprobe/docs/visualization/mindspeed_llamafactoary_img/2.png +0 -0
  565. msprobe/docs/visualization/mindspeed_llamafactoary_img/3.png +0 -0
  566. msprobe/docs/visualization/mindspeed_llamafactoary_img/4.png +0 -0
  567. msprobe/docs/visualization/mindspeed_llamafactoary_img/5.png +0 -0
  568. msprobe/docs/visualization/mindspeed_llamafactoary_img/6.png +0 -0
  569. msprobe/docs/visualization/mindspeed_llamafactoary_img/7.png +0 -0
  570. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory-qwen25vl.txt +0 -59
  571. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory1.png +0 -0
  572. msprobe/docs/visualization/mindspeed_llamafactoary_img/llamafactory2.png +0 -0
  573. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed-mm-qwen25vl.txt +0 -80
  574. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed1.png +0 -0
  575. msprobe/docs/visualization/mindspeed_llamafactoary_img/mindspeed2.png +0 -0
  576. msprobe/docs/visualization/mindspeed_llamafactory_mapping.md +0 -330
  577. msprobe/docs/visualization/module_name.png +0 -0
  578. msprobe/docs/visualization/module_name1.png +0 -0
  579. msprobe/docs/visualization/no_mapping.png +0 -0
  580. msprobe/docs/visualization/no_mapping1.png +0 -0
  581. msprobe/docs/visualization/no_mapping_analyze.png +0 -0
  582. msprobe/docs/visualization/top_layer.png +0 -0
  583. msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +0 -460
  584. msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +0 -2081
  585. msprobe/mindspore/code_mapping/bind.py +0 -283
  586. msprobe/mindspore/code_mapping/cmd_parser.py +0 -40
  587. msprobe/mindspore/code_mapping/graph.py +0 -49
  588. msprobe/mindspore/code_mapping/graph_parser.py +0 -211
  589. msprobe/mindspore/code_mapping/main.py +0 -24
  590. msprobe/mindspore/code_mapping/processor.py +0 -34
  591. msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +0 -111
  592. msprobe/mindspore/dym_loader/hook_dynamic_loader.h +0 -52
  593. msprobe/mindspore/free_benchmark/api_pynative_self_check.py +0 -257
  594. msprobe/mindspore/free_benchmark/common/config.py +0 -27
  595. msprobe/mindspore/free_benchmark/common/handler_params.py +0 -31
  596. msprobe/mindspore/free_benchmark/common/utils.py +0 -100
  597. msprobe/mindspore/free_benchmark/data/support_wrap_ops.yaml +0 -638
  598. msprobe/mindspore/free_benchmark/handler/base_handler.py +0 -105
  599. msprobe/mindspore/free_benchmark/handler/check_handler.py +0 -55
  600. msprobe/mindspore/free_benchmark/handler/fix_handler.py +0 -51
  601. msprobe/mindspore/free_benchmark/handler/handler_factory.py +0 -36
  602. msprobe/mindspore/free_benchmark/perturbation/add_noise.py +0 -82
  603. msprobe/mindspore/free_benchmark/perturbation/base_perturbation.py +0 -45
  604. msprobe/mindspore/free_benchmark/perturbation/bit_noise.py +0 -78
  605. msprobe/mindspore/free_benchmark/perturbation/exchange_value.py +0 -77
  606. msprobe/mindspore/free_benchmark/perturbation/improve_precision.py +0 -56
  607. msprobe/mindspore/free_benchmark/perturbation/no_change.py +0 -27
  608. msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +0 -46
  609. msprobe/mindspore/free_benchmark/self_check_tool_factory.py +0 -51
  610. msprobe/mindspore/grad_probe/global_context.py +0 -127
  611. msprobe/mindspore/grad_probe/grad_analyzer.py +0 -260
  612. msprobe/mindspore/grad_probe/grad_monitor.py +0 -42
  613. msprobe/mindspore/grad_probe/grad_stat_csv.py +0 -161
  614. msprobe/mindspore/grad_probe/hook.py +0 -115
  615. msprobe/mindspore/grad_probe/utils.py +0 -43
  616. msprobe/mindspore/mindtorch/__init__.py +0 -18
  617. msprobe/mindspore/ms_config.py +0 -153
  618. msprobe/mindspore/task_handler_factory.py +0 -44
  619. msprobe/nan_analyze/__init__.py +0 -14
  620. msprobe/pytorch/api_accuracy_checker/generate_op_script/config_op.json +0 -9
  621. msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +0 -480
  622. msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +0 -567
  623. msprobe/pytorch/debugger/precision_debugger.py +0 -181
  624. msprobe/pytorch/free_benchmark/__init__.py +0 -23
  625. msprobe/pytorch/free_benchmark/common/constant.py +0 -85
  626. msprobe/pytorch/free_benchmark/common/counter.py +0 -87
  627. msprobe/pytorch/free_benchmark/common/enums.py +0 -80
  628. msprobe/pytorch/free_benchmark/common/params.py +0 -152
  629. msprobe/pytorch/free_benchmark/common/utils.py +0 -143
  630. msprobe/pytorch/free_benchmark/compare/grad_saver.py +0 -215
  631. msprobe/pytorch/free_benchmark/compare/single_benchmark.py +0 -121
  632. msprobe/pytorch/free_benchmark/main.py +0 -123
  633. msprobe/pytorch/free_benchmark/perturbed_layers/base_layer.py +0 -28
  634. msprobe/pytorch/free_benchmark/perturbed_layers/layer_factory.py +0 -56
  635. msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +0 -107
  636. msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +0 -121
  637. msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +0 -89
  638. msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +0 -87
  639. msprobe/pytorch/free_benchmark/perturbed_layers/npu/no_change.py +0 -43
  640. msprobe/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py +0 -60
  641. msprobe/pytorch/free_benchmark/perturbed_layers/run_cpu.py +0 -34
  642. msprobe/pytorch/free_benchmark/result_handlers/base_handler.py +0 -252
  643. msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +0 -54
  644. msprobe/pytorch/free_benchmark/result_handlers/fix_handler.py +0 -40
  645. msprobe/pytorch/free_benchmark/result_handlers/handler_factory.py +0 -45
  646. msprobe/pytorch/free_benchmark/result_handlers/preheat_handler.py +0 -181
  647. msprobe/pytorch/grad_probe/__init__.py +0 -0
  648. msprobe/pytorch/grad_probe/grad_monitor.py +0 -108
  649. msprobe/pytorch/grad_probe/grad_stat_csv.py +0 -160
  650. msprobe/pytorch/hook_module/__init__.py +0 -16
  651. msprobe/pytorch/hook_module/wrap_aten.py +0 -111
  652. msprobe/pytorch/online_dispatch/__init__.py +0 -19
  653. msprobe/pytorch/online_dispatch/compare.py +0 -224
  654. msprobe/pytorch/online_dispatch/dispatch.py +0 -332
  655. msprobe/pytorch/online_dispatch/dump_compare.py +0 -179
  656. msprobe/pytorch/online_dispatch/single_compare.py +0 -412
  657. msprobe/pytorch/online_dispatch/torch_ops_config.yaml +0 -58
  658. msprobe/pytorch/online_dispatch/utils.py +0 -158
  659. msprobe/pytorch/parse_tool/__init__.py +0 -0
  660. msprobe/pytorch/parse_tool/cli.py +0 -31
  661. msprobe/pytorch/parse_tool/lib/__init__.py +0 -0
  662. msprobe/pytorch/parse_tool/lib/compare.py +0 -253
  663. msprobe/pytorch/parse_tool/lib/config.py +0 -50
  664. msprobe/pytorch/parse_tool/lib/file_desc.py +0 -45
  665. msprobe/pytorch/parse_tool/lib/interactive_cli.py +0 -97
  666. msprobe/pytorch/parse_tool/lib/parse_exception.py +0 -54
  667. msprobe/pytorch/parse_tool/lib/parse_tool.py +0 -161
  668. msprobe/pytorch/parse_tool/lib/utils.py +0 -299
  669. msprobe/pytorch/parse_tool/lib/visualization.py +0 -85
  670. msprobe/pytorch/pt_config.py +0 -299
  671. /msprobe/core/{grad_probe → dump}/__init__.py +0 -0
  672. /msprobe/{mindspore/code_mapping → core/dump/api_dump}/__init__.py +0 -0
  673. /msprobe/{mindspore/debugger → core/dump/data_dump}/__init__.py +0 -0
  674. /msprobe/{mindspore/exception_dump → core/dump/data_dump/data_processor}/__init__.py +0 -0
  675. /msprobe/{mindspore/free_benchmark → core/dump/debugger}/__init__.py +0 -0
  676. /msprobe/{mindspore/free_benchmark/common → core/dump/kernel_dump}/__init__.py +0 -0
  677. /msprobe/mindspore/{free_benchmark/handler → dump/debugger}/__init__.py +0 -0
  678. /msprobe/mindspore/{grad_probe → dump/dump_processor}/__init__.py +0 -0
  679. /msprobe/mindspore/{overflow_check → dump/exception_dump}/__init__.py +0 -0
  680. /msprobe/mindspore/{mindtorch → dump/mindtorch}/mindtorch_adaptor.py +0 -0
  681. /msprobe/{pytorch/api_accuracy_checker/run_ut → mindspore/dump/overflow_check}/__init__.py +0 -0
  682. /msprobe/{pytorch/debugger → mindspore/monitor}/__init__.py +0 -0
  683. /msprobe/{pytorch/free_benchmark/common → msaccucmp}/__init__.py +0 -0
  684. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/.keep +0 -0
  685. /msprobe/pytorch/{free_benchmark/perturbed_layers → api_accuracy_checker/acc_check}/__init__.py +0 -0
  686. /msprobe/pytorch/api_accuracy_checker/{run_ut → acc_check}/torch_ut_setting.json +0 -0
  687. /msprobe/pytorch/{free_benchmark/perturbed_layers/npu → dump/api_dump}/__init__.py +0 -0
  688. /msprobe/pytorch/{hook_module → dump/api_dump}/support_wrap_ops.yaml +0 -0
  689. /msprobe/pytorch/{free_benchmark/result_handlers → dump/debugger}/__init__.py +0 -0
msprobe/msprobe.py CHANGED
@@ -1,163 +1,134 @@
1
- # Copyright (c) 2024, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  import argparse
17
- import importlib.util
18
18
  import sys
19
19
 
20
- from msprobe.core.common.const import Const
21
- from msprobe.core.common.file_utils import root_privilege_warning
22
20
  from msprobe.core.common.log import logger
21
+ from msprobe.core.compare.utils import _compare_parser
23
22
  from msprobe.core.compare.compare_cli import compare_cli
24
23
  from msprobe.core.compare.merge_result.merge_result_cli import _merge_result_parser, merge_result_cli
25
- from msprobe.core.compare.utils import _compare_parser
26
24
  from msprobe.core.config_check.config_check_cli import _config_checking_parser, _run_config_checking_command
27
-
28
-
29
- def is_module_available(module_name):
30
- spec = importlib.util.find_spec(module_name)
31
- return spec is not None
25
+ from msprobe.overflow_check.analyzer import _overflow_check_parser, _run_overflow_check
26
+ from msprobe.core.acc_check.acc_check_cli import acc_check_cli, multi_acc_check_cli
27
+ from msprobe.visualization.graph_service import _graph_service_parser, _graph_service_command
28
+ from msprobe.core.dump.dump2db.dump2db import _data2db_service_parser, _data2db_command
29
+ from msprobe.infer.offline.compare.msquickcmp.main import _offline_dump_parser, offline_dump_cli
30
+ from msprobe.core.install_deps.install_deps import _install_deps_parser, install_deps_cli
32
31
 
33
32
 
34
33
  def main():
35
34
  parser = argparse.ArgumentParser(
36
35
  formatter_class=argparse.RawDescriptionHelpFormatter,
37
36
  description="msprobe(mindstudio probe), [Powered by MindStudio].\n"
38
- "Providing one-site accuracy difference debugging toolkit for training on Ascend Devices.\n"
37
+ "A full-process, all-scenario precision tool base on Ascend products.\n"
39
38
  f"For any issue, refer README.md first",
40
39
  )
41
40
 
42
41
  parser.set_defaults(print_help=parser.print_help)
43
- parser.add_argument('-f', '--framework', required=True, choices=[Const.PT_FRAMEWORK, Const.MS_FRAMEWORK],
44
- help='Deep learning framework.')
45
42
  subparsers = parser.add_subparsers()
46
- subparsers.add_parser('parse')
47
- compare_cmd_parser = subparsers.add_parser('compare')
48
- run_ut_cmd_parser = subparsers.add_parser('run_ut')
49
- multi_run_ut_cmd_parser = subparsers.add_parser('multi_run_ut')
50
- api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
51
- run_overflow_check_cmd_parser = subparsers.add_parser('run_overflow_check')
52
- code_mapping_cmd_parser = subparsers.add_parser('code_mapping')
53
- graph_service_cmd_parser = subparsers.add_parser('graph')
54
- op_generate_cmd_parser = subparsers.add_parser('op_generate')
43
+
44
+ compare_parser = subparsers.add_parser('compare')
45
+ _compare_parser(compare_parser)
46
+ acc_check_cmd_parser = subparsers.add_parser('acc_check')
47
+ multi_acc_check_cmd_parser = subparsers.add_parser('multi_acc_check')
55
48
  merge_result_parser = subparsers.add_parser('merge_result')
49
+ _merge_result_parser(merge_result_parser)
50
+ overflow_check_parse = subparsers.add_parser('overflow_check')
51
+ _overflow_check_parser(overflow_check_parse)
56
52
  config_checking_parser = subparsers.add_parser('config_check')
57
- nan_analyze_parser = subparsers.add_parser('nan_analyze')
58
53
  _config_checking_parser(config_checking_parser)
59
- _compare_parser(compare_cmd_parser)
60
- _merge_result_parser(merge_result_parser)
61
-
62
- is_torch_available = is_module_available("torch")
63
-
64
- if len(sys.argv) < 4:
54
+ # api_precision_compare 是 PyTorch 相关能力,延迟加载
55
+ api_precision_compare_cmd_parser = subparsers.add_parser('api_precision_compare')
56
+ try:
57
+ from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import (
58
+ _api_precision_compare_parser
59
+ )
60
+ _api_precision_compare_parser(api_precision_compare_cmd_parser)
61
+ except ImportError as e:
62
+ # torch 不存在时,parser 仍然存在,但给提示
63
+ def _no_torch_parser(_):
64
+ api_precision_compare_cmd_parser.set_defaults(
65
+ func=lambda *_: (
66
+ logger.error(
67
+ "api_precision_compare requires PyTorch environment. "
68
+ "Please install torch / torch_npu first."
69
+ ),
70
+ sys.exit(1)
71
+ )
72
+ )
73
+ _no_torch_parser(api_precision_compare_cmd_parser)
74
+ graph_service_cmd_parser = subparsers.add_parser('graph_visualize')
75
+ _graph_service_parser(graph_service_cmd_parser)
76
+ graph_service_cmd_parser_deprecated = subparsers.add_parser('graph')
77
+ _graph_service_parser(graph_service_cmd_parser_deprecated)
78
+ data2db_parser = subparsers.add_parser('data2db')
79
+ _data2db_service_parser(data2db_parser)
80
+
81
+ offline_dump_parser = subparsers.add_parser('offline_dump')
82
+ _offline_dump_parser(offline_dump_parser)
83
+
84
+ install_deps_cmd_parser = subparsers.add_parser('install_deps')
85
+ _install_deps_parser(install_deps_cmd_parser)
86
+
87
+ if len(sys.argv) >= 2 and sys.argv[1] == "acc_check":
88
+ acc_check_cli(sys.argv[2:])
89
+ return
90
+ elif len(sys.argv) >= 2 and sys.argv[1] == "multi_acc_check":
91
+ multi_acc_check_cli(sys.argv[2:])
92
+ return
93
+ elif len(sys.argv) < 2:
65
94
  parser.print_help()
66
95
  sys.exit(0)
67
96
 
68
- root_privilege_warning()
69
- framework_args = parser.parse_args(sys.argv[1:3])
70
- if framework_args.framework == Const.PT_FRAMEWORK:
71
- from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command
72
- from msprobe.pytorch.parse_tool.cli import parse as cli_parse
73
- from msprobe.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut
74
- from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, \
75
- _api_precision_compare_command
76
- from msprobe.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, \
77
- _run_overflow_check_command
78
- from msprobe.visualization.graph_service import _pt_graph_service_parser, _pt_graph_service_command
79
- from msprobe.pytorch.api_accuracy_checker.generate_op_script.op_generator import _op_generator_parser, \
80
- _run_operator_generate_commond
81
- from msprobe.nan_analyze.analyzer import _nan_analyze_parser, _run_nan_analyze
82
-
83
- _run_ut_parser(run_ut_cmd_parser)
84
- _run_ut_parser(multi_run_ut_cmd_parser)
85
- multi_run_ut_cmd_parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8,
86
- help='Number of splits for parallel processing. Range: 1-64')
87
- _api_precision_compare_parser(api_precision_compare_cmd_parser)
88
- _run_overflow_check_parser(run_overflow_check_cmd_parser)
89
- _pt_graph_service_parser(graph_service_cmd_parser)
90
- _op_generator_parser(op_generate_cmd_parser)
91
- _nan_analyze_parser(nan_analyze_parser)
92
- elif framework_args.framework == Const.MS_FRAMEWORK:
93
- from msprobe.mindspore.api_accuracy_checker.cmd_parser import add_api_accuracy_checker_argument
94
- from msprobe.visualization.graph_service import _ms_graph_service_parser, _ms_graph_service_command
95
- add_api_accuracy_checker_argument(run_ut_cmd_parser)
96
- from msprobe.mindspore.api_accuracy_checker.cmd_parser import multi_add_api_accuracy_checker_argument
97
- multi_add_api_accuracy_checker_argument(multi_run_ut_cmd_parser)
98
- from msprobe.mindspore.code_mapping.cmd_parser import add_ir_parser_arguments
99
- add_ir_parser_arguments(code_mapping_cmd_parser)
100
-
101
- _ms_graph_service_parser(graph_service_cmd_parser)
102
-
103
- from msprobe.mindspore.api_accuracy_checker.generate_op_script.op_generator import _op_generator_parser, \
104
- _run_operator_generate_commond
105
- _op_generator_parser(op_generate_cmd_parser)
106
-
107
97
  args = parser.parse_args(sys.argv[1:])
108
- if sys.argv[2] == Const.PT_FRAMEWORK:
109
- if not is_torch_available:
110
- logger.error("PyTorch does not exist, please install PyTorch library")
111
- raise Exception("PyTorch does not exist, please install PyTorch library")
112
- if sys.argv[3] == "run_ut":
113
- run_ut_command(args)
114
- elif sys.argv[3] == "parse":
115
- cli_parse()
116
- elif sys.argv[3] == "multi_run_ut":
117
- config = prepare_config(args)
118
- run_parallel_ut(config)
119
- elif sys.argv[3] == "api_precision_compare":
98
+ if sys.argv[1] == "compare":
99
+ compare_cli(args, sys.argv[1:])
100
+ elif sys.argv[1] == "merge_result":
101
+ merge_result_cli(args)
102
+ elif sys.argv[1] == "overflow_check":
103
+ _run_overflow_check(args)
104
+ elif sys.argv[1] == "graph_visualize":
105
+ _graph_service_command(args)
106
+ elif sys.argv[1] == "api_precision_compare":
107
+ try:
108
+ from msprobe.pytorch.api_accuracy_checker.compare.api_precision_compare import (
109
+ _api_precision_compare_command
110
+ )
120
111
  _api_precision_compare_command(args)
121
- elif sys.argv[3] == "run_overflow_check":
122
- _run_overflow_check_command(args)
123
- elif sys.argv[3] == "graph":
124
- _pt_graph_service_command(args)
125
- elif sys.argv[3] == 'op_generate':
126
- _run_operator_generate_commond(args)
127
- elif sys.argv[3] == "compare":
128
- if args.cell_mapping is not None or args.api_mapping is not None:
129
- logger.error("Argument -cm or -am is not supported in PyTorch framework")
130
- raise Exception("Argument -cm or -am is not supported in PyTorch framework")
131
- compare_cli(args)
132
- elif sys.argv[3] == "merge_result":
133
- merge_result_cli(args)
134
- elif sys.argv[3] == "config_check":
135
- _run_config_checking_command(args)
136
- elif sys.argv[3] == "nan_analyze":
137
- _run_nan_analyze(args)
138
- else:
139
- if not is_module_available(Const.MS_FRAMEWORK):
140
- logger.error("MindSpore does not exist, please install MindSpore library")
141
- raise Exception("MindSpore does not exist, please install MindSpore library")
142
- if sys.argv[3] == "compare":
143
- compare_cli(args)
144
- elif sys.argv[3] == "merge_result":
145
- merge_result_cli(args)
146
- elif sys.argv[3] == "run_ut":
147
- from msprobe.mindspore.api_accuracy_checker.main import api_checker_main
148
- api_checker_main(args)
149
- elif sys.argv[3] == "multi_run_ut":
150
- from msprobe.mindspore.api_accuracy_checker.main import mul_api_checker_main
151
- mul_api_checker_main(args)
152
- elif sys.argv[3] == "graph":
153
- _ms_graph_service_command(args)
154
- elif sys.argv[3] == 'op_generate':
155
- _run_operator_generate_commond(args)
156
- elif sys.argv[3] == "code_mapping":
157
- from msprobe.mindspore.code_mapping.main import code_mapping_main
158
- code_mapping_main(args)
159
- elif sys.argv[3] == "config_check":
160
- _run_config_checking_command(args)
112
+ except ImportError:
113
+ logger.error(
114
+ "api_precision_compare requires PyTorch environment. "
115
+ "Please install torch / torch_npu first."
116
+ )
117
+ sys.exit(1)
118
+
119
+ elif sys.argv[1] == "graph":
120
+ logger.warning('The "graph" parameter has been deprecated and will be removed in future versions. '
121
+ 'Please use the "graph_visualize" parameter instead.')
122
+ _graph_service_command(args)
123
+ elif sys.argv[1] == "config_check":
124
+ _run_config_checking_command(args)
125
+ elif sys.argv[1] == "data2db":
126
+ _data2db_command(args)
127
+
128
+ elif sys.argv[1] == "offline_dump":
129
+ offline_dump_cli(args)
130
+ elif sys.argv[1] == "install_deps":
131
+ install_deps_cli(args)
161
132
 
162
133
 
163
134
  if __name__ == "__main__":
@@ -0,0 +1,15 @@
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
4
+ #
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
8
+ #
9
+ # http://license.coscl.org.cn/MulanPSL2
10
+ #
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
@@ -1,37 +1,39 @@
1
- # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  import time
17
18
  from collections import defaultdict
18
19
  import os
19
20
  from itertools import dropwhile, chain
20
21
 
21
- from msprobe.core.common.file_utils import check_file_or_directory_path, save_json, make_dir
22
+ from msprobe.core.common.file_utils import check_file_or_directory_path, save_json, make_dir, load_json
22
23
  from msprobe.core.common.log import logger
23
24
  from msprobe.core.common.const import Const
24
- from msprobe.nan_analyze.utils import (RankPath, FileCache, is_communication_op, is_ignore_op, NanAnalyseConst,
25
- analyze_anomaly_in_group)
26
- from msprobe.nan_analyze.graph import DataNode, CommunicationNode
25
+ from msprobe.overflow_check.utils import (RankPath, FileCache, is_communication_op, is_ignore_op, OverFlowCheckConst,
26
+ analyze_anomaly_in_group)
27
+ from msprobe.overflow_check.graph import DataNode, CommunicationNode
27
28
 
28
29
 
29
- class NanAnalyzer:
30
+ class OverFlowCheck:
30
31
  def __init__(self, input_path, output_path):
31
32
  self._input_path = input_path
32
33
  self._output_path = output_path
33
34
  self._paths = {}
34
35
  self._resolve_input_path()
36
+ self._check_framework()
35
37
  self._anomaly_nodes = [] # 记录所有异常节点
36
38
  self._cache = FileCache()
37
39
  self._first_comm_nodes = {} # 记录各rank下首个通信节点的node_id
@@ -51,18 +53,27 @@ class NanAnalyzer:
51
53
  for path in contents:
52
54
  if not path.startswith('rank'):
53
55
  continue
54
- rank_str = path.strip('rank')
56
+ rank_str = path[len('rank'):]
55
57
  if not rank_str:
56
58
  rank = 0
57
59
  elif not rank_str.isdigit():
58
60
  continue
59
61
  else:
60
62
  rank = int(rank_str)
61
- dump_path = os.path.join(self._input_path, path, NanAnalyseConst.DUMP_FILE)
62
- construct_path = os.path.join(self._input_path, path, NanAnalyseConst.CONSTRUCT_FILE)
63
- stack_path = os.path.join(self._input_path, path, NanAnalyseConst.STACK_FILE)
63
+ dump_path = os.path.join(self._input_path, path, OverFlowCheckConst.DUMP_FILE)
64
+ construct_path = os.path.join(self._input_path, path, OverFlowCheckConst.CONSTRUCT_FILE)
65
+ stack_path = os.path.join(self._input_path, path, OverFlowCheckConst.STACK_FILE)
64
66
  self._paths[rank] = RankPath(rank, dump_path, construct_path, stack_path)
65
67
 
68
+ def _check_framework(self):
69
+ for _, rankpath in self._paths.items():
70
+ data = load_json(rankpath.dump_path)
71
+ if data.get('framework', '') != "pytorch":
72
+ logger.error('overflow check is only available for PyTorch, though the data is stored in '
73
+ 'a different framework.')
74
+ raise RuntimeError(f'invalid dump_path {rankpath.dump_path}')
75
+ break
76
+
66
77
  def _pre_analyze(self):
67
78
  logger.info('Start searching anomaly node before communication.')
68
79
  for path in self._paths.values():
@@ -148,9 +159,9 @@ class NanAnalyzer:
148
159
  def _find_connection(self, conn_info, cur_node, searched_ranks, seen_nodes):
149
160
  def connect():
150
161
  seen_nodes.add(search_node.node_id)
151
- if search_node.type == NanAnalyseConst.DST:
162
+ if search_node.type == OverFlowCheckConst.DST:
152
163
  cur_node.add_dst(search_node)
153
- elif search_node.type == NanAnalyseConst.SRC:
164
+ elif search_node.type == OverFlowCheckConst.SRC:
154
165
  search_node.layer = cur_node.layer
155
166
  search_node.add_dst(cur_node)
156
167
  else:
@@ -167,7 +178,7 @@ class NanAnalyzer:
167
178
  if not (search_id.startswith(tar_id_prefix) and search_node.type == conn_info.get('type')):
168
179
  continue
169
180
  search_conn_ranks = search_node.find_connected_nodes().get('ranks')
170
- if ((not search_conn_ranks and search_node.api not in NanAnalyseConst.DIRECTED_API) or
181
+ if ((not search_conn_ranks and search_node.api not in OverFlowCheckConst.DIRECTED_API) or
171
182
  cur_node.rank in search_conn_ranks): # 有些无向通信算子没有填ProcessGroup,默认连接所有rank
172
183
  connect()
173
184
  found = True
@@ -241,15 +252,15 @@ class NanAnalyzer:
241
252
  return self._rank_comm_nodes_dict.get(rank, {}).get(node_id)
242
253
 
243
254
 
244
- def _nan_analyze_parser(parser):
255
+ def _overflow_check_parser(parser):
245
256
  parser.add_argument("-i", "--input_path", dest="input_path", default="", type=str,
246
257
  help="<Required> The dump file path, over step level. eg: \"xxx/step_0/\".",
247
258
  required=True)
248
259
  parser.add_argument("-o", "--output_path", dest="output_path", default="./output", type=str,
249
- help="<optional> The nan inf analyze result output file path.",
260
+ help="<optional> The overflow check result output file path.",
250
261
  required=False)
251
262
 
252
263
 
253
- def _run_nan_analyze(args):
264
+ def _run_overflow_check(args):
254
265
  check_file_or_directory_path(args.input_path, True)
255
- NanAnalyzer(args.input_path, args.output_path).analyze()
266
+ OverFlowCheck(args.input_path, args.output_path).analyze()
@@ -1,22 +1,23 @@
1
- # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  from dataclasses import dataclass
17
18
  from msprobe.core.common.const import Const
18
19
  from msprobe.core.common.log import logger
19
- from msprobe.nan_analyze.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, NanAnalyseConst
20
+ from msprobe.overflow_check.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, OverFlowCheckConst
20
21
  from msprobe.core.common.exceptions import MsprobeException
21
22
 
22
23
 
@@ -46,6 +47,8 @@ class DataNode:
46
47
  seen = set(op_name)
47
48
  while True:
48
49
  op_name = construct_info.get(op_name)
50
+ if isinstance(op_name, list):
51
+ op_name = op_name[0]
49
52
  if not op_name or op_name in seen:
50
53
  return construct
51
54
  construct.insert(0, op_name)
@@ -140,7 +143,7 @@ class CommunicationNode:
140
143
 
141
144
  def has_nan_inf(self):
142
145
  return self.input_has_nan_inf() or check_item_anomaly(self.data.outputs)
143
-
146
+
144
147
  def input_has_nan_inf(self):
145
148
  return check_item_anomaly(self.data.input_args) or check_item_anomaly(self.data.input_kwargs)
146
149
 
@@ -148,15 +151,15 @@ class CommunicationNode:
148
151
  """
149
152
  根据 api/类型/入参/调用次数 确定相连接的node的op_name
150
153
  """
151
- tar_api = NanAnalyseConst.P2P_API_MAPPING.get(self.api, self.api)
154
+ tar_api = OverFlowCheckConst.P2P_API_MAPPING.get(self.api, self.api)
152
155
  ranks = set()
153
- for dst in [NanAnalyseConst.DST, NanAnalyseConst.DST_GROUP]:
156
+ for dst in [OverFlowCheckConst.DST, OverFlowCheckConst.DST_GROUP]:
154
157
  if dst in self.data.input_kwargs:
155
158
  dst_value = self.data.input_kwargs.get(dst)
156
159
  if dst_value:
157
160
  ranks.add(dst_value.get('value'))
158
161
  break
159
- for src in [NanAnalyseConst.SRC, NanAnalyseConst.SRC_GROUP]:
162
+ for src in [OverFlowCheckConst.SRC, OverFlowCheckConst.SRC_GROUP]:
160
163
  if src in self.data.input_kwargs:
161
164
  src_value = self.data.input_kwargs.get(src)
162
165
  if src_value:
@@ -170,24 +173,24 @@ class CommunicationNode:
170
173
  if group:
171
174
  ranks.update(group.get('group_ranks'))
172
175
  return {'ranks': ranks, 'api': f'Distributed.{tar_api}',
173
- 'type': NanAnalyseConst.OPPOSITE_DIR.get(self.type, NanAnalyseConst.LINK)}
176
+ 'type': OverFlowCheckConst.OPPOSITE_DIR.get(self.type, OverFlowCheckConst.LINK)}
174
177
 
175
178
  def _resolve_type(self):
176
- for src in [NanAnalyseConst.SRC, NanAnalyseConst.SRC_GROUP]:
179
+ for src in [OverFlowCheckConst.SRC, OverFlowCheckConst.SRC_GROUP]:
177
180
  if src in self.data.input_kwargs and self.data.input_kwargs[src]:
178
181
  if self.data.input_kwargs[src].get('value') == self.rank:
179
- return NanAnalyseConst.SRC
182
+ return OverFlowCheckConst.SRC
180
183
  else:
181
- return NanAnalyseConst.DST
182
- for dst in [NanAnalyseConst.DST, NanAnalyseConst.DST_GROUP]:
184
+ return OverFlowCheckConst.DST
185
+ for dst in [OverFlowCheckConst.DST, OverFlowCheckConst.DST_GROUP]:
183
186
  if dst in self.data.input_kwargs and self.data.input_kwargs[dst]:
184
187
  if self.data.input_kwargs[dst].get('value') == self.rank:
185
- return NanAnalyseConst.DST
188
+ return OverFlowCheckConst.DST
186
189
  else:
187
- return NanAnalyseConst.SRC
188
- if self.api in NanAnalyseConst.DIRECTED_API:
190
+ return OverFlowCheckConst.SRC
191
+ if self.api in OverFlowCheckConst.DIRECTED_API:
189
192
  for item in self.data.input_args:
190
193
  if item.get(Const.TYPE) == 'int':
191
- node_type = NanAnalyseConst.DIRECTED_API[self.api]
192
- return node_type if item.get('value') == self.rank else NanAnalyseConst.OPPOSITE_DIR[node_type]
193
- return NanAnalyseConst.LINK
194
+ node_type = OverFlowCheckConst.DIRECTED_API[self.api]
195
+ return node_type if item.get('value') == self.rank else OverFlowCheckConst.OPPOSITE_DIR[node_type]
196
+ return OverFlowCheckConst.LINK
@@ -1,17 +1,18 @@
1
- # Copyright (c) 2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  from collections import OrderedDict
17
18
  from dataclasses import dataclass
@@ -120,7 +121,7 @@ def is_communication_op(op_name):
120
121
  # 定义通信算子的关键字,覆盖各种通信操作,如all_reduce, send, broadcast等
121
122
  # 从wrap文件中读取,先硬编码在文件中
122
123
  return (op_name.startswith('Distributed.') and
123
- any(keyword in op_name for keyword in NanAnalyseConst.COMMUNICATION_KEYWORDS))
124
+ any(keyword in op_name for keyword in OverFlowCheckConst.COMMUNICATION_KEYWORDS))
124
125
 
125
126
 
126
127
  def is_ignore_op(op_name):
@@ -163,7 +164,7 @@ def analyze_anomaly_in_group(nodes_group):
163
164
  anomaly_nodes.append(node.data)
164
165
 
165
166
  # 先看src或link中input是否有异常
166
- src_list = list(filter(lambda node: node.type in [NanAnalyseConst.SRC, NanAnalyseConst.LINK], nodes_group))
167
+ src_list = list(filter(lambda node: node.type in [OverFlowCheckConst.SRC, OverFlowCheckConst.LINK], nodes_group))
167
168
  input_anomaly_nodes = list(filter(lambda node: node.input_has_nan_inf(), src_list))
168
169
  # 如果有异常回溯计算节点找到异常来源
169
170
  # 使用cpu模拟节点进行计算,查看结果是否有问题。需要对所有计算节点录入/映射,暂不实现。
@@ -174,7 +175,7 @@ def analyze_anomaly_in_group(nodes_group):
174
175
  return anomaly_nodes
175
176
 
176
177
 
177
- class NanAnalyseConst:
178
+ class OverFlowCheckConst:
178
179
  COMMUNICATION_KEYWORDS = {
179
180
  'send', # send 算子
180
181
  'recv', # recv 算子
@@ -1,24 +1,30 @@
1
- # Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
2
- # All rights reserved.
1
+ # -------------------------------------------------------------------------
2
+ # This file is part of the MindStudio project.
3
+ # Copyright (c) 2025 Huawei Technologies Co.,Ltd.
3
4
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
5
+ # MindStudio is licensed under Mulan PSL v2.
6
+ # You can use this software according to the terms and conditions of the Mulan PSL v2.
7
+ # You may obtain a copy of Mulan PSL v2 at:
7
8
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
+ # http://license.coscl.org.cn/MulanPSL2
9
10
  #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
11
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
12
+ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
13
+ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
14
+ # See the Mulan PSL v2 for more details.
15
+ # -------------------------------------------------------------------------
15
16
 
16
17
  import torch
17
- from .compare.distributed_compare import compare_distributed
18
- from .compare.pt_compare import compare
18
+
19
+ from msprobe.pytorch.dump.debugger.precision_debugger import PrecisionDebugger, module_dump, module_dump_end
19
20
  from .common.utils import seed_all
20
- from .debugger.precision_debugger import PrecisionDebugger, module_dump, module_dump_end
21
+ from .torchair_dump import set_fx_dump_config, set_ge_dump_config
21
22
 
22
23
  torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
23
24
  if torch_version_above_or_equal_2:
24
25
  from msprobe.pytorch.monitor.module_hook import TrainerMon
26
+
27
+ try:
28
+ from msprobe.pytorch.aclgraph_dump import acl_save # noqa: F401
29
+ except Exception:
30
+ acl_save = None